#İkinci El Araç Tahmin uYGULAMASI

In [65]:
import os
import numpy as np
import pandas as pd

In [66]:
# Load the dataset 
df = pd.read_csv("data/arabamcom_merged.csv")

In [67]:
# Display the columns names
print(df.columns.tolist())

['smallest-text-minus href', 'listing-image src', 'listing-text-new', 'listing-text-new 2', 'fade-out-content-wrapper', 'fade-out-content-wrapper href 2', 'db', 'fade-out-content-wrapper 2', 'fade-out-content-wrapper 3', 'fade-out-content-wrapper 4']


In [68]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1565 entries, 0 to 1564
Data columns (total 10 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   smallest-text-minus href         1565 non-null   object
 1   listing-image src                1565 non-null   object
 2   listing-text-new                 1565 non-null   object
 3   listing-text-new 2               1565 non-null   object
 4   fade-out-content-wrapper         1565 non-null   int64 
 5   fade-out-content-wrapper href 2  1441 non-null   object
 6   db                               1565 non-null   object
 7   fade-out-content-wrapper 2       1180 non-null   object
 8   fade-out-content-wrapper 3       1562 non-null   object
 9   fade-out-content-wrapper 4       1562 non-null   object
dtypes: int64(1), object(9)
memory usage: 122.4+ KB
None


In [69]:
print(df.columns)

Index(['smallest-text-minus href', 'listing-image src', 'listing-text-new',
       'listing-text-new 2', 'fade-out-content-wrapper',
       'fade-out-content-wrapper href 2', 'db', 'fade-out-content-wrapper 2',
       'fade-out-content-wrapper 3', 'fade-out-content-wrapper 4'],
      dtype='object')


In [70]:
#Drop unwanted columns for better readability
df.drop(columns=[
    'listing-image src',
    'fade-out-content-wrapper href 2',
    'fade-out-content-wrapper 2'
], inplace=True, errors='ignore')

# Rename columns for better readability
df.rename(columns={
    'smallest-text-minus href': 'ilan_linki',
    'listing-text-new': 'model',
    'listing-text-new 2': 'model_detay',
    'fade-out-content-wrapper': 'model_yili',
    'db': 'fiyat',
    'fade-out-content-wrapper 3': 'sehir',
    'fade-out-content-wrapper 4': 'ilce'
}, inplace=True)


In [71]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1565 entries, 0 to 1564
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ilan_linki   1565 non-null   object
 1   model        1565 non-null   object
 2   model_detay  1565 non-null   object
 3   model_yili   1565 non-null   int64 
 4   fiyat        1565 non-null   object
 5   sehir        1562 non-null   object
 6   ilce         1562 non-null   object
dtypes: int64(1), object(6)
memory usage: 85.7+ KB
None


In [72]:
print(df["fiyat"].unique()[:10])  # ilk 10 farklı değeri göster

['1.350.000 TL' '1.025.000 TL' '1.910.900 TL' '1.119.900 TL'
 '1.179.900 TL' '439.000 TL' '2.100.000 TL' '2.250.000 TL' '1.850.000 TL'
 '2.799.000 TL']


In [73]:
# Remove TL symbols, non-numeric characters and spaces, then convert to numeric
df["fiyat"] = df["fiyat"].astype(str).str.replace("[^0-9]", "", regex=True)
df["fiyat"] = pd.to_numeric(df["fiyat"], errors="coerce")

In [74]:
print(df["fiyat"].head(10))
print(df["fiyat"].dtype)
print(df["fiyat"].isnull().sum())  # kaç tanesi NaN olmuş

0    1350000
1    1025000
2    1910900
3    1119900
4    1179900
5     439000
6    2100000
7    2250000
8    1850000
9    2799000
Name: fiyat, dtype: int64
int64
0


In [75]:
def parse_ilan_link(link):
    """
    Parses the given listing link and extracts vehicle information.

    Parameters:
        link (str): The listing URL string containing vehicle details.

    Returns:
        dict: A dictionary with extracted fields such as 'ilan_turu', 'tip', 'marka', 'model',
              'motor_hacmi', 'motor_tipi', 'donanim', 'donanim_detay', 'model_yili', 'yakit', 'vites'.
    """
    
    try:
        if not isinstance(link, str):
            return {}

        parts = link.split("/")
        if len(parts) < 5:
            return {}

        segment = parts[4]
        pieces = segment.split("-")

        return {
            "ilan_turu": pieces[0] if len(pieces) > 0 else None,
            "tip": pieces[1] if len(pieces) > 1 else None,
            "marka": pieces[2] if len(pieces) > 2 else None,
            "model": pieces[3] if len(pieces) > 3 else None,
            "motor_hacmi": ".".join(pieces[4:6]) if len(pieces) > 5 else None,
            "motor_tipi": pieces[6] if len(pieces) > 6 else None,
            "donanim": pieces[7] if len(pieces) > 7 else None,
            "donanim_detay": pieces[8] if len(pieces) > 8 else None,
            "model_yili": pieces[9] if len(pieces) > 9 else None,
            "yakit": pieces[10] if len(pieces) > 10 else None,
            "vites": pieces[11] if len(pieces) > 11 else None
        }
    except Exception as e:
        print("Hata:", e)
        return {}


In [76]:
# Apply the parsing function to the 'parse_ilan_link' column and expand the results into seperate columns 
parsed_df = df["ilan_linki"].apply(parse_ilan_link).apply(pd.Series)
df = pd.concat([df, parsed_df], axis=1)
# Remove the duplicate columns that may have been created during concenation
df = df.loc[:, ~df.columns.duplicated()]

In [77]:
print(df[["marka", "model", "motor_hacmi", "vites"]].sample(5))


        marka                                  model motor_hacmi vites
1503     opel           Opel Corsa 1.3 CDTI Essentia         1.3  None
675      ford  Ford Ranger 2.0 EcoBlue 4x4 Wild Trak         2.0  None
1415  renault             Renault Clio 1.6 Dynamique         1.6  None
349      fiat            Fiat Egea 1.3 Multijet Easy         1.3  None
1386    honda    Honda Civic 1.6 i-VTEC ECO Elegance         1.6  None


In [78]:
print(df.columns)

Index(['ilan_linki', 'model', 'model_detay', 'model_yili', 'fiyat', 'sehir',
       'ilce', 'ilan_turu', 'tip', 'marka', 'motor_hacmi', 'motor_tipi',
       'donanim', 'donanim_detay', 'yakit', 'vites'],
      dtype='object')


In [79]:
print(df['marka'].unique())
print(df['model_yili'].unique())
print(df['vites']
)

['jeep' 'hyundai' 'chery' 'fiat' 'renault' 'mercedes' 'volvo' 'nissan'
 'skoda' 'peugeot' 'seat' 'audi' 'bmw' 'volkswagen' 'toyota' 'honda'
 'citroen' 'ford' 'tofas' 'opel' 'ssangyong' 'land' 'kia' 'dacia' 'iveco'
 'porsche' 'mitsubishi' 'yamaha' 'river' 'jaguar' 'kuba' 'chevrolet'
 'maserati' 'ticari' 'bajaj' 'mini' 'tvs' 'baoli' 'togg' 'karsan' 'kral'
 'skywell' 'alfa' 'car' 'mazda' 'mg' 'revolt' 'isuzu' 'smart' 'ds' 'new'
 'universal' 'yuki' 'suzuki' 'otokar' 'yiben' 'lamborghini']
[2019 2013 2024 2022 2017 2012 2021 1997 2011 2014 2015 2025 2023 2016
 2018 2003 2020 2010 1993 2009 2007 2000 2005 2006 1988 1998 1999 1992
 2004 2008 1994 2001 1991 1996 1990 2002 1995]
0       None
1       None
2       None
3       None
4       None
        ... 
1560    None
1561    None
1562    None
1563    None
1564    None
Name: vites, Length: 1565, dtype: object


In [80]:
print(df.columns.tolist())

['ilan_linki', 'model', 'model_detay', 'model_yili', 'fiyat', 'sehir', 'ilce', 'ilan_turu', 'tip', 'marka', 'motor_hacmi', 'motor_tipi', 'donanim', 'donanim_detay', 'yakit', 'vites']


In [81]:
# Print the number of missing values in each column
print(df.isnull().sum())

ilan_linki          0
model               0
model_detay         0
model_yili          0
fiyat               0
sehir               3
ilce                3
ilan_turu           0
tip                 0
marka               0
motor_hacmi        68
motor_tipi        105
donanim           354
donanim_detay    1008
yakit            1556
vites            1561
dtype: int64


In [82]:
# Drop unwanted columns
df.drop(columns=["vites", "yakit", "donanim_detay"], inplace=True)

In [83]:
# Fill missing values with "bilinmiyor"
df["motor_hacmi"].fillna("bilinmiyor", inplace=True)
df["motor_tipi"].fillna("bilinmiyor", inplace=True)
df["donanim"].fillna("bilinmiyor", inplace=True)

In [84]:
# NaN values is controlled
print(df.isnull().sum())
print(df.shape)


ilan_linki     0
model          0
model_detay    0
model_yili     0
fiyat          0
sehir          3
ilce           3
ilan_turu      0
tip            0
marka          0
motor_hacmi    0
motor_tipi     0
donanim        0
dtype: int64
(1565, 13)


In [85]:
# 1980 and above model with price <= 100000 cars selection. Other cars are dropped.
df = df[(df["model_yili"] >= 1980) & (df["fiyat"] >= 100000)]
print(df.shape)
print(df[["model_yili", "fiyat"]].describe())


(1551, 13)
        model_yili         fiyat
count  1551.000000  1.551000e+03
mean   2014.214055  1.117144e+06
std       7.178504  2.815328e+06
min    1988.000000  1.050000e+05
25%    2011.000000  5.342500e+05
50%    2015.000000  8.050000e+05
75%    2020.000000  1.227500e+06
max    2025.000000  6.200000e+07


In [86]:
print(df.columns)

Index(['ilan_linki', 'model', 'model_detay', 'model_yili', 'fiyat', 'sehir',
       'ilce', 'ilan_turu', 'tip', 'marka', 'motor_hacmi', 'motor_tipi',
       'donanim'],
      dtype='object')


In [87]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 1551 entries, 0 to 1564
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ilan_linki   1551 non-null   object
 1   model        1551 non-null   object
 2   model_detay  1551 non-null   object
 3   model_yili   1551 non-null   int64 
 4   fiyat        1551 non-null   int64 
 5   sehir        1548 non-null   object
 6   ilce         1548 non-null   object
 7   ilan_turu    1551 non-null   object
 8   tip          1551 non-null   object
 9   marka        1551 non-null   object
 10  motor_hacmi  1551 non-null   object
 11  motor_tipi   1551 non-null   object
 12  donanim      1551 non-null   object
dtypes: int64(2), object(11)
memory usage: 169.6+ KB
None


In [88]:
#Drop the 'ilan_linki' column as it is no longer needed. and previously, the necessary information was taken from it
df.drop(columns=["ilan_linki"], inplace=True)

In [89]:
print(df.columns)

Index(['model', 'model_detay', 'model_yili', 'fiyat', 'sehir', 'ilce',
       'ilan_turu', 'tip', 'marka', 'motor_hacmi', 'motor_tipi', 'donanim'],
      dtype='object')


In [90]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 1551 entries, 0 to 1564
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   model        1551 non-null   object
 1   model_detay  1551 non-null   object
 2   model_yili   1551 non-null   int64 
 3   fiyat        1551 non-null   int64 
 4   sehir        1548 non-null   object
 5   ilce         1548 non-null   object
 6   ilan_turu    1551 non-null   object
 7   tip          1551 non-null   object
 8   marka        1551 non-null   object
 9   motor_hacmi  1551 non-null   object
 10  motor_tipi   1551 non-null   object
 11  donanim      1551 non-null   object
dtypes: int64(2), object(10)
memory usage: 157.5+ KB
None


In [91]:
# Get most common values for 'sehir' and 'ilce' 
sehir_mode = df["sehir"].mode()[0]
ilce_mode = df["ilce"].mode()[0]

# Fill NaN values with the most common values
df["sehir"].fillna(sehir_mode, inplace=True)
df["ilce"].fillna(ilce_mode, inplace=True)


In [92]:
print(df[["sehir", "ilce"]].isnull().sum())

sehir    0
ilce     0
dtype: int64


In [93]:
# Display unique values for 'motor_hacmi' and check for NaN values
print(df["motor_hacmi"].dropna().unique())

['1.6' '8.pro' 'cross.1' '1.5' 'axor.1840' 'sx.2' 'e.180' '1.0' '2.0'
 'serisi.520i' '3.0' 'serisi.418i' '1.4' 'bilinmiyor' 'combi.1' '2.4'
 'variant.1' '1.8' 'elysee.1' 'sedan.1' '1.3' 'sedan.35' 'courier.1'
 '19.1' 'aircross.1' 'sportback.45' 'serisi.216d' 'city.1' 'e.200' 'e.300'
 'freelander.2' 'express.1' '3.2' '1.2' 'blue.1' '1.33' 'x.1'
 'serisi.316i' 'trail.1' 'connect.1' 'hr.1' '2.3' '72.160' 'gt3.rs'
 '300.city' '0.9' 'a.180' 'sportback.1' 'c.180' 'serisi.320i' 'cargo.2520'
 '9.1' 'evo.1' 'sedan.2' 'cargo.1' 'c.200' 'cla.200' '16d.sdrive' '2.2'
 'roc.1' '2.7' 'sportback.2' 'up.4wd' 'v.2' 'range.evoque' 'serisi.520d'
 'serisi.730d' '2.5' 'serisi.m5' 'canter.8bl' 'dorse.kuru' '1200.x'
 '200.rs' '300.s' 'transit.350' 'eqb.250' 'e.220' 'transit.14'
 'transit.16' 'panorama.1' 'countryman.1' 's.350' '1.9' 'transit.t330'
 '1.6d' '1.25' 'glk.220' '15.1' 'kr.305' '13.m3' 'serisi.320d' 'glc.350'
 'cherokee.5' 'serisi.318d' 'premium.plus' 'serisi.116i' 'era.1'
 'multix.1' 'tonale.1' 'a.

In [None]:
# Show the most common 'motor hacmi' values with frequency
print(df["motor_hacmi"].value_counts(dropna=False).head(30))  # first 30 value

motor_hacmi
1.6            302
1.5            198
1.4            150
1.3            106
1.2             89
combi.1         61
bilinmiyor      55
2.0             50
1.0             48
cross.1         21
courier.1       17
elysee.1        16
connect.1       16
sedan.2         15
sedan.1         13
serisi.320i     13
cargo.1         13
1.9             12
1.8             12
serisi.520i     11
c.180           11
a.180           10
e.220           10
3.0              9
s.350            9
aircross.1       9
blue.1           8
panorama.1       8
serisi.316i      7
0.9              7
Name: count, dtype: int64


In [95]:
pd.set_option("display.max_columns", None)  # Show all columns
print(df.sample(50))


                                                 model  \
987                          Toyota Auris 1.33 Comfort   
902                     Audi A3 Sedan 1.6 TDI Ambiente   
781              Ticari Araçlar Dorse Kuru Yük Kapaklı   
465                   Peugeot 308 1.6 THP Feeline Plus   
557          Citroen C5 Aircross 1.5 BlueHDI Feel Bold   
1232                      Renault Megane 1.5 dCi Touch   
123      Opel Insignia 1.6 CDTI Grand Sport Excellence   
1050                  Renault Duster 1.6 E-Tech Techno   
312                         Renault Megane 1.6 Extreme   
793                          BMW 3 Serisi 316i M Sport   
1156                 Volkswagen Caddy 1.6 TDI Maxi Van   
1215                       Fiat Egea 1.3 Multijet Easy   
142                            Seat Leon 1.6 TDI Style   
141   Mercedes - Benz C 180 BlueEFFICIENCY Fascination   
434       Fiat Doblo Panorama 1.6 Multijet Premio Plus   
86                     Citroen C4 X 1.2 PureTech Shine   
1383          

In [96]:
df.to_csv("temizlenmis_arac_verisi.csv", index=False)