## Importing the necessary packages

In [2]:
import numpy as np
import pandas as pd

In [3]:
def drop_columns(df, cols):
    """
    Drop the given columns from df in place.

    Columns that are not present in df are skipped and reported with a printed
    message instead of raising, so one missing name no longer prevents the
    remaining columns from being dropped.

    Parameters:
    df (pandas.DataFrame): DataFrame whose columns will be removed (mutated in place).
    cols (str or list): A single column name or a list of column names to be dropped.

    Return:
    None

    Example:
    >>> frame = pd.DataFrame({"a": [1], "b": [2]})
    >>> drop_columns(frame, ["a"])
    >>> list(frame.columns)
    ['b']
    """
    if cols is None:
        cols = []
    # A bare string would otherwise be iterated character by character.
    elif isinstance(cols, str):
        cols = [cols]

    # De-duplicate while keeping the caller's order.
    requested = list(dict.fromkeys(cols))
    present = [col for col in requested if col in df.columns]
    missing = [col for col in requested if col not in df.columns]

    if missing:
        print(f"Columns not found and skipped: {missing}")
    if present:
        # A single call removes every existing column, so the frame is never
        # left half-processed because of one bad name.
        df.drop(columns=present, inplace=True)

In [4]:
def determine_fuel_type(model):
    """
    Determine the fuel type of a BMW 3 Series model from its designation.

    A designation ending in "d" (for example "320d") is treated as diesel;
    every other designation is reported as gasoline. The check ignores letter
    case and surrounding whitespace.

    Parameters:
    model (str): Model designation whose fuel type will be determined.

    Return:
    str: "diesel" or "gasoline"

    Example:
    >>> determine_fuel_type("320d")
    'diesel'
    >>> determine_fuel_type("320i")
    'gasoline'
    """
    # Normalise first so "320D" or "320d " are not mislabelled as gasoline.
    designation = model.strip().lower()
    # NOTE(review): anything without a trailing "d" falls through to gasoline,
    # including designations that might be neither fuel type.
    return 'diesel' if designation.endswith('d') else 'gasoline'

## Loading the data set into a DataFrame and overview

In [3]:
# Load the raw listings; the file name is resolved relative to the current
# working directory.
df = pd.read_csv("arabam_bmw3.csv")

In [4]:
# Display the raw frame to inspect the original column names and value formats.
df

Unnamed: 0,smallest-text-minus href,listing-image src,listing-text-new,listing-text-new 2,fade-out-content-wrapper,fade-out-content-wrapper 2,fade-out-content-wrapper 3,fade-out-content-wrapper href 4,db,fade-out-content-wrapper 4,fade-out-content-wrapper 5,fade-out-content-wrapper 6
0,https://www.arabam.com/ilan/galeriden-satilik-...,https://arbstorage.mncdn.com/ilanfotograflari/...,BMW 3 Serisi 330d Standart,Galeriden BMW 3 Serisi 320 d ışık paket 2007 M...,2007,349.0,Siyah,https://www.arabam.com/ilan/galeriden-satilik-...,750.000 TL,27 Temmuz 2024,Adana,Sarıçam
1,https://www.arabam.com/ilan/galeriden-satilik-...,https://arbstorage.mncdn.com/ilanfotograflari/...,BMW 3 Serisi 316i Modern Line,Galeriden BMW 3 Serisi 316i Modern Line 2012 M...,2012,83.5,Beyaz,https://www.arabam.com/ilan/galeriden-satilik-...,1.120.000 TL,26 Temmuz 2024,Nevşehir,Gülşehir
2,https://www.arabam.com/ilan/galeriden-satilik-...,https://arbstorage.mncdn.com/ilanfotograflari/...,BMW 3 Serisi 316i Advantage,Galeriden BMW 3 Serisi 316i Advantage 2011 Mod...,2011,146.0,Beyaz,https://www.arabam.com/ilan/galeriden-satilik-...,812.000 TL,24 Temmuz 2024,Isparta,Merkez
3,https://www.arabam.com/ilan/galeriden-satilik-...,https://arbstorage.mncdn.com/ilanfotograflari/...,BMW 3 Serisi 320i Coupe,Galeriden BMW 3 Serisi 320i Coupe 2010 Model K...,2010,153.0,Beyaz,https://www.arabam.com/ilan/galeriden-satilik-...,1.185.000 TL,24 Temmuz 2024,Konya,Akşehir
4,https://www.arabam.com/ilan/galeriden-satilik-...,https://arbstorage.mncdn.com/ilanfotograflari/...,BMW 3 Serisi 320i ED Luxury Line Plus,2014 MODEL 320İED LUXURY LINE PLUS 170 HP 130....,2014,130.0,Füme,https://www.arabam.com/ilan/galeriden-satilik-...,1.195.000 TL,23 Temmuz 2024,Hatay,İskenderun
...,...,...,...,...,...,...,...,...,...,...,...,...
994,https://www.arabam.com/ilan/sahibinden-satilik...,https://arbimg1.mncdn.com/ilanfotograflari/noI...,BMW 3 Serisi 318i Standart,Sahibinden BMW 3 Serisi 318i Standart 1993 Model,1993,372.0,Lacivert,https://www.arabam.com/ilan/sahibinden-satilik...,200.000 TL,28 Haziran 2024,Ankara,Etimesgut
995,https://www.arabam.com/ilan/galeriden-satilik-...,https://arbimg1.mncdn.com/ilanfotograflari/noI...,BMW 3 Serisi 316i M Sport,Hatasız boyasız tramersiz,2015,196.0,Mavi,https://www.arabam.com/ilan/galeriden-satilik-...,1.275.000 TL,28 Haziran 2024,Muğla,Fethiye
996,https://www.arabam.com/ilan/galeriden-satilik-...,https://arbimg1.mncdn.com/ilanfotograflari/noI...,BMW 3 Serisi 318i Prestige,"""Mec Motors"" 130.000km Hatasız-Boyasız Isıtma-...",2017,130.0,Siyah,https://www.arabam.com/ilan/galeriden-satilik-...,1.245.000 TL,28 Haziran 2024,Balıkesir,Gönen
997,https://www.arabam.com/ilan/galeriden-satilik-...,https://arbimg1.mncdn.com/ilanfotograflari/noI...,BMW 3 Serisi 320d M Sport,AES AUTO'DAN 2012 BMW 320 D DIŞ M SPORT,2012,286.0,Siyah,https://www.arabam.com/ilan/galeriden-satilik-...,1.050.750 TL,28 Haziran 2024,Denizli,Merkezefendi


## Removing unnecessary columns from the DataFrame

In [5]:
# Raw columns that are not carried into the cleaned data set.
columns_to_be_dropped = [
    "listing-text-new 2",  # free-text listing headline
    "listing-image src",  # listing image URL
    "fade-out-content-wrapper href 4",  # listing page URL
    "fade-out-content-wrapper 4",  # listing date
    "smallest-text-minus href"  # listing page URL
]

In [6]:
# Remove the unused columns; drop_columns mutates df in place and returns None.
drop_columns(df, columns_to_be_dropped)

## Renaming columns to more understandable names

In [7]:
# pop() removes the raw column and returns its values, which are stored as
# strings in a new trailing column with a readable name.
df["district"] = df.pop("fade-out-content-wrapper 6").astype(str)

In [8]:
# pop() removes the raw column and returns its values, which are stored as
# strings in a new trailing column with a readable name.
df["city"] = df.pop("fade-out-content-wrapper 5").astype(str)

## Making necessary corrections and cleaning

In [9]:
# Prices arrive as text such as "750.000 TL": remove the "." separators first,
# then keep the leading number (everything before the first space) as an integer.
df["db"] = df["db"].astype(str).str.replace(".", "", regex=False)
df["price"] = df["db"].str.split(" ").str[0].astype(int)

In [10]:
# The raw price text is redundant now that the numeric "price" column exists.
drop_columns(df, ["db"])

In [12]:
# List the distinct raw colour labels present in the data.
df["fade-out-content-wrapper 3"].unique()

array(['Siyah', 'Beyaz', 'Füme', 'Gri (titanyum)', 'Kırmızı',
       'Kahverengi', 'Gri (Gümüş)', 'Mavi (metalik)', 'Gri (metalik)',
       'Gri', 'Lacivert', 'Bordo', 'Yeşil', 'Mavi', 'Diğer', 'Turuncu',
       'Yeşil (metalik)', 'Mor', 'Sarı', 'Şampanya'], dtype=object)

In [13]:
# Maps colour variants (e.g. metallic shades) onto their base colour name.
replace_dict = {
    "Gri (titanyum)": "Gri",
    "Gri (Gümüş)": "Gri",
    "Gri (metalik)": "Gri",
    "Mavi (metalik)": "Mavi",
    # NOTE(review): "Diğer" ("other") is relabelled as "Beyaz" ("white") —
    # confirm this is intended, since it hides listings with an unknown colour.
    "Diğer": "Beyaz",
    "Yeşil (metalik)": "Yeşil"
}

In [14]:
# Series.replace accepts the old -> new mapping directly, so the dict views
# (and the type-checker suppression they required) are unnecessary. pop()
# removes the raw column while handing its values over to the new "color" column.
df["color"] = df.pop("fade-out-content-wrapper 3").replace(replace_dict)

In [15]:
# Check which colour names remain after consolidation.
df["color"].unique()

array(['Siyah', 'Beyaz', 'Füme', 'Gri', 'Kırmızı', 'Kahverengi', 'Mavi',
       'Lacivert', 'Bordo', 'Yeşil', 'Turuncu', 'Mor', 'Sarı', 'Şampanya'],
      dtype=object)

In [18]:
# The mileage was read as a float (e.g. 349.0, 83.5); presumably the "." was a
# thousands separator in the source text — TODO confirm. Formatting with three
# decimals and removing the "." turns 349.0 into 349000 and 83.5 into 83500.
df["km"] = (
    df.pop("fade-out-content-wrapper 2")
    .map("{:.3f}".format)
    .str.replace(".", "", regex=False)
    .astype(int)
)

In [21]:
# Move the raw column to a new trailing column named "year".
df["year"] = df.pop("fade-out-content-wrapper")

In [24]:
# Listing titles look like "BMW 3 Serisi 320d M Sport", so the model
# designation is the fourth space-separated token; splitting at most four
# times is enough to isolate it.
df["model"] = df["listing-text-new"].apply(lambda title: title.split(" ", 4)[3])

In [27]:
# Classify every model designation as "diesel" or "gasoline".
df["fuel_type"] = df["model"].map(determine_fuel_type)

In [28]:
# The listing title is no longer needed once "model" and "fuel_type" are derived from it.
drop_columns(df, ["listing-text-new"])

In [29]:
# Show the distinct fuel type labels that were assigned.
df["fuel_type"].unique()

array(['diesel', 'gasoline'], dtype=object)

## Cleaned version of the data set

In [30]:
# Summarise column names, non-null counts and dtypes of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   district   999 non-null    object
 1   city       999 non-null    object
 2   price      999 non-null    int64 
 3   color      999 non-null    object
 4   km         999 non-null    int64 
 5   year       999 non-null    int64 
 6   model      999 non-null    object
 7   fuel_type  999 non-null    object
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [31]:
# Plain-text view of the cleaned frame (pandas elides the middle rows).
print(df)

         district       city    price     color      km  year model fuel_type
0         Sarıçam      Adana   750000     Siyah  349000  2007  330d    diesel
1        Gülşehir   Nevşehir  1120000     Beyaz   83500  2012  316i  gasoline
2          Merkez    Isparta   812000     Beyaz  146000  2011  316i  gasoline
3         Akşehir      Konya  1185000     Beyaz  153000  2010  320i  gasoline
4      İskenderun      Hatay  1195000      Füme  130000  2014  320i  gasoline
..            ...        ...      ...       ...     ...   ...   ...       ...
994     Etimesgut     Ankara   200000  Lacivert  372000  1993  318i  gasoline
995       Fethiye      Muğla  1275000      Mavi  196000  2015  316i  gasoline
996         Gönen  Balıkesir  1245000     Siyah  130000  2017  318i  gasoline
997  Merkezefendi    Denizli  1050750     Siyah  286000  2012  320d    diesel
998     Etimesgut     Ankara   975000     Siyah  203000  2009  320i  gasoline

[999 rows x 8 columns]


## Saving cleaned data

In [237]:
# Write the cleaned frame to disk; index=False keeps the row index out of the CSV.
df.to_csv("data.csv", index=False)