In [2]:
import pandas as pd
import seaborn as sns
import matplotlib as plt
import numpy as np

### Importing the Data

The data is comma seperated, and the raw data being read from absolulute directory /data/raw/

In [63]:
df = pd.read_csv("../data/raw/autoscout24-germany-dataset.csv")

display(df)

Unnamed: 0,mileage,make,model,fuel,gear,offerType,price,hp,year
0,235000,BMW,316,Diesel,Manual,Used,6800,116.0,2011
1,92800,Volkswagen,Golf,Gasoline,Manual,Used,6877,122.0,2011
2,149300,SEAT,Exeo,Gasoline,Manual,Used,6900,160.0,2011
3,96200,Renault,Megane,Gasoline,Manual,Used,6950,110.0,2011
4,156000,Peugeot,308,Gasoline,Manual,Used,6950,156.0,2011
...,...,...,...,...,...,...,...,...,...
46400,99,Fiat,500,Electric/Gasoline,Manual,Pre-registered,12990,71.0,2021
46401,99,Fiat,500,Electric/Gasoline,Manual,Pre-registered,12990,71.0,2021
46402,99,Fiat,500,Electric/Gasoline,Manual,Pre-registered,12990,71.0,2021
46403,99,Fiat,500,Electric/Gasoline,Manual,Pre-registered,12990,71.0,2021


### Cleaning Data

The data is 99.3% complete, but unfortunately there are 334 rows (out of 46405 rows) that contain incomplete information about the vehicle.
- Mileage, make, fuel, offerType, price, and year contain complete information.
- 143 rows contain missing vehicle model, for these cases I am going to replace the blank space with "Unknown" since the data associated with the vehicle is still valuable even if the model is unkown.
- 182 rows contain missing transmission information, these will be dropped since this column will be converted to a numeric later.
- 29 vehicles are missing their horsepower, I am going to delete these few rows since they are inconsequencial compared to the size of the data set and provide little use with such an impactful metric missing
- There are 23 vehicles with brand "Others", these vehicles provide no insight into make/model or any other correlations that can be drawn thus they are dropped from the dataframe.
- In between each of the row drop operations, the row index's are re-indexed from 0 for the next row drop operation.
- I find "gear" a little odd, I prefer to use "transmission" so I am going to rename the column.

In [88]:
df = pd.read_csv("../data/raw/autoscout24-germany-dataset.csv")

df["model"] = df["model"].fillna("Unknown")

df = df.drop(df[df["gear"].isna()].index)

df = df.reset_index()
df = df.drop("index", axis = "columns")

df = df.drop(df[df["hp"].isna()].index)

df = df.reset_index()
df = df.drop("index", axis = "columns")

df = df.drop(df[df["make"] == "Others"].index)

df = df.reset_index()
df = df.drop("index", axis = "columns")

df["transmission"] = df["gear"]
df = df.drop("gear", axis = "columns")

df = df[["mileage", "make", "model", "fuel", "transmission", "offerType", "price", "hp", "year"]]

display(df)

Unnamed: 0,mileage,make,model,fuel,transmission,offerType,price,hp,year
0,235000,BMW,316,Diesel,Manual,Used,6800,116.0,2011
1,92800,Volkswagen,Golf,Gasoline,Manual,Used,6877,122.0,2011
2,149300,SEAT,Exeo,Gasoline,Manual,Used,6900,160.0,2011
3,96200,Renault,Megane,Gasoline,Manual,Used,6950,110.0,2011
4,156000,Peugeot,308,Gasoline,Manual,Used,6950,156.0,2011
...,...,...,...,...,...,...,...,...,...
46175,99,Fiat,500,Electric/Gasoline,Manual,Pre-registered,12990,71.0,2021
46176,99,Fiat,500,Electric/Gasoline,Manual,Pre-registered,12990,71.0,2021
46177,99,Fiat,500,Electric/Gasoline,Manual,Pre-registered,12990,71.0,2021
46178,99,Fiat,500,Electric/Gasoline,Manual,Pre-registered,12990,71.0,2021


### Data

This section is the same content as the previous, except all of the methods called are wrapped into one method chain.

In [14]:
def load_and_process(path):
    df = (
        pd.read_csv(path)
        .pipe(lambda x: x.drop(x.loc[lambda row : row["gear"].isna()].index))
        .reset_index(drop = "true")
        .pipe(lambda x: x.drop(x.loc[lambda row : row["hp"].isna()].index))
        .reset_index(drop = "true")
        .pipe(lambda x: x.drop(x.loc[lambda row : row["make"] == "Others"].index))
        .reset_index(drop = "true")
        .assign(transmission = lambda x : x.gear)
        .drop("gear", axis = "columns")
        .reindex(columns = ["mileage", "fuel", "model", "fuel", "transmission", "offerType", "price", "hp", "year"])
    )
    return df
df = load_and_process("../data/raw/autoscout24-germany-dataset.csv")

Unnamed: 0,mileage,fuel,model,fuel.1,transmission,offerType,price,hp,year
0,235000,Diesel,316,Diesel,Manual,Used,6800,116.0,2011
1,92800,Gasoline,Golf,Gasoline,Manual,Used,6877,122.0,2011
2,149300,Gasoline,Exeo,Gasoline,Manual,Used,6900,160.0,2011
3,96200,Gasoline,Megane,Gasoline,Manual,Used,6950,110.0,2011
4,156000,Gasoline,308,Gasoline,Manual,Used,6950,156.0,2011
...,...,...,...,...,...,...,...,...,...
46175,99,Electric/Gasoline,500,Electric/Gasoline,Manual,Pre-registered,12990,71.0,2021
46176,99,Electric/Gasoline,500,Electric/Gasoline,Manual,Pre-registered,12990,71.0,2021
46177,99,Electric/Gasoline,500,Electric/Gasoline,Manual,Pre-registered,12990,71.0,2021
46178,99,Electric/Gasoline,500,Electric/Gasoline,Manual,Pre-registered,12990,71.0,2021
