In [37]:
import pandas as pd
import numpy as np
import os
import warnings

warnings.filterwarnings("ignore")

In [2]:
# Load every per-brand CSV under bases/car-db/ and stack them into one frame.
# Frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
brand_frames = []
running_nulls = 0
# sorted() makes the load order (and therefore the row order) independent of
# the arbitrary order in which os.listdir returns directory entries.
for file_name in sorted(os.listdir("bases/car-db/")):
    brand_df = pd.read_csv(f"bases/car-db/{file_name}")
    # Tag each row with its source file name minus the extension (e.g. "audi").
    brand_df['car'] = file_name.split(".")[0]
    brand_frames.append(brand_df)
    # Running total of missing cells across the files loaded so far.
    running_nulls += int(brand_df.isna().sum().sum())
    print(f"{file_name} - Nulls: {running_nulls}")

# Row labels from each file are kept as-is (no ignore_index); the index is
# reset later, right before the features/target split.
df = pd.concat(brand_frames)

# Files with differing column sets would be padded with NaN by concat; report
# that explicitly because the per-file counts above cannot see it.
total_nulls = int(df.isna().sum().sum())
if total_nulls != running_nulls:
    print(f"Column mismatch between files added {total_nulls - running_nulls} nulls")

df.sample(5)

audi.csv - Nulls: 0
ford.csv - Nulls: 0
merc.csv - Nulls: 0
skoda.csv - Nulls: 0
toyota.csv - Nulls: 0
vauxhall.csv - Nulls: 0
vw.csv - Nulls: 0


Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,car
16427,Mondeo,2015,10995,Manual,66509,Diesel,125,58.9,2.0,ford
573,A1,2016,13995,Manual,8577,Petrol,30,55.4,1.4,audi
6718,T-Cross,2019,18225,Manual,6000,Petrol,145,47.9,1.0,vw
7352,Polo,2019,12995,Manual,11756,Petrol,145,48.7,1.0,vw
12354,Focus,2019,17991,Manual,1232,Diesel,150,61.4,1.5,ford


In [3]:
# Name of the column used as the prediction target.
target_name = 'price'

# Categories

In [4]:
# Names of the columns stored with the generic object dtype (the text fields).
categories_cols = df.dtypes.loc[lambda kinds: kinds == object].index
# Number of distinct values per such column, lowest cardinality first.
df.loc[:, categories_cols].nunique().sort_values()

transmission      4
fuelType          5
car               7
model           155
dtype: int64

In [5]:
# 'model' has high cardinality (155 distinct values), so it is dropped rather
# than encoded.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# re-runnable: executing it a second time no longer raises KeyError.
df = df.drop(columns='model', errors='ignore')

# Numeric

In [6]:
# Min, tail percentiles, median and max of every numeric column (one row per
# column after the transpose) to spot outliers.
# numeric_only=True is required because df still contains text columns at this
# point; pandas >= 2.0 no longer skips them silently and raises instead.
df.quantile([0,0.01,0.05,0.5,0.95,0.99,1], numeric_only=True).T

Unnamed: 0,0.00,0.01,0.05,0.50,0.95,0.99,1.00
year,1970.0,2010.0,2014.0,2017.0,2019.0,2020.0,2060.0
price,450.0,3990.0,6490.0,13990.0,32995.0,49990.0,159999.0
mileage,1.0,10.0,1000.0,17384.0,63100.0,96000.0,323000.0
tax,0.0,0.0,0.0,145.0,200.0,260.0,580.0
mpg,0.3,30.1,36.7,54.3,74.3,83.1,235.4
engineSize,0.0,1.0,1.0,1.5,2.4,3.0,6.3


In [50]:
import pandas_profiling

# Build a profiling report over the current frame; the html option requests
# the full-width page style.
report = pandas_profiling.ProfileReport(
    df,
    title='Profile Report',
    html={'style': {'full_width': True}},
)

In [53]:
# Write the profiling report to an HTML file (path relative to the working directory).
report.to_file("carAnalysis.html")

HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




In [7]:
# Year outliers: retain only the rows whose year lies strictly between 2009
# and 2022.
df = df.loc[df['year'].gt(2009) & df['year'].lt(2022)]

# Modelling

In [26]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split

In [27]:
# One-hot encode the remaining categorical columns; drop_first removes the
# first level of each so the indicators are not redundant.
df = pd.get_dummies(data=df, drop_first=True)

In [28]:
# Give the frame a clean 0..n-1 index, then separate features from the target.
# The target column is taken from target_name (defined near the top of the
# notebook) instead of repeating the literal column name here.
df = df.reset_index(drop=True)
X = df.drop(columns=target_name)
y = df[target_name]

In [29]:
# Put the continuous features on the same scale while leaving the two-valued
# (dummy) columns as plain 0/1 integers.
# The dummies are deliberately never passed through a scaler: standardising
# them, min-max scaling them back and truncating with astype(int) can turn a
# value that lands a hair below 1.0 into 0.
is_binary = X.nunique() == 2
binary_cols = X.columns[is_binary]
numeric_cols = X.columns[~is_binary]

scaler = StandardScaler()
# NOTE(review): the scaler is fit on all rows, before the train/test split in
# the next cell, so test-set statistics leak into the training features.
# Consider fitting it on the training rows only (e.g. inside a Pipeline).
X_numeric = pd.DataFrame(
    scaler.fit_transform(X[numeric_cols]),
    columns=numeric_cols,
    index=X.index,
)
# Reassemble in the original column order.
X = pd.concat([X_numeric, X[binary_cols].astype(int)], axis=1)[X.columns]

In [30]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [44]:
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

In [41]:
from sklearn.model_selection import cross_validate, KFold, GridSearchCV

# Models

In [49]:
# Candidate regressors for the grid search. Each entry holds:
#   'model_name' - label used in printed output and the results table
#   'params'     - hyper-parameter grid handed to GridSearchCV
#   'model'      - unfitted estimator instance
models = [
    {
        'model_name':'ElasticNet',
        'params':{
            # NOTE(review): alpha=0 removes the penalty entirely; scikit-learn
            # advises against fitting that case with ElasticNet, and any
            # resulting warnings are hidden by the notebook-wide warnings filter.
            'alpha':[0,0.1,0.5,0.9,1],
            'l1_ratio':[0,0.1,0.5,0.9,1],
        },
        'model':ElasticNet(),
    },
    {
        'model_name':'DecisionTreeRegressor',
        'params':{
            'max_depth':[3,10,40,None],
            'min_samples_split':[2,50,300]
        },
        # Seeded like the data split and the CV folds: without a fixed
        # random_state, ties between equally good splits are broken randomly
        # and scores can differ from run to run.
        'model':DecisionTreeRegressor(random_state=42),
    },
    {
        'model_name':'KNeighborsRegressor',
        'params':{
            "n_neighbors":[3,5,7,9]
        },
        'model':KNeighborsRegressor(),
    }
]

# Cross Validation

In [51]:
# Grid-search every candidate on the training split, using one shared, seeded
# 5-fold splitter so that all models are scored on identical folds.
evaluation = []
cv = KFold(5,shuffle=True,random_state=42)
for model in models:
    print("-----------")
    # Read the name from the current entry; indexing the `models` list itself
    # with a string key raises TypeError.
    print(model['model_name'])

    grid = GridSearchCV(model['model'],param_grid=model['params'],cv=cv,return_train_score=True)
    grid.fit(X_train,y_train)
    # Mean cross-validated score of the best parameter combination.
    print("\t", grid.best_score_)
    # Keep the fitted search on the entry so later cells can read cv_results_.
    model['crossValidation'] = grid
    evaluation.append(model)

-----------
ElasticNet
	 0.7768796234897191
-----------
DecisionTreeRegressor
	 0.922499132941704
-----------
KNeighborsRegressor
	 0.9332460034923531


In [64]:
# Copy the train/test mean and standard deviation of each model's top-ranked
# parameter combination onto its entry in `models`.
score_keys = ['mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score']
for model in models:
    cv_table = pd.DataFrame(model['crossValidation'].cv_results_)
    # First row ranked 1 on the test score (ties resolve to the earliest row).
    best_row = cv_table[cv_table['rank_test_score'] == 1].iloc[0]
    for key in score_keys:
        model[key] = best_row[key]

In [66]:
# One row per model; display only the name and the mean train/test CV scores.
results = pd.DataFrame(models)
summary_cols = ['model_name', 'mean_train_score', 'mean_test_score']
results.loc[:, summary_cols]

Unnamed: 0,model_name,mean_train_score,mean_test_score
0,ElasticNet,0.777287,0.77688
1,DecisionTreeRegressor,0.944418,0.922499
2,KNeighborsRegressor,0.956527,0.933246
