In [130]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [7]:
# import dataset
car_sales = pd.read_csv("../../dataset/car-sales-extended-missing-data.csv")

In [9]:
car_sales.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [11]:
car_sales.dropna(subset=['Price'], inplace=True)

In [13]:
car_sales.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [17]:
len(car_sales)

950

In [29]:
car_sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 950 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           903 non-null    object 
 1   Colour         904 non-null    object 
 2   Odometer (KM)  902 non-null    float64
 3   Doors          903 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 44.5+ KB


In [39]:
car_sales['Doors'].value_counts()

Doors
4.0    768
5.0     71
3.0     64
Name: count, dtype: int64

In [41]:
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [91]:
# Setup random seed
np.random.seed(42)

# Define different feature and different transformer
categorycal_feature = ["Make", "Colour"]
categorical_transformer = Pipeline(steps=[
                        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
                        ("oneHot", OneHotEncoder(handle_unknown="ignore"))
                        ])

door_feature = ["Doors"]
door_transformer = Pipeline(steps=[
                        ("impute", SimpleImputer(strategy="constant", fill_value=4))
                        ])

numeric_feature = ['Odometer (KM)']
numeric_transformer = Pipeline(steps=[
                        ("imputer", SimpleImputer(strategy="mean"))  
                        ])

# setup perprocessing setp for missing value and turn category into number
preprocessor = ColumnTransformer([
                            ("category", categorical_transformer, categorycal_feature),
                            ("door", door_transformer, door_feature),
                            ("number", numeric_transformer, numeric_feature)
                            ])

# create preprocessing and modeling pipeline
model = Pipeline(steps=[
                ("preprocessing", preprocessor), 
                ("model", RandomForestRegressor())    
                ])

# Create X, y
X = car_sales.drop('Price', axis=1)
y = car_sales['Price']

# Split dataset into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model.fit(X_train, y_train)
model.score(X_test, y_test)

0.22188417408787875

In [97]:
model.get_params()

{'memory': None,
 'steps': [('preprocessing',
   ColumnTransformer(transformers=[('category',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(fill_value='missing',
                                                                   strategy='constant')),
                                                    ('oneHot',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                    ['Make', 'Colour']),
                                   ('door',
                                    Pipeline(steps=[('impute',
                                                     SimpleImputer(fill_value=4,
                                                                   strategy='constant'))]),
                                    ['Doors']),
                                   ('number',
                                    Pipeline(steps=[('imputer', SimpleImp

In [128]:
# Improve the model with GridSearchCV

para_grid = {
    'preprocessing__number__imputer__strategy': ['mean', 'median'],
    'model__n_estimators': [100, 1000],
    'model__max_depth': [None, 5],
    'model__max_features': ["sqrt", "log2"],
    'model__min_samples_split': [2, 4], 
    'model__min_samples_leaf': [1, 2] 
}

gs_model = GridSearchCV(model, para_grid, cv=5, verbose=2)

gs_model.fit(X_train, y_train)


Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, preprocessing__number__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, preprocessing__number__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, preprocessing__number__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=100, preprocessing__number__imputer__strategy=mean; total time=   0.1s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_

In [126]:
gs_model.score(X_test, y_test)

0.28090562243779293

In [134]:
# Improve the model with RandomizedSearchCV

param_distributions = {
    'preprocessing__number__imputer__strategy': ['mean', 'median'],
    'model__n_estimators': [100, 1000],
    'model__max_depth': [None, 5],
    'model__max_features': ["sqrt", "log2"],
    'model__min_samples_split': [2, 4], 
    'model__min_samples_leaf': [1, 2] 
}

rs_model = RandomizedSearchCV(model, param_distributions, n_iter=10, cv=5, verbose=2)

rs_model.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=100, preprocessing__number__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=100, preprocessing__number__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=100, preprocessing__number__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=4, model__n_estimators=100, preprocessing__number__imputer__strategy=mean; total time=   0.0s
[CV] END model__max_depth=None, model__max_features=log2, model__min_samples_leaf=2, model__min_samples_split=4, model__n_e

In [136]:
rs_model.score(X_test, y_test)

0.2943857867241384

In [138]:
# Save file
from joblib import dump
dump(rs_model, "rs_car_sales_regression_model.joblib")
print("Model Save Successfully")


Model Save Successfully
