In [170]:
import pandas as pd
import numpy as np                                                 # Numerical operations
from sklearn.compose import ColumnTransformer                      # Applies different preprocessing to different columns.
from sklearn.pipeline import Pipeline                              # Streamlines preprocessing and model training.
from sklearn.impute import SimpleImputer                           # Fills in missing data.
from sklearn.preprocessing import OneHotEncoder                    # Converts categorical data into a numerical format.

# Modeling 
from sklearn.ensemble import RandomForestRegressor                 # Predicts continuous values using multiple decision trees.
from sklearn.model_selection import train_test_split               # Splits data into training and testing sets. 

# ToDo
# 1) Import the data
# 2) Check the missing data
# 3) Check that all the data is numerical
# 4) Drop the "Price" column from the DataFrame so that it no longer exists among the features
# 5) Convert the data to numbers

In [173]:
# 1) Load the car-sales CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer="data/car-sales-extended-missing-data.csv")

# Preview the first ten rows
data.head(n=10)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
5,Honda,Red,42652.0,4.0,23883.0
6,Toyota,Blue,163453.0,4.0,8473.0
7,Honda,White,,4.0,20306.0
8,,White,130538.0,4.0,9374.0
9,Honda,Blue,51029.0,4.0,26683.0


In [175]:
# 2) Count the missing (NaN) entries in every column
data.isna().sum(axis=0)

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [177]:
# 3) Inspect each column's dtype to see which columns are not yet numerical
#    (object-dtype columns are the ones that still need converting to numbers).
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [179]:
# Display the DataFrame (the rendered output elides the middle rows).
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [181]:
# NOTE(review): this cell redisplays `data` just as the previous cell does;
# it looks redundant and can probably be removed - confirm nothing changed in between.
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [213]:
# Fill in the missing data: one small pipeline per group of columns.

# "Make" and "Colour": replace NaN with the literal string "missing", then
# one-hot encode; handle_unknown="ignore" means categories not seen during
# fitting do not raise at transform time.
categorical_features = ["Make", "Colour"]
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# "Doors": replace NaN with the constant 4.
door_feature = ["Doors"]
door_transformer = Pipeline(
    [("imputer", SimpleImputer(strategy="constant", fill_value=4))]
)

# "Odometer (KM)": replace NaN with the column mean.
odometer_feature = ["Odometer (KM)"]
odometer_transformer = Pipeline(
    [("imputer", SimpleImputer(strategy="mean"))]
)


In [250]:
# Turn every feature column into numbers by routing each column group through
# its own pipeline. Each entry is (name, transformer, columns it applies to).
preprocessor = ColumnTransformer(
    [
        ("catg", categorical_transformer, categorical_features),
        ("door", door_transformer, door_feature),
        ("odo", odometer_transformer, odometer_feature),
    ]
)

In [256]:
# Create a single estimator that chains preprocessing and modelling.
# The final step keeps the name 'classifier' (even though it is a regressor)
# because hyper-parameter keys of the form "classifier__<param>" are built
# from this step name.
# random_state is fixed so that repeated fits on the same data build the same
# forest and the reported scores are reproducible.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestRegressor(random_state=42)),
])

In [258]:
# Rows without a target value cannot be used for supervised training, so
# remove them. Reassigning (instead of inplace=True) avoids mutating the
# DataFrame object in place; `data` still ends up bound to the filtered frame.
data = data.dropna(subset=["Price"])

# Features are every column except the target; the target is "Price".
X = data.drop(columns="Price")
y = data["Price"]

# Hold out 20% of the rows for testing. random_state is fixed so the same
# split (and therefore a comparable score) is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the preprocessing + model pipeline on the training split only
model.fit(X_train, y_train)

# Evaluate on the held-out split (for scikit-learn regressors, score() is R^2)
model.score(X_test, y_test)

0.3050818636832555

# Use GridSearchCV with our regression Pipeline

In [261]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid. Each key is "<pipeline step>__<parameter>", nested
# through the ColumnTransformer for the odometer imputer.
# 2 x 2 x 2 x 1 x 2 = 16 candidate combinations.
pipe_grid = dict(
    preprocessor__odo__imputer__strategy=["mean", "median"],  # how missing odometer values are imputed
    classifier__n_estimators=[100, 1000],                     # number of trees in the forest
    classifier__max_depth=[None, 5],                          # maximum depth of each tree
    classifier__max_features=["sqrt"],                        # 'sqrt' is used instead of 'auto'
    classifier__min_samples_split=[2, 4],                     # minimum samples needed to split an internal node
)

# Search the grid with 5-fold cross-validation; verbose=2 logs every fit
gs_model = GridSearchCV(estimator=model, param_grid=pipe_grid, cv=5, verbose=2)

# Run the search on the training split
gs_model.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_split=2, classifier__n_estimators=100, preprocessor__odo__imputer__strategy=mean; total time=   0.0s
[CV] END classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_split=2, classifier__n_estimators=100, preprocessor__odo__imputer__strategy=mean; total time=   0.1s
[CV] END classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_split=2, classifier__n_estimators=100, preprocessor__odo__imputer__strategy=mean; total time=   0.0s
[CV] END classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_split=2, classifier__n_estimators=100, preprocessor__odo__imputer__strategy=mean; total time=   0.0s
[CV] END classifier__max_depth=None, classifier__max_features=sqrt, classifier__min_samples_split=2, classifier__n_estimators=100, preprocessor__odo__imputer__strategy

In [263]:
# Score the fitted grid-search object on the held-out test split
gs_model.score(X_test, y_test)

0.31394440759535336