In [1]:
import pandas as pd

data = pd.read_csv('data/car-sales-extended-missing-data.csv')
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [2]:
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [4]:
# get missing data count
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

refine with pipeline

1. fil missing values
2. convert strings to numbers

In [7]:
# gettind the data ready
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


#modellinng
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV

#set up random seed
import numpy as np
np.random.seed(42)


#import and drop rows with missing data
data = pd.read_csv('data/car-sales-extended-missing-data.csv')
data.dropna(subset=['Price'],inplace=True) 
# we did this on the price column because that is what we will predict, hence corresponding rows with missinng price has to be removed

# define features and transformer pipelines
categorical_features = ['Make', 'Colour']
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='constant',fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

door_feature = ['Doors']
door_transformer = Pipeline(
   steps=[ ('imputer', SimpleImputer(strategy='constant',fill_value=4))]
    # here we fill missing values with 4 because that is the majority in the column
)


numerical_features = ['Odometer (KM)']
numerical_transformer = Pipeline(
    steps= [
        ('imputer',SimpleImputer(strategy='mean'))
    ]
)

# set up pre processing steps: fill missing values and convert text to numbers

preprocessor = ColumnTransformer(
    transformers=[
    ('cat',categorical_transformer,categorical_features),
    ('door',door_transformer,door_feature),
    ('num',numerical_transformer, numerical_features)
    ]
)


# create preprocessing and modelling pipeline
model  = Pipeline(
    steps=[
    ('preprocessor',preprocessor),
    ('model',RandomForestRegressor())

    ]

)

# split the data
x = data.drop('Price',axis=1)
y = data['Price']

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

model.fit(x_test,y_test)
model.score(x_test,y_test)

0.849957993113725

## using GridSearchCV or RandomisedSearchCV with the regression pipeline

In [11]:

pipe_grid = {
    "preprocessor__num__imputer__strategy" : ['mean','median'],
    "model__n_estimators" : [100,1000],
    "model__max_depth" : [None,5],
    "model__max_features": ['auto'],
    "model__min_samples_split": [2,4 ]
}

gs_model = GridSearchCV(model,pipe_grid,cv=5,verbose=2)
gs_model.fit(x_train,y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   1.1s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.5s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.4s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.4s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.5s
[CV] END model__max_depth=None, model__max_features=auto, model__min_sampl

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(fill_value='missing',
                                                                                                        strategy='constant')),
                                                                                         ('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['Make',
                                                                          'Colour']),
                                                                        ('door',
         