## Combining the Entire Scikit-Learn Workflow

* All data should be numerical
* No missing values
* Apply the same transformations to the test set as to the training set
* Never test on data you have trained on
* Tune hyperparameters on a validation set or with cross-validation
* A top score on a single metric doesn't mean the model is the best one. Evaluate it with other metrics too

In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
# Load the car-sales dataset (it contains missing values) and preview it
data = pd.read_csv(
    filepath_or_buffer='../data/car-sales-extended-missing-data.csv'
)
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [4]:
# Number of missing values in each column
data.isnull().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [7]:
# Get data ready
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# Seed NumPy's global RNG so the split and the forest below are repeatable
# (neither of them is given its own random_state in this cell)
np.random.seed(2)

# Drop rows with missing labels.
# Reassign instead of using inplace=True so the frame object that earlier
# cells loaded and displayed is not mutated behind their back.
data = data.dropna(subset=['Price'])

# One small pipeline per feature group. The step names ('imputer', 'onehot')
# and transformer names ('cat', 'door', 'num') are part of the parameter paths
# used when tuning, so they must stay as they are.

# Text columns: fill gaps with the literal 'missing', then one-hot encode
categorical_features = ['Make', 'Colour']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Door count: fill gaps with the constant 4
door_features = ['Doors']
door_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value=4, strategy='constant')),
])

# Odometer: fill gaps with the column mean (SimpleImputer's default strategy)
numeric_features = ['Odometer (KM)']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Preprocessing: route each feature group through its own transformer so that
# missing values are filled and everything ends up numeric
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_features),
    ('door', door_transformer, door_features),
    ('num', numeric_transformer, numeric_features),
])

# Full workflow in one estimator: preprocessing first, then the regressor
model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor()),
])

# Separate the features from the target column
x = data.drop(columns=['Price'])
y = data['Price']

# Hold out 20% of the rows for testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Fit on the training rows, then report the score on the held-out rows
model.fit(x_train, y_train).score(x_test, y_test)

0.3146316796860532

## Using `GridSearchCV` with a `Pipeline`

In [26]:
# Hyperparameter grid. Keys are '<step>__<param>' paths: every double
# underscore descends one level into the nested pipeline.
pipe_grid = {
    # 'preprocessor' step -> 'num' transformer -> 'imputer' step -> strategy
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    # Parameters of the final 'model' step
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 5],
    'model__min_samples_split': [2, 4]
}

# 2 * 2 * 2 * 2 = 16 candidates, each evaluated with 5-fold cross-validation
gs_model = GridSearchCV(
    estimator=model,
    param_grid=pipe_grid,
    cv=5,
    verbose=2
)

# This may take a minute.
# The search has to be fitted in this cell: the later gs_model.score(...) call
# fails on an unfitted search, so the notebook would not survive a fresh
# Restart & Run All if this line stayed commented out.
gs_model.fit(x_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   1.2s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   1.1s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   1.1s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   1.2s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   1.2s
[CV] END model__max_depth=None, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=median; total time=   1.4s
[CV] END model__max_depth=None, model__min_samples_spli

[CV] END model__max_depth=5, model__min_samples_split=2, model__n_estimators=200, preprocessor__num__imputer__strategy=median; total time=   1.2s
[CV] END model__max_depth=5, model__min_samples_split=2, model__n_estimators=200, preprocessor__num__imputer__strategy=median; total time=   1.1s
[CV] END model__max_depth=5, model__min_samples_split=2, model__n_estimators=200, preprocessor__num__imputer__strategy=median; total time=   1.0s
[CV] END model__max_depth=5, model__min_samples_split=2, model__n_estimators=200, preprocessor__num__imputer__strategy=median; total time=   1.3s
[CV] END model__max_depth=5, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.8s
[CV] END model__max_depth=5, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.5s
[CV] END model__max_depth=5, model__min_samples_split=4, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; tot

In [28]:
# Score the fitted grid search on the held-out test split
gs_model.score(X=x_test, y=y_test)

0.38465493295259234