#### 7. Revisit the pipeline one more time

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

##### 7.1 Creating a regression Pipeline

In [2]:
# Load the raw car-sales CSV (still containing missing values) and display it
# for a first look at the columns.
data= pd.read_csv("car-sales-extended-missing-data.csv")
data

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [3]:
# Inspect the dtype pandas inferred for each column.
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [4]:
# Count the missing (NaN) entries in each column.
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

There are 1000 rows, 3 categorical features (Make, Colour, Doors), 2 numerical columns (Odometer (KM), Price), and 249 missing values in total.

So we need to turn the categorical features into numbers and fill the missing values before fitting the model. To do this, we build a `Pipeline()`.

The input to `Pipeline()` is `steps`, a list of `(step_name, action_to_take)` tuples.

In our case, steps are:
- Fill missing data
- Convert data to numbers
- Build a model on the data

Let's do it!!!!

In [5]:
# Getting the data ready
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# Setup random seed
# train_test_split and RandomForestRegressor below are called without an
# explicit random_state, so they draw from NumPy's global RNG. Seeding it here
# is what makes the split and the forest repeatable when the cell is run from
# the top.
import numpy as np
np.random.seed(42)

# Load the raw CSV, then keep only the rows that have a target value (Price);
# rows without a label cannot be used for fitting or scoring.
data = pd.read_csv("car-sales-extended-missing-data.csv")
data = data.dropna(subset=["Price"])

# Define one small preprocessing pipeline per group of feature columns.
# Every pipeline names its imputation step exactly "imputer" so that any of
# them can be addressed uniformly in a parameter grid, e.g.
# "preprocessor__cat__imputer__strategy".

# Make / Colour: replace missing entries with the literal label "missing",
# then one-hot encode. handle_unknown="ignore" keeps transform() from raising
# on a category that was not present in the data the encoder was fitted on.
cat_features = ["Make", "Colour"]
cat_transformer = Pipeline(steps=[
    # The original step name was "imputer " (trailing space), which made it
    # inconsistent with the other pipelines and awkward to reference.
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))])

# Doors: replace missing entries with the constant 4.
door_features = ["Doors"]
door_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=4))])

# Odometer (KM): replace missing entries with the column mean.
num_feature = ["Odometer (KM)"]
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean"))])

# Route each group of columns through its own transformer pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", cat_transformer, cat_features),
        ("door", door_transformer, door_features),
        ("num", num_transformer, num_feature),
    ]
)

# Chain the preprocessing and the regressor into a single estimator, so that
# fit/score calls always apply the preprocessing first.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestRegressor()),
    ]
)

# Separate the feature columns from the target column
X = data.drop(columns="Price")
y = data["Price"]

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit the model on the training split and score it on the held-out split
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.22188417408787875

In [6]:
# Using grid search with the pipeline.
# Each key addresses a nested parameter as "<step>__<sub-step>__<param>":
# "preprocessor" and "model" are the step names of the outer pipeline, "num" is
# the transformer name inside the ColumnTransformer, and "imputer" is the step
# inside that transformer's pipeline.
pipe_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "model__n_estimators": [100, 1000],
    "model__max_depth": [None, 5],
    # "auto" was deprecated in scikit-learn 1.1 and removed in 1.3, where it
    # raises an invalid-parameter error. For a regressor it meant "consider
    # all features", which 1.0 states explicitly and works on old and new
    # versions alike.
    "model__max_features": [1.0, "sqrt"],
    "model__min_samples_split": [2, 4],
}

# 2 * 2 * 2 * 2 * 2 = 32 candidates, each cross-validated on 5 folds (160 fits);
# verbose=2 prints one line per fit.
gs_model = GridSearchCV(model, pipe_grid, cv=5, verbose=2)
gs_model.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.4s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.5s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.4s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samples_split=2, model__n_estimators=100, preprocessor__num__imputer__strategy=mean; total time=   0.3s
[CV] END model__max_depth=None, model__max_features=auto, model__min_samp

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('imputer ',
                                                                                          SimpleImputer(fill_value='missing',
                                                                                                        strategy='constant')),
                                                                                         ('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['Make',
                                                                          'Colour']),
                                                                        ('door',
        

In [7]:
# Score the best model.
# The search above was created with the default refit behaviour, so this
# evaluates the best pipeline it found on the held-out test split.
gs_model.score(X_test, y_test)

0.292308819012865