In [39]:
import pandas as pd
import seaborn as sns
# Cap rendered DataFrames at 6 rows so the cell outputs below stay short.
pd.options.display.max_rows = 6

### Load Titanic Data


In [40]:
# Load seaborn's bundled "titanic" example dataset; every later cell selects its
# feature and label columns from this frame.
titanic = sns.load_dataset("titanic")
# Bare last expression: render the frame as this cell's output.
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.7500,Q,Third,man,True,,Queenstown,no,True


## Implementing sklearn pipelines

### 1. Features with same datatype

In [43]:
# Keep two predictors (sex, pclass) together with the target column.
columns_to_include = ["sex", "pclass", "survived"]
data = titanic.loc[:, columns_to_include]

# Separate the target (y) from the predictors (X).
y = data["survived"]
X = data.drop(columns=["survived"])

In [42]:
# Show the selected columns to confirm the subset (bare last expression -> cell output).
data

Unnamed: 0,sex,pclass,survived
0,male,3,0
1,female,1,1
2,female,3,1
...,...,...,...
888,female,3,0
889,male,1,1
890,male,3,0


In [44]:
# necessary imports
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Pipeline stages, applied in order: mode imputation -> one-hot encoding -> KNN classifier.
clf_steps = [
    ("ModeImputation", SimpleImputer(strategy="most_frequent")),
    ("Encoding", OneHotEncoder()),
    ("KNN", KNeighborsClassifier()),
]

# Chain the stages into a single estimator so preprocessing and model are fitted together.
clf_pipeline = Pipeline(steps=clf_steps)

# Run 5-fold cross-validation on the whole pipeline and report the mean of the fold scores.
cross_val_score(estimator=clf_pipeline, X=X, y=y, cv=5).mean()

0.7867993220764548

### 2. Columns with Mixed Data Type

In [45]:
# Add the numeric `age` column to the categorical predictors; `age` has missing
# values in this dataset, so the numeric preprocessing below has to impute it.
columns_to_include = ['sex', 'pclass', 'age', 'survived']
data = titanic[columns_to_include]

# split features and labels
X = data.drop("survived", axis=1)
y = data["survived"]

In [46]:
# Show the mixed-type subset (sex, pclass, age, survived) as this cell's output.
data

Unnamed: 0,sex,pclass,age,survived
0,male,3,22.0,0
1,female,1,38.0,1
2,female,3,26.0,1
...,...,...,...,...
888,female,3,,0
889,male,1,26.0,1
890,male,3,32.0,0


In [47]:
# necessary imports
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Numeric branch: fill missing values with the column mean, then min-max scale.
num_feature_preprocessing = Pipeline(
    steps=[
        ("MeanImputation", SimpleImputer(strategy="mean")),
        ("Scaler", MinMaxScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent value, then one-hot encode.
cat_feature_preprocessing = Pipeline(
    steps=[
        ("ModeImputation", SimpleImputer(strategy="most_frequent")),
        ("Encoding", OneHotEncoder()),
    ]
)

# Route each branch to its columns; columns not listed are passed through unchanged.
pre_processing = ColumnTransformer(
    transformers=[
        ("NumFeaturesPreprocessing", num_feature_preprocessing, ["age"]),
        ("CatFeaturesPreprocessing", cat_feature_preprocessing, ["sex", "pclass"]),
    ],
    remainder="passthrough",
)

# Classification pipeline: column-wise preprocessing followed by the KNN classifier.
clf_steps = [
    ("PreProcessing", pre_processing),
    ("KNN", KNeighborsClassifier()),
]

# Build the full estimator from the stages above.
clf_pipeline = Pipeline(steps=clf_steps)

# Run 5-fold cross-validation on the whole pipeline and report the mean of the fold scores.
cross_val_score(estimator=clf_pipeline, X=X, y=y, cv=5).mean()

0.8014123407193523

### 3. Add custom transformation 

Some transformations, such as a log transformation, are not available as dedicated sklearn transformers. We can create a custom transformation with `FunctionTransformer()`, as illustrated in the numeric features preprocessing pipeline in the code below.

In [48]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import numpy as np

# Numeric branch: mean-impute, then apply a base-10 logarithm through FunctionTransformer,
# which wraps a plain function (here np.log10) as a pipeline step.
num_feature_preprocessing = Pipeline(
    steps=[
        ("MeanImputation", SimpleImputer(strategy="mean")),
        ("Log", FunctionTransformer(np.log10)),
    ]
)

# Categorical branch: fill missing values with the most frequent value, then one-hot encode.
cat_feature_preprocessing = Pipeline(
    steps=[
        ("ModeImputation", SimpleImputer(strategy="most_frequent")),
        ("Encoding", OneHotEncoder()),
    ]
)

# Route each branch to its columns; columns not listed are passed through unchanged.
pre_processing = ColumnTransformer(
    transformers=[
        ("NumFeaturesPreprocessing", num_feature_preprocessing, ["age"]),
        ("CatFeaturesPreprocessing", cat_feature_preprocessing, ["sex", "pclass"]),
    ],
    remainder="passthrough",
)

# Classification pipeline: column-wise preprocessing followed by the KNN classifier.
clf_steps = [
    ("PreProcessing", pre_processing),
    ("KNN", KNeighborsClassifier()),
]

# Build the full estimator from the stages above.
clf_pipeline = Pipeline(steps=clf_steps)

# Run 5-fold cross-validation on the whole pipeline and report the mean of the fold scores.
cross_val_score(estimator=clf_pipeline, X=X, y=y, cv=5).mean()

0.8036281463812692

### Grid Search 

We can also optimize the choice of pipeline steps (e.g. which encoder to use) as well as the model hyperparameters using `GridSearchCV()`.

In [49]:
# necessary imports
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import numpy as np

# Numeric branch: fill missing values with the column mean, then min-max scale.
num_feature_preprocessing = Pipeline(
    steps=[
        ("MeanImputation", SimpleImputer(strategy="mean")),
        ("Scaler", MinMaxScaler()),
    ]
)

# Categorical branch: mode-impute, then encode. The "Encoding" step is the one the
# grid search swaps out below.
cat_feature_preprocessing = Pipeline(
    steps=[
        ("ModeImputation", SimpleImputer(strategy="most_frequent")),
        ("Encoding", OneHotEncoder()),
    ]
)

# Route each branch to its columns; columns not listed are passed through unchanged.
pre_processing = ColumnTransformer(
    transformers=[
        ("NumFeaturesPreprocessing", num_feature_preprocessing, ["age"]),
        ("CatFeaturesPreprocessing", cat_feature_preprocessing, ["sex", "pclass"]),
    ],
    remainder="passthrough",
)

# Classification pipeline: column-wise preprocessing followed by the KNN classifier.
clf_steps = [
    ("PreProcessing", pre_processing),
    ("KNN", KNeighborsClassifier()),
]

# Build the full estimator from the stages above.
clf_pipeline = Pipeline(steps=clf_steps)

# Search space. Keys address nested steps as <step>__<sub-step>__<name>:
#   - the categorical encoder itself (one-hot vs. ordinal), and
#   - the odd neighbor counts 1, 3, ..., 49 for KNN.
param_grid = {
    "PreProcessing__CatFeaturesPreprocessing__Encoding": [OneHotEncoder(), OrdinalEncoder()],
    "KNN__n_neighbors": np.arange(1, 50, 2),
}

# Exhaustive grid search, scoring each combination with 5-fold cross-validation.
grid_search = GridSearchCV(estimator=clf_pipeline, param_grid=param_grid, cv=5)

# Fit the search, then show the best-scoring parameter combination.
grid_search.fit(X, y)
grid_search.best_params_

{'KNN__n_neighbors': 11,
 'PreProcessing__CatFeaturesPreprocessing__Encoding': OneHotEncoder()}

In [50]:
# Mean cross-validated score achieved by the best parameter combination found above.
grid_search.best_score_

0.8024982738057874