In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2

In [2]:
df = pd.read_csv('titanic.csv')

In [3]:
df.drop(['PassengerId','Cabin','Name','Ticket'],axis=1,inplace=True)

In [57]:
X_train,X_test,y_train,y_test = train_test_split(df.drop('Survived',axis=1),df['Survived'],test_size=0.2)
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
321,3,male,27.0,0,0,7.8958,S
441,3,male,20.0,0,0,9.5000,S
762,3,male,20.0,0,0,7.2292,C
660,1,male,50.0,2,0,133.6500,S
281,3,male,28.0,0,0,7.8542,S
...,...,...,...,...,...,...,...
293,3,female,24.0,0,0,8.8500,S
796,1,female,49.0,0,0,25.9292,S
637,2,male,31.0,1,1,26.2500,S
590,3,male,35.0,0,0,7.1250,S



## Imputing the columns

In [36]:
tnf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')

## Encoding the categorical Data

In [61]:
tnf2 = ColumnTransformer([('ohe_sex_embarked',OneHotEncoder(sparse=False,handle_unknown='ignore'),[1,6])],remainder='passthrough')

## Scaling the data

In [38]:
tnf3 = ColumnTransformer([('scale',MinMaxScaler(),slice(1,10))])

## Feature Selection

In [39]:
tnf4 = SelectKBest(score_func=chi2,k=8)

## Decision Tree Classifier

In [40]:
tnf5 = DecisionTreeClassifier()

# Making the Pipeline Object

In [41]:
pipe = Pipeline([
    (tnf1,'tnf1'),
     (tnf2,'tnf2'),
     (tnf3,'tnf3'),
     (tnf4,'tnf4'),
     (tnf5,'tnf5'),
]
)

## we can also use 'make_pipe'

In [62]:
pipe = make_pipeline(tnf1,tnf2,tnf3,tnf4,tnf5)

In [42]:
from sklearn import set_config 
set_config(display='diagram')

In [74]:
pipe.fit(X_train,y_train)

In [33]:
pipe.named_steps

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(sparse=False), [1, 6])]),
 'columntransformer-3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(1, 10, None))]),
 'selectkbest': SelectKBest(k=8, score_func=<function chi2 at 0x000002415D47D300>),
 'decisiontreeclassifier': DecisionTreeClassifier()}

In [55]:
pipe.named_steps['columntransformer-1'].transformers_[1][1].statistics_

array(['S'], dtype=object)

In [65]:
y_pred = pipe.predict(X_test)

In [66]:
y_pred

array([0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0], dtype=int64)

# Cross-Validation using Pipeline

In [68]:
from sklearn.model_selection import cross_val_score

In [73]:
cross_val_score(pipe,X_train,y_train,cv=5,scoring='accuracy').mean()

0.6263961390721954

In [72]:
import warnings
warnings.filterwarnings('ignore')

# Grid-Search using Pipeline

In [75]:
from sklearn.model_selection import GridSearchCV

In [79]:
params = {'decisiontreeclassifier__max_depth':[1,2,3,4,5,None]}

In [80]:
grid = GridSearchCV(pipe,params,cv=5,scoring='accuracy')
grid.fit(X_train,y_train)

In [81]:
grid.best_score_

0.6263961390721954

In [82]:
grid.best_params_

{'decisiontreeclassifier__max_depth': 4}

# Exporting the pipeline as pickle file

In [83]:
import pickle

In [85]:
pickle.dump(pipe,open('pipeline-1.pickle','wb'))