## <b>Sklearn Pipelines</b>

Pipelines chain together multiple steps so that output of one stage is used as the input of the next stage.\
Pipelines makes it easy to apply the same preprecessing to test and split.

### Without using the pipelines

In [380]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [381]:
titanic=sns.load_dataset('titanic')

In [382]:
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [383]:
titanic=titanic[['survived','pclass','sex','age','sibsp','parch','fare','embarked']]

In [384]:
titanic.isnull().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
dtype: int64

In [385]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [386]:
x_train,x_test,y_train,y_test=train_test_split(titanic.drop(columns=['survived']),titanic['survived'],test_size=0.2,random_state=2)

In [387]:
x_test.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
707,1,male,42.0,0,0,26.2875,S
37,3,male,21.0,0,0,8.05,S
615,2,female,24.0,1,2,65.0,S
169,3,male,28.0,0,0,56.4958,S
68,3,female,17.0,4,2,7.925,S


In [388]:
x_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
30,1,male,40.0,0,0,27.7208,C
10,3,female,4.0,1,1,16.7,S
873,3,male,47.0,0,0,9.0,S
182,3,male,9.0,4,2,31.3875,S
876,3,male,20.0,0,0,9.8458,S


In [389]:
si_age=SimpleImputer()
si_embarked=SimpleImputer(strategy='most_frequent')
train_age=si_age.fit_transform(x_train[['age']])
train_embarked=si_embarked.fit_transform(x_train[['embarked']])
test_age=si_age.fit_transform(x_test[['age']])
test_embarked=si_embarked.fit_transform(x_test[['embarked']])

#titanic['age']=age.ravel()
#titanic['embarked']=embarked.ravel()ravel is just like doing [:,0]

In [390]:
titanic['embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [391]:
#one hot encoding
ohe_sex=OneHotEncoder(drop='first',handle_unknown='ignore')
ohe_embarked=OneHotEncoder(drop='first',handle_unknown='ignore')
train_sex=ohe_sex.fit(x_train[['sex']])
train_embarked=ohe_embarked.fit(x_train[['embarked']])
train_sex=ohe_sex.transform(x_train[['sex']]).toarray()
train_embarked=ohe_embarked.transform(x_train[['embarked']]).toarray()
test_sex=ohe_sex.transform(x_test[['sex']]).toarray()
test_embarked=ohe_embarked.transform(x_test[['embarked']]).toarray()


In [392]:
train_rem=x_train.drop(columns=['age','embarked','sex'])
test_rem=x_test.drop(columns=['age','embarked','sex'])

In [393]:
train_transfomed=np.concatenate((train_age,train_embarked,train_sex,train_rem),axis=1)
test_transfomed=np.concatenate((test_age,test_embarked,test_sex,test_rem),axis=1)

In [394]:
train_transfomed.shape

(712, 9)

In [395]:
test_transfomed.shape

(179, 9)

In [396]:
clf=DecisionTreeClassifier()
clf.fit(train_transfomed,y_train)

In [397]:
y_pred=clf.predict(test_transfomed)

In [398]:
from sklearn.metrics import accuracy_score,f1_score

In [399]:
accuracy_score(y_test,y_pred)

0.7877094972067039

In [400]:
f1_score(y_test,y_pred)

0.75

In [401]:
import pickle


In [402]:
#pickle.dump(ohe_sex,open('ohe_sex.pkl','wb'))

#pickle.dump(ohe_embarked,open('ohe_embarked.pkl','wb'))
#pickle.dump(clf,open('clf.pkl','wb'))

### USING SKLEARN PIPELINES



In [403]:
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.compose import ColumnTransformer

In [404]:
x_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
30,1,male,40.0,0,0,27.7208,C
10,3,female,4.0,1,1,16.7,S
873,3,male,47.0,0,0,9.0,S
182,3,male,9.0,4,2,31.3875,S
876,3,male,20.0,0,0,9.8458,S


In [405]:
y_train.head()

30     0
10     1
873    0
182    0
876    0
Name: survived, dtype: int64

In [406]:
#inputer
tf1=ColumnTransformer(transformers=[
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')


In [407]:
#one hot encoding
tf2=ColumnTransformer(transformers=
                      [
                          ('ohe',OneHotEncoder(drop='first',handle_unknown='ignore',sparse_output=False),[1,6]),
                      ],remainder='passthrough')


In [408]:
#scaling
tf3=ColumnTransformer(transformers=[
    ('scale',MinMaxScaler(),slice(0,9))
])

In [409]:
#feature selction
tf4=SelectKBest(score_func=chi2,k=5)

In [410]:
#train the model
tf5=DecisionTreeClassifier()

### Create Pipeline

In [411]:
#alternate synatx
pipe=make_pipeline(tf1,tf2,tf3,tf4,tf5)

### pipeline vs make_pipeline
make_pipeline does not need the name of the each steps

In [412]:
pipe.fit(x_train,y_train)

In [413]:
pipe.named_steps

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe',
                                  OneHotEncoder(drop='first',
                                                handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'columntransformer-3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 9, None))]),
 'selectkbest': SelectKBest(k=5, score_func=<function chi2 at 0x0000026C2167BA60>),
 'decisiontreeclassifier': DecisionTreeClassifier()}

In [414]:
pipe.named_steps['columntransformer-1'].named_transformers_['impute_age'].statistics_

array([29.78590426])

In [415]:
pipe.named_steps['columntransformer-1'].named_transformers_['impute_embarked'].statistics_

array(['S'], dtype=object)

In [416]:
y_pred=pipe.predict(x_test)



In [417]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0])

In [418]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.5586592178770949

In [420]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe,x_train,y_train,cv=5,scoring='accuracy').mean()



np.float64(0.6306214911848714)

In [421]:
pickle.dump(pipe,open('pipe.pll','wb'))