In [71]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [8]:
df = pd.read_csv(r"C:\Users\uk51002689\OneDrive - Friedhelm Loh Group\Desktop\100\d-sklearn-pipelines\train.csv")

In [12]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [13]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# Let's Plan 

In [18]:
df.drop(columns=['PassengerId','Name','Cabin','Ticket'], inplace = True)

In [20]:
# Step 1 -> train/test/split

X_train,X_test,Y_train,Y_test = train_test_split(df.drop(columns=['Survived']),
                                                 df['Survived'],
                                                 test_size=0.25)

In [49]:
Y_train.head()


683    0
707    1
741    0
433    0
23     1
Name: Survived, dtype: int64

In [25]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
683,3,male,14.0,5,2,46.9,S
707,1,male,42.0,0,0,26.2875,S
741,1,male,36.0,1,0,78.85,S
433,3,male,17.0,0,0,7.125,S
23,1,male,28.0,0,0,35.5,S


In [29]:
#checking the missing values
X_train.isnull().sum()

Pclass        0
Sex           0
Age         132
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [32]:
# imputation transformer for filling up the missing values

transformer1 = ColumnTransformer([
    ('agefiller', SimpleImputer(),[2]),
    ('embarkfiller', SimpleImputer(strategy = 'most_frequent'),[6])
    ], remainder = 'passthrough')

# all the missing values are now filled using SimpleImputer methord


In [59]:
# applying one hot encoding on sex, embarked column because both of the data is categorical 

transformer2 = ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

In [60]:
# Appplying Min max scaler to scale the values

transformer3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
], )

In [62]:
# Feature selection
transformer4 = SelectKBest(score_func=chi2,k=8)

In [63]:
# train the model
transformer5 = DecisionTreeClassifier()

# Create Pipeline


In [64]:
pipe = Pipeline([
    ('transformer1',transformer1),
    ('transformer2',transformer2),
    ('transformer3',transformer3),
    ('transformer4',transformer4),
    ('transformer5',transformer5)
])

In [67]:
# Display Pipeline

from sklearn import set_config
set_config(display='diagram')

In [66]:
# train
pipe.fit(X_train,Y_train)

# Exploring the Pipe features


In [68]:
pipe.named_steps

{'transformer1': ColumnTransformer(remainder='passthrough',
                   transformers=[('agefiller', SimpleImputer(), [2]),
                                 ('embarkfiller',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'transformer2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [1, 6])]),
 'transformer3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'transformer4': SelectKBest(k=8, score_func=<function chi2 at 0x0000014267C10DC0>),
 'transformer5': DecisionTreeClassifier()}

In [69]:
y_pred = pipe.predict(X_test)

In [70]:
y_pred

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0], dtype=int64)

In [73]:
accuracy_score(Y_test,y_pred)

0.6860986547085202

In [80]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, Y_train, cv=20, scoring='accuracy').mean()

0.6196969696969696

# export 


In [82]:
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))