In [1]:
import pandas as pd
import numpy as np


In [27]:
# Read ./train.csv into a DataFrame and show summary statistics for it.
df = pd.read_csv("./train.csv")
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [3]:
# Count the missing values in each column (column-wise sum of the NaN mask).
df.isna().sum(axis=0)

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [28]:
# Discard columns that are not used as features further down.
# `columns=` already identifies the axis, so no separate `axis=1` is passed.
df = df.drop(columns=['Name', 'Ticket', 'Fare', 'Cabin'])

In [29]:
# Keep only the modelling columns, in a fixed order. The ColumnTransformers
# defined later address columns by position (e.g. 0 -> 'Age', 1 -> 'Embarked'),
# so changing this order would silently change what they operate on.
selected_columns = ['Age', 'Embarked', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Survived']
df = df[selected_columns]

In [30]:
from sklearn.model_selection import train_test_split

# Separate the features from the 'Survived' target, then hold out 25% of the
# rows as a test set. The fixed random_state makes the split repeatable.
features = df.drop(columns=['Survived'])
target = df['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=111,
)


In [32]:
# Missing values per feature in the training split; these are the gaps the
# imputation step has to fill.
X_train.isna().sum(axis=0)

Unnamed: 0,0
Age,125
Embarked,2
Pclass,0
Sex,0
SibSp,0
Parch,0


In [37]:
# Preview five random training rows. The seed is fixed so that a
# Restart & Run All shows the same rows every time.
X_train.sample(5, random_state=111)

Unnamed: 0,Age,Embarked,Pclass,Sex,SibSp,Parch
123,32.5,S,2,female,0,0
604,35.0,C,1,male,0,0
785,25.0,S,3,male,0,0
658,23.0,S,2,male,0,0
297,2.0,S,1,female,1,2


In [50]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest,chi2
from sklearn import set_config

## Simple Imputer

In [38]:
# Step 1 - imputation. Columns are addressed by position in X_train:
# 0 is 'Age' (filled with the mean) and 1 is 'Embarked' (filled with the
# most frequent value). The remaining columns are passed through untouched
# and are appended after the transformed ones.
trf1 = ColumnTransformer(
    [
        # Step name corrected from 'impute,age' (stray comma) so that it
        # follows the same naming scheme as 'impute_embark'.
        ('impute_age', SimpleImputer(strategy='mean'), [0]),
        ('impute_embark', SimpleImputer(strategy='most_frequent'), [1]),
    ],
    remainder='passthrough',
)

## One-hot encoding (OHE) of the Embarked and Sex columns

In [39]:
# Step 2 - one-hot encode the categorical columns. In the array produced by
# trf1, position 1 holds 'Embarked' and position 3 holds 'Sex'.
# handle_unknown='ignore' keeps transform from failing on unseen categories.
trf2 = ColumnTransformer(
    transformers=[
        (
            'ohe',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            [1, 3],
        ),
    ],
    remainder='passthrough',
)

## Scaling all the columns

In [40]:
# Step 3 - min-max scaling. slice(0, 10) selects the first ten column
# positions; any column beyond that range would be passed through unscaled.
# NOTE(review): the bound 10 is hard-coded - confirm it still covers every
# column if the encoded feature set grows.
trf3 = ColumnTransformer(
    transformers=[('scaling', MinMaxScaler(), slice(0, 10))],
    remainder='passthrough',
)

## Feature Selection

In [51]:
# Step 4 - keep the 8 features with the highest chi-squared score.
trf4 = SelectKBest(score_func=chi2, k=8)

## Classifier

In [52]:
# Step 5 - the final estimator. A decision tree breaks ties between equally
# good splits at random, so the seed is fixed to make the fitted tree (and
# therefore the accuracy and cross-validation scores) reproducible.
trf5 = DecisionTreeClassifier(random_state=111)

In [53]:
# Chain the preprocessing steps and the classifier; each step receives the
# output of the one before it.
pipe = Pipeline(
    steps=[
        ('trf1', trf1),  # imputation
        ('trf2', trf2),  # one-hot encoding
        ('trf3', trf3),  # scaling
        ('trf4', trf4),  # feature selection
        ('trf5', trf5),  # classifier
    ]
)

## Show the flowchart of the pipeline

In [54]:
# Ask scikit-learn to render estimators as a diagram, then display the pipeline.
set_config(display="diagram")
pipe

In [55]:
# Fit every step of the pipeline, in order, on the training split.
pipe.fit(X=X_train, y=y_train)

## Note
- If the classifier is not part of the pipeline (i.e. the pipeline contains only transformers), then `fit_transform` is called on the training data instead of `fit`.

In [56]:
# Push the held-out rows through the fitted pipeline to get predicted labels.
y_pred = pipe.predict(X=X_test)

In [57]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.820627802690583

## Steps in the Pipeline

In [47]:
# Look up the pipeline's steps by the names given when it was built.
# NOTE(review): the saved output below lists only four steps and labels the
# classifier 'trf4', while the pipeline defined above has five steps; this
# cell's execution count (47) also predates the pipeline cell (53). Re-run
# the notebook top to bottom to refresh the output.
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute,age', SimpleImputer(), [0]),
                                 ('impute_embark',
                                  SimpleImputer(strategy='most_frequent'),
                                  [1])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 3])]),
 'trf3': ColumnTransformer(remainder='passthrough',
                   transformers=[('scaling', MinMaxScaler(),
                                  slice(0, 10, None))]),
 'trf4': DecisionTreeClassifier()}

## Cross-validation using the Pipeline

In [74]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of the whole pipeline on the training
# split. Passing the pipeline (not just the classifier) means the
# preprocessing steps are refit inside every fold.
score = cross_val_score(
    pipe,
    X_train,
    y_train,
    scoring='accuracy',
    cv=10,
)

score

array([0.82089552, 0.76119403, 0.79104478, 0.80597015, 0.79104478,
       0.74626866, 0.89552239, 0.74626866, 0.75757576, 0.75757576])