In [18]:
import numpy as np
import pandas as pd

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier

In [20]:
df = pd.read_csv("../train.csv")
df.head(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S


### Dropping unnecessary cols

In [21]:
df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], inplace=True)

df.head(1)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S


### Train Test split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["Survived"]),
    df["Survived"],
    test_size=0.2,
    random_state=42
)

X_train.head(1)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S


In [23]:
y_train.sample(5)

734    0
829    1
116    0
635    1
576    1
Name: Survived, dtype: int64

### Creating transformers

In [24]:
# imputation transformer

trf1 = ColumnTransformer([
    # ("impute_age", SimpleImputer(), ["Age"])
    ("impute_age", SimpleImputer(), [2]), # didn't give name because in a pipeline when the data move from one tranformer to other it is not a dataframe but a numpy array
    ("impute_embarked", SimpleImputer(strategy="most_frequent"), [6])
], remainder="passthrough") # remainder passthrough to retain other columns when passed further in the pipeline 

In [25]:
# one hot encoder transformer
trf2 = ColumnTransformer([
    ("one_hot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), [1, 3])
], remainder="passthrough")

In [26]:
# scaling transformer

trf3 = ColumnTransformer([
    ("scale", MinMaxScaler(), slice(0, 10)) # applying scaling to all the 10 columns produced after the previous two transformers
], remainder="passthrough")

In [27]:
# feature selection

trf4 = SelectKBest(chi2, k=5)

In [28]:
# train the model

trf5 = DecisionTreeClassifier()

### Create a pipeline having all these transformers

In [29]:
pipe = Pipeline([
    ("trf1", trf1),
    ("trf2", trf2),
    ("trf3", trf3),
    ("trf4", trf4),
    ("trf5", trf5)
])

# alternate way to create pipeline (using make_pipeline)

# pipe = make_pipeline(trf1, trf2, trf3, trf4, trf5)

# Pipeline requires naming of steps, make_pipeline does not
# Same for ColumnTransformer vs make_column_transformer
# if using make_column_transformer you will not have to give the name

In [30]:
pipe.fit(X_train, y_train)

0,1,2
,steps,"[('trf1', ...), ('trf2', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('impute_age', ...), ('impute_embarked', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('one_hot', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,transformers,"[('scale', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,score_func,<function chi...0024CD8422200>
,k,5

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


### Predict

In [31]:
y_pred = pipe.predict(X_test)
y_pred

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 1])

In [32]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.8044692737430168

### Exporting the pipeline to use in production code

In [33]:
import pickle
pickle.dump(pipe, open("pipe.pkl", "wb"))