In [46]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier

In [47]:
from google.colab import drive
drive.mount('/content/drive')
titanic = '/content/drive/MyDrive/ML_Files/titanic_dataset.csv'
df = pd.read_csv(titanic)
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [48]:
df.drop( columns = ['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [49]:
# flow - t_t_split,

In [50]:
x_train, x_test, y_train, y_test = train_test_split( df.drop( columns = ['Survived']),
                                   df['Survived'], test_size = 0.2, random_state = 42)

In [51]:
# print( x_train,"\n",  x_test, y_train, y_test)
x_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5000,S
733,2,male,23.0,0,0,13.0000,S
382,3,male,32.0,0,0,7.9250,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.2750,S
...,...,...,...,...,...,...,...
106,3,female,21.0,0,0,7.6500,S
270,1,male,,0,0,31.0000,S
860,3,male,41.0,2,0,14.1083,S
435,1,female,14.0,1,2,120.0000,S


In [52]:
# NOTE- rather than calling col by name, call it by index in column transformer becz the o/p
# is a numpy array

In [53]:
# Imputation transformer - handling Missing data
trf1 = ColumnTransformer(
    [
        ('impute_age', SimpleImputer(), [2]),     # we wrote [2] rather than ['Age']
        ('impute_imbarked', SimpleImputer(strategy='most_frequent'), [6]),
    ], remainder = 'passthrough')

In [54]:
# one hot encoding
trf2 = ColumnTransformer(
    [
        ('ohe_sex_embarked', OneHotEncoder( sparse=False, handle_unknown='ignore'), [1,6])
    ], remainder = 'passthrough')

In [55]:
# now we have 10 cols - 7 - 2+ 5 (2 from sex, 3 from embarked)
# here +5 and not +3 becz we have

In [56]:
# scaling
trf3 = ColumnTransformer(
    [
        ('scale', MinMaxScaler(), slice(0,10)) # slice - 0 se 10 tak
    ]
)

In [57]:
# feature selection
trf4 = SelectKBest( score_func = chi2, k=8)
# no. of fisrt k cols to select

In [58]:
# traning model
trf5 = DecisionTreeClassifier()

Creating The Pipeline

In [59]:
pipe = Pipeline([
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3),
    ('trf4', trf4),
    ('trf5', trf5)
])

In [60]:
# alternate code - make_pipeline
# pipe = make_pipeline(trf1, trf2, trf3, trf4, trf5)

In [61]:
from sklearn import set_config
set_config(display='diagram')

In [62]:
pipe.fit(x_train, y_train)



In [63]:
pipe.named_steps
# returns dictionary , key- name of transf(thats why we used m1. -> line [14] ). val - working

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_imbarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'trf5': DecisionTreeClassifier()}

In [64]:
pipe.named_steps['trf1']

In [65]:
# predicting based on x_test
y_predicted = pipe.predict(x_test)
y_predicted

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0])

In [66]:
# comparing it with original output
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predicted)

0.6256983240223464

In [67]:
# export using pickle
import pickle
pickle.dump(pipe,  open('pipe.pkl', 'wb'))