# Without Pipeline

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split


In [2]:
df = pd.read_csv('titanic.csv')

In [3]:
df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
407,408,1,2,"Richards, Master. William Rowe",male,3.0,1,1,29106,18.75,,S
234,235,0,2,"Leyson, Mr. Robert William Norman",male,24.0,0,0,C.A. 29566,10.5,,S
790,791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q
724,725,1,1,"Chambers, Mr. Norman Campbell",male,27.0,1,0,113806,53.1,E8,S
667,668,0,3,"Rommetvedt, Mr. Knud Paust",male,,0,0,312993,7.775,,S


In [4]:
df.drop(columns=['PassengerId','Name','Ticket','Fare','Cabin'],inplace=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Embarked  889 non-null    object 
dtypes: float64(1), int64(4), object(2)
memory usage: 48.9+ KB


In [6]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Embarked      2
dtype: int64

In [7]:
df


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,male,35.0,0,0,S
...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,S
887,1,1,female,19.0,0,0,S
888,0,3,female,,1,2,S
889,1,1,male,26.0,0,0,C


In [8]:
df['family'] = df['SibSp'] + df['Parch']

In [9]:
df.drop(columns=['SibSp' , 'Parch'],inplace=True)

In [10]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked,family
0,0,3,male,22.0,S,1
1,1,1,female,38.0,C,1
2,1,3,female,26.0,S,0
3,1,1,female,35.0,S,1
4,0,3,male,35.0,S,0
...,...,...,...,...,...,...
886,0,2,male,27.0,S,0
887,1,1,female,19.0,S,0
888,0,3,female,,S,3
889,1,1,male,26.0,C,0


In [11]:
X_train , X_test , y_train ,y_test = train_test_split(df.drop(columns=['Survived']),df['Survived'],test_size=0.3)

In [12]:
si_age = SimpleImputer()
si_embarked = SimpleImputer(strategy='most_frequent')

X_train_age = si_age.fit_transform(X_train[['Age']])
X_train_embarked = si_embarked.fit_transform(X_train[['Embarked']])

X_test_age = si_age.transform(X_test[['Age']])
X_test_embarked = si_embarked.transform(X_test[['Embarked']])


In [13]:
X_train.shape

(623, 5)

In [14]:
X_train.isnull().sum()

Pclass        0
Sex           0
Age         132
Embarked      2
family        0
dtype: int64

In [15]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,Embarked,family
135,2,male,23.0,C,0
86,3,male,16.0,S,4
781,1,female,17.0,S,1
786,3,female,18.0,S,0
120,2,male,21.0,S,2


In [16]:
ohe_sex = OneHotEncoder(drop='first' , sparse_output=False , handle_unknown='ignore')
ohe_embarked = OneHotEncoder(drop='first',sparse_output=False , handle_unknown='ignore')

X_train_sex = ohe_sex.fit_transform(X_train[['Sex']])
X_train_embarked = ohe_embarked.fit_transform(X_train[['Embarked']])

X_test_sex = ohe_sex.transform(X_test[['Sex']])
X_test_embarked = ohe_embarked.transform(X_test[['Embarked']])

In [17]:
X_train_embarked

array([[0., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [18]:
X_train

Unnamed: 0,Pclass,Sex,Age,Embarked,family
135,2,male,23.0,C,0
86,3,male,16.0,S,4
781,1,female,17.0,S,1
786,3,female,18.0,S,0
120,2,male,21.0,S,2
...,...,...,...,...,...
19,3,female,,C,0
446,2,female,13.0,S,1
612,3,female,,Q,1
211,2,female,35.0,S,0


In [19]:
X_train_rem = X_train.drop(columns=['Sex','Age','Embarked'])
X_test_rem = X_test.drop(columns=['Sex','Age','Embarked'])
X_train_embarked.shape


(623, 3)

In [20]:
X_train_tranformed = np.concatenate((X_train_rem,X_train_age,X_train_sex,X_train_embarked),axis=1)
X_test_tranformed = np.concatenate((X_test_rem,X_test_age,X_test_sex,X_test_embarked),axis=1)



In [21]:
X_train_tranformed.shape
X_test_tranformed.shape

(268, 7)

In [22]:
X_train_tranformed

array([[ 2.        ,  0.        , 23.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 3.        ,  4.        , 16.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        ,  1.        , 17.        , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 3.        ,  1.        , 29.39393075, ...,  1.        ,
         0.        ,  0.        ],
       [ 2.        ,  0.        , 35.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 2.        ,  0.        , 18.        , ...,  0.        ,
         1.        ,  0.        ]])

In [23]:
clf = DecisionTreeClassifier()
clf.fit(X_train_tranformed,y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [24]:
y_predict = clf.predict(X_test_tranformed)
y_predict

array([1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1])

In [25]:
from sklearn.metrics import accuracy_score

In [26]:
accuracy_score(y_test,y_predict)

0.7910447761194029

In [27]:
import pickle

In [28]:
pickle.dump(ohe_sex,open('models/ohe_sex.pkl','wb'))
pickle.dump(ohe_embarked,open('models/ohe_embarked.pkl','wb'))
pickle.dump(clf,open ('models/clf.pkl','wb'))


# With Pipelines and Column Transformer

In [59]:
from sklearn.pipeline import Pipeline

In [31]:
df = pd.read_csv('titanic.csv',usecols=['Survived','Pclass','Sex','Age','SibSp','Parch','Embarked'])

In [34]:
df['Family'] = df['SibSp'] + df['Parch']

In [35]:
df.drop(columns=['SibSp','Parch'],inplace=True)

In [37]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Embarked,Family
0,0,3,male,22.0,S,1
1,1,1,female,38.0,C,1
2,1,3,female,26.0,S,0
3,1,1,female,35.0,S,1
4,0,3,male,35.0,S,0


In [40]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Embarked      2
Family        0
dtype: int64

In [41]:
X_train , X_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),df['Survived'],test_size=0.2,random_state=0)

In [47]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,Embarked,Family
140,3,female,,C,2
439,2,male,31.0,S,0
817,2,male,31.0,C,2
378,3,male,20.0,C,0
491,3,male,21.0,S,0


In [42]:
from sklearn.compose import ColumnTransformer

In [72]:
# Imputation 

tnf1= ColumnTransformer(transformers=[
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[3])
],remainder='passthrough')

In [73]:
# one hot encoding

tnf2 = ColumnTransformer(transformers=[
    ('ohe_sex',OneHotEncoder(sparse_output=False,drop='first',handle_unknown='ignore'),[1]),
    ('ohe_embarked',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),[3])
],remainder='passthrough')

In [74]:
# scaler

tnf3= ColumnTransformer(transformers=[
    ('scaler',MinMaxScaler(),slice(0,7))
    
],remainder='passthrough')

In [75]:
tnf4 = DecisionTreeClassifier()

In [76]:
pipe = Pipeline([
    ('tnf1',tnf1),
    ('tnf2',tnf2),
    ('tnf3',tnf3),
    ('tnf4',tnf4)
])

In [83]:
pipe.fit(X_train,y_train)

0,1,2
,steps,"[('tnf1', ...), ('tnf2', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('impute_age', ...), ('impute_embarked', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('ohe_sex', ...), ('ohe_embarked', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,transformers,"[('scaler', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [80]:
y_predict = pipe.predict(X_test)

In [81]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(y_predict,y_test)

0.8100558659217877