In [1]:
import pandas as pd
import numpy as np
from google.colab import drive

from sklearn.model_selection import train_test_split as TTS
from sklearn.impute import SimpleImputer as SI
from sklearn.preprocessing import OrdinalEncoder as OE
from sklearn.preprocessing import OneHotEncoder as OHE
from sklearn.tree import DecisionTreeClassifier as DTC

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
tt = pd.read_csv('/content/drive/MyDrive/Datasets/titanic.csv')
tt

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


From very first, we are going to preprocess our titanic dataset

Step 1. remove the unnecessary columns from titanic dataset

In [4]:
tt.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)
tt.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [5]:
# step 2 -> train test split

trainx, testx, trainy, testy = TTS(tt.iloc[:, 1:], tt.iloc[:, 0], test_size = 0.2, random_state = 1)

In [6]:
trainx

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
301,3,male,,2,0,23.2500,Q
309,1,female,30.0,0,0,56.9292,C
516,2,female,34.0,0,0,10.5000,S
120,2,male,21.0,2,0,73.5000,S
570,2,male,62.0,0,0,10.5000,S
...,...,...,...,...,...,...,...
715,3,male,19.0,0,0,7.6500,S
767,3,female,30.5,0,0,7.7500,Q
72,2,male,21.0,0,0,73.5000,S
235,3,female,,0,0,7.5500,S


In [7]:
# here we fill the missing values in the Age and Embarked column using SimpleImputer

siage = SI() # imputer class for Age column
siembarked = SI(strategy='most_frequent') # imputer class for Embarked column

trainx_age = siage.fit_transform(trainx[['Age']])
trainx_embarked = siembarked.fit_transform(trainx[['Embarked']])

testx_age = siage.transform(testx[['Age']])
testx_embarked = siembarked.transform(testx[['Embarked']])


In [8]:
# now we are going to implement OneHotEnocder on Sex and Embarked columns

ohe_sex = OHE(dtype = "int32")
ohe_embarked = OHE(dtype = "int32")

trainx_sex = ohe_sex.fit_transform(trainx[['Sex']])
trainx_embarked = ohe_embarked.fit_transform(trainx[['Embarked']])

testx_sex = ohe_sex.transform(testx[['Sex']])
testx_embarked = ohe_embarked.transform(testx[['Embarked']])

trainx_sex = trainx_sex.toarray()
trainx_embarked = trainx_embarked.toarray()

testx_sex = testx_sex.toarray()
testx_embarked = testx_embarked.toarray()

In [9]:
trainx_sex.shape

(712, 2)

In [10]:
tt.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [11]:
# now we'll drop Sex, Age and Embarked columns from trainx and testx, because, we have thrre array of Age, Sex and Embarked which are encoded and imputeded

trainx_remain = trainx.drop(columns = ['Sex', 'Age', 'Embarked'])
testx_remain = testx.drop(columns = ['Sex', 'Age', 'Embarked'])

In [12]:
trainx_remain

Unnamed: 0,Pclass,SibSp,Parch,Fare
301,3,2,0,23.2500
309,1,0,0,56.9292
516,2,0,0,10.5000
120,2,2,0,73.5000
570,2,0,0,10.5000
...,...,...,...,...
715,3,0,0,7.6500
767,3,0,0,7.7500
72,2,0,0,73.5000
235,3,0,0,7.5500


In [13]:
trainx_transform = np.concatenate((trainx_remain, trainx_sex, trainx_age, trainx_embarked), axis = 1)
testx_transform = np.concatenate((testx_remain, testx_sex, testx_age, testx_embarked), axis = 1)

In [14]:
trainx_transform

array([[3., 2., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [2., 0., 0., ..., 0., 1., 0.],
       ...,
       [2., 0., 0., ..., 0., 1., 0.],
       [3., 0., 0., ..., 0., 1., 0.],
       [3., 0., 0., ..., 0., 1., 0.]])

In [15]:
trainx_transform.shape

(712, 11)

In [16]:
testy

Unnamed: 0,Survived
862,1
223,0
84,1
680,0
535,1
...,...
796,1
815,0
629,0
421,0


In [17]:
# model training and testing

dtc = DTC(random_state = 1)
dtc.fit(trainx_transform, trainy)
dtc_predict = dtc.predict(testx_transform)

dtc_predict

array([1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 1])

In [18]:
from sklearn.metrics import accuracy_score
accuracy_score(testy, dtc_predict)

0.7541899441340782

In [19]:
import pickle as pk

In [20]:
pk.dump(ohe_sex, open('ohe_sex.pkl', 'wb'))
pk.dump(ohe_embarked, open('ohe_embarked.pkl', 'wb'))
pk.dump(dtc, open('model.pkl', 'wb'))