In [2]:
#Loading Data and modules 
 
import numpy as np  
import pandas as pd 
import seaborn as sb 
import matplotlib.pyplot as plt 
from pylab import rcParams 
from sklearn import preprocessing 
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split 
from sklearn import metrics

In [8]:
titanic = pd.read_csv('titanic.csv')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,Braund,Mr. Owen Harris,male,22.0,1,0,A5 21171,7.25,,S
2,1,1,Cumings,Mrs. John Bradley (Florence Briggs Thayer),female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,Heikkinen,Miss. Laina,female,26.0,0,0,STONO2. 3101282,7.925,,S
4,1,1,Futrelle,Mrs. Jacques Heath (Lily May Peel),female,35.0,1,0,113803,53.1,C123,S
5,0,3,Allen,Mr. William Henry,male,35.0,0,0,373450,8.05,,S


In [43]:
# selecting independent and dependent variables
X = titanic.iloc[:,[2,4,5,6,7,9]]
y = titanic['Survived']

In [44]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
1,Braund,male,22.0,1,0,7.25
2,Cumings,female,38.0,1,0,71.2833
3,Heikkinen,female,26.0,0,0,7.925
4,Futrelle,female,35.0,1,0,53.1
5,Allen,male,35.0,0,0,8.05


In [45]:
# Data Exploration
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 6 columns):
Pclass    891 non-null object
Sex       891 non-null object
Age       714 non-null float64
SibSp     891 non-null int64
Parch     891 non-null int64
Fare      891 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 88.7+ KB


In [46]:
X.isnull().sum() # Total number of missing values per column

Pclass      0
Sex         0
Age       177
SibSp       0
Parch       0
Fare        0
dtype: int64

In [47]:
X.describe(include='all')

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
count,891,891,714.0,891.0,891.0,891.0
unique,667,2,,,,
top,Andersson,male,,,,
freq,9,577,,,,
mean,,,29.699118,0.523008,0.381594,32.204208
std,,,14.526497,1.102743,0.806057,49.693429
min,,,0.42,0.0,0.0,0.0
25%,,,20.125,0.0,0.0,7.9104
50%,,,28.0,0.0,0.0,14.4542
75%,,,38.0,1.0,0.0,31.0


In [48]:
# In pclass we have 667 unique categories
X['Pclass'].unique()

array(['Braund', 'Cumings', 'Heikkinen', 'Futrelle', 'Allen', 'Moran',
       'McCarthy', 'Palsson', 'Johnson', 'Nasser', 'Sandstrom', 'Bonnell',
       'Saundercock', 'Andersson', 'Vestrom', 'Hewlett', 'Rice',
       'Williams', 'Vander Planke', 'Masselmani', 'Fynney', 'Beesley',
       'McGowan', 'Sloper', 'Asplund', 'Emir', 'Fortune', "O'Dwyer",
       'Todoroff', 'Uruchurtu', 'Spencer', 'Glynn', 'Wheadon', 'Meyer',
       'Holverson', 'Mamee', 'Cann', 'Nicola-Yarred', 'Ahlin', 'Turpin',
       'Kraeff', 'Laroche', 'Devaney', 'Rogers', 'Lennon', "O'Driscoll",
       'Samaan', 'Arnold-Franchi', 'Panula', 'Nosworthy', 'Harper',
       'Faunthorpe', 'Ostby', 'Woolner', 'Rugg', 'Novel', 'West',
       'Goodwin', 'Sirayanian', 'Icard', 'Harris', 'Skoog', 'Stewart',
       'Moubarek', 'Nye', 'Crease', 'Kink', 'Jenkin', 'Hood',
       'Chronopoulos', 'Bing', 'Moen', 'Staneff', 'Moutal', 'Caldwell',
       'Dowdell', 'Waelens', 'Sheerlinck', 'McDermott', 'Carrau', 'Ilett',
       'Backstrom

In [49]:
X['Pclass'].nunique()

667

In [50]:
X  = X.drop('Pclass',axis=1)

In [51]:
X.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare
1,male,22.0,1,0,7.25
2,female,38.0,1,0,71.2833
3,female,26.0,0,0,7.925
4,female,35.0,1,0,53.1
5,male,35.0,0,0,8.05


In [54]:
d = {'male':1,'female':0}
X['Sex'] = X['Sex'].replace(d)

In [55]:
X.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare
1,1,22.0,1,0,7.25
2,0,38.0,1,0,71.2833
3,0,26.0,0,0,7.925
4,0,35.0,1,0,53.1
5,1,35.0,0,0,8.05


In [59]:
# Missing value treatment
mis = X.groupby(by=['Sex'])['Age'].transform('mean')
X['Age'] = X['Age'].fillna(mis)

In [60]:
# Splitting the data into train and test
xtrain,xtest,ytrain,ytest = train_test_split(X,y,test_size=30)

In [61]:
# Model training on decision tree
from sklearn import tree
cls_tree = tree.DecisionTreeClassifier()

In [62]:
cls_tree.fit(xtrain,ytrain)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [63]:
# Prediction
ypred = cls_tree.predict(xtest)

In [64]:
# model evaluation
metrics.accuracy_score(ytest,ypred)

1.0

In [66]:
print(metrics.classification_report(ytest,ypred))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         4
           2       1.00      1.00      1.00         3
           3       1.00      1.00      1.00        23

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

