In [1]:
# importing the required libraries

import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn import tree

In [2]:
# reading the data in a pandas DataFrame

url='https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv'
titanic = pd.read_csv(url)

In [3]:
# splitting the dependent and independent variables from dataset

X = titanic[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch','Fare']]
y = titanic['Survived']

In [4]:
X.dtypes

Pclass      int64
Sex        object
Age       float64
SibSp       int64
Parch       int64
Fare      float64
dtype: object

The 'Sex' column needs to be converted into numerical data.

In [5]:
X['Sex'] = (X['Sex']=='male').astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Sex'] = (X['Sex']=='male').astype('int')


In [6]:
print(X.describe())
print(y.describe())

           Pclass         Sex         Age       SibSp       Parch        Fare
count  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean     2.308642    0.647587   29.699118    0.523008    0.381594   32.204208
std      0.836071    0.477990   14.526497    1.102743    0.806057   49.693429
min      1.000000    0.000000    0.420000    0.000000    0.000000    0.000000
25%      2.000000    0.000000   20.125000    0.000000    0.000000    7.910400
50%      3.000000    1.000000   28.000000    0.000000    0.000000   14.454200
75%      3.000000    1.000000   38.000000    1.000000    0.000000   31.000000
max      3.000000    1.000000   80.000000    8.000000    6.000000  512.329200
count    891.000000
mean       0.383838
std        0.486592
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: Survived, dtype: float64


We can see that 'Age' column has some missing values. Let's deal with missing values first.

In [7]:
# imputing the missing values in the 'Age' column with its median

impute = SimpleImputer(strategy='median')
t = impute.fit_transform(X[['Age']])
X['Age'] = pd.DataFrame(t)
X.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Age'] = pd.DataFrame(t)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,2.308642,0.647587,29.361582,0.523008,0.381594,32.204208
std,0.836071,0.47799,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,0.42,0.0,0.0,0.0
25%,2.0,0.0,22.0,0.0,0.0,7.9104
50%,3.0,1.0,28.0,0.0,0.0,14.4542
75%,3.0,1.0,35.0,1.0,0.0,31.0
max,3.0,1.0,80.0,8.0,6.0,512.3292


Now that we have a data with no missing value, let's go ahead with building a simple decision tree without any hyper-parameters tuning

In [8]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=10)

model = tree.DecisionTreeClassifier()
model.fit(X_train,y_train)
print(model.score(X_train,y_train))
print(model.score(X_test,y_test))

0.9820359281437125
0.7937219730941704


We can see that there is a difference in accuracy of training and test data. So we can say that model is overfitted on training data.

Let's try to see if we can improve accuracy on test data

In [9]:
# we are tuning five hyperparameters right now, we are passing the different values for all the parameters

grid_param = {'criterion': ['gini', 'entropy'],'splitter' :['best','random'],'max_depth' : range(2,32,3),
              'min_samples_leaf' : range(1,10,2),'min_samples_split': range(0,10,2)}

In [10]:
# grid search cv model on the above hyperparameters

model1 = tree.DecisionTreeClassifier(presort=True)
grid_search = GridSearchCV(estimator=model1,param_grid=grid_param,cv=5,n_jobs =-1)

In [11]:
# fitting grid search cv model on training data

grid_search.fit(X_train,y_train)
b_params = grid_search.best_params_



In [12]:
print('Best accuracy on training data is %.2f'%grid_search.best_score_)

Best accuracy on training data is 0.82


In [13]:
# fitting trainig data with best parameters

model_f = tree.DecisionTreeClassifier(criterion = b_params['criterion'], max_depth = b_params['max_depth'],
                                      min_samples_leaf= b_params['min_samples_leaf'],
                                      min_samples_split= b_params['min_samples_split'],
                                      splitter =b_params['splitter'])
model_f.fit(X_train,y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=11, min_samples_split=6,
                       splitter='random')

In [14]:
print('Accuracy of best model on test data is %.2f'%model_f.score(X_test,y_test))

Accuracy of best model on test data is 0.80


As we see that test data accuracy is very nearer to the training data accuracy, we can say that the model we build is good.