### Decision Tree will be used to predict survivors of the Titanic.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn
from pandas import Series, DataFrame
from pylab import rcParams
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics, tree

In [2]:
# Read the Titanic training CSV from its GitHub URL and preview the first rows.
url = 'https://raw.githubusercontent.com/BigDataGal/Python-for-Data-Science/master/titanic-train.csv'
titanic = pd.read_csv(filepath_or_buffer=url)
titanic.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Summarise column dtypes and non-null counts to see which columns have gaps.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
# Cleaning data: count the missing (NaN) entries in every column.
missing_values = titanic.isna().sum()

In [5]:
# Show the per-column missing counts, skipping the first two columns
# (PassengerId and Survived).
missing_values.iloc[2:]

Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [6]:
# Overall number of missing entries (kept for reference; not used in the
# expression displayed below).
total_missing = missing_values.sum()
# Total number of cells in the table (rows x columns).
# np.prod replaces np.product, which was deprecated and removed in NumPy 2.0.
total_cells = np.prod(titanic.shape)
# Percent of data that is missing.
# NOTE: each column's missing count is divided by the TOTAL number of cells,
# so these figures are shares of the whole table, not per-column missing
# rates; divide by len(titanic) instead to get the per-column rate.
(missing_values / total_cells) * 100

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            1.655443
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          6.425365
Embarked       0.018706
dtype: float64

In [7]:
# Keep only fully populated rows: a row is discarded if ANY column is NaN.
# NOTE(review): the train/test shapes printed further down add up to 183 of
# the original 891 rows; Cabin (204 non-null values, and dropped later anyway)
# drives most of that loss -- consider restricting dropna to the columns that
# are actually used as features.
clean_df = titanic.dropna(axis=0, how="any")

In [8]:
# Convert the string-valued Sex column into a numeric indicator column.
# drop_first=True removes the first level, leaving the single "male" column.
sex = pd.get_dummies(clean_df["Sex"], drop_first=True)
sex.head()

Unnamed: 0,male
1,0
3,0
6,1
10,0
11,0


In [9]:
# For Embarked: full one-hot encoding, one indicator column per level (C, Q, S).
# This value of `embark` is overwritten by the next cell, which rebuilds it
# with drop_first=True.
embark = pd.get_dummies(clean_df.Embarked)
embark.head()

Unnamed: 0,C,Q,S
1,1,0,0
3,0,0,1
6,0,0,1
10,0,0,1
11,0,0,1


In [10]:
# For Embarked: indicator columns with the first level (C) dropped, so only
# Q and S remain.
embark = pd.get_dummies(clean_df.Embarked, drop_first=True)
embark.head()

Unnamed: 0,Q,S
1,0,0
3,0,1
6,0,1
10,0,1
11,0,1


In [11]:
# For Pclass: indicator columns with the first level (1) dropped.
# The resulting column labels are the integers 2 and 3, not strings.
Pcl = pd.get_dummies(clean_df.Pclass, drop_first=True)
Pcl.head()

Unnamed: 0,2,3
1,0,0
3,0,0
6,0,0
10,0,1
11,0,0


In [12]:
# Append the Sex, Embarked and Pclass indicator columns to the cleaned frame,
# aligned on the row index.
# NOTE: clean_df is rebound from itself, so re-running this cell appends the
# indicator columns a second time.
clean_df = pd.concat([clean_df, sex, embark, Pcl], axis="columns")

In [13]:
# Preview the frame with the indicator columns appended on the right.
clean_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,male,Q,S,2,3
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0,0,0,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0,0,1,0,0
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1,0,1,0,0
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S,0,0,1,0,1
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S,0,0,1,0,0


In [14]:
# Drop what we don't need: the raw Sex/Embarked/Pclass columns (replaced by
# their indicator columns) and the remaining columns not used as features.
# The frame is modified in place, so re-running this cell raises KeyError.
clean_df.drop(
    columns=['Sex', 'Embarked', 'PassengerId', 'Name', 'Ticket', 'Pclass', 'Cabin'],
    inplace=True,
)

In [15]:
# Preview the remaining columns: the target plus the numeric features.
clean_df.head(n=5)

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,male,Q,S,2,3
1,1,38.0,1,0,71.2833,0,0,0,0,0
3,1,35.0,1,0,53.1,0,0,1,0,0
6,0,54.0,0,0,51.8625,1,0,1,0,0
10,1,4.0,1,1,16.7,0,0,1,0,1
11,1,58.0,0,0,26.55,0,0,1,0,0


In [16]:
# Sanity check: count the NaN entries left in each remaining column.
clean_df.isna().sum()

Survived    0
Age         0
SibSp       0
Parch       0
Fare        0
male        0
Q           0
S           0
2           0
3           0
dtype: int64

In [17]:
# Selecting features: every column except the target is a predictor.
# Column labels are cast to str because the Pclass indicator columns are
# labelled with the integers 2 and 3; recent scikit-learn releases reject
# DataFrames whose column names mix str and non-str types.
X = clean_df.drop(columns="Survived").rename(columns=str)
# Target: 1 = survived, 0 = did not survive.
y = clean_df["Survived"]

In [18]:
# Splitting the data: 75% train vs 25% test; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)
# Print the shapes of each split (features, then target): train first, then test.
print(*(part.shape for part in (X_train, y_train)))
print(*(part.shape for part in (X_test, y_test)))

(137, 9) (137,)
(46, 9) (46,)


In [19]:
# Training the model: an entropy-based decision tree limited to depth 3,
# with random_state fixed for reproducibility.
dtree = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
# check_input=True and X_idx_sorted=None were only restating defaults;
# X_idx_sorted was deprecated and later removed from fit(), so passing it
# raises TypeError on current scikit-learn releases.
dtree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [20]:
# Predict the Survived label for every row of the held-out test features.
y_pred = dtree.predict(X_test)

In [22]:
# Performance of the model on the held-out test set.
# Accuracy is the fraction of test rows whose prediction matches the label.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
# Number of test rows where the prediction differs from the true label.
count_misclassified = y_test.ne(y_pred).sum()
print('Misclassified samples: {}'.format(count_misclassified))
print('Accuracy: {:.2f}'.format(accuracy))

Misclassified samples: 9
Accuracy: 0.80


In [23]:
# Classification report: per-class precision, recall, F1 score and support.
# NOTE(review): this import would normally live in the import cell at the top.
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred=y_pred))

             precision    recall  f1-score   support

          0       0.50      1.00      0.67         9
          1       1.00      0.76      0.86        37

avg / total       0.90      0.80      0.82        46



The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative.


The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.


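The F1 score is the harmonic mean of the precision and recall, $F_1 = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$; it reaches its best value at 1 and its worst at 0.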

The support is the number of occurrences of each class in y_true (here, the test labels `y_test`).