### Predict Titanic Survival Decision Trees

#### Import Modules

In [131]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

#### Import Data

In [132]:
passengers = pd.read_csv('passengers.csv')

#### Explore Data

In [133]:
passengers.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [134]:
passengers.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

#### Clean Data 

In [135]:
passengers['Sex-int'] = passengers['Sex'].apply(lambda row: 0 if row == 'male' else 1)
passengers.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex-int
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1


#### Fill NaN Values

In [136]:
passengers['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [137]:
passengers['Age'].isnull().sum()

177

In [138]:
passengers['Age'].mean()

29.69911764705882

In [139]:
passengers['Age'].fillna(passengers['Age'].mean(), inplace =True)

In [140]:
passengers['Age']

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64

In [141]:
passengers['Age'].isnull().sum()

0

In [142]:
passengers_dummies = pd.get_dummies(passengers['Pclass'])
passengers_dummies.head()

Unnamed: 0,1,2,3
0,0,0,1
1,1,0,0
2,0,0,1
3,1,0,0
4,0,0,1


In [143]:
passengers = pd.concat([passengers, passengers_dummies], axis ='columns')

In [144]:
passengers.rename(columns= {1:'FirstClass', 2:'SecondClass', 3:'ThirdClass'}, inplace =True)
passengers.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex-int,FirstClass,SecondClass,ThirdClass
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,1,0,0


#### Select and Split the Data

In [155]:
# Columns 'FirstClass', 'SecondClass', 'ThirdClass' are dependent.
# One column needs to be droped in the model features.

In [145]:
passengers_features = passengers[[
    'Sex-int',
    'Age',
    'FirstClass',
    'SecondClass',    
]]
passengers_features.head()

Unnamed: 0,Sex-int,Age,FirstClass,SecondClass
0,0,22.0,0,0
1,1,38.0,1,0
2,1,26.0,0,0
3,1,35.0,1,0
4,0,35.0,0,0


In [146]:
passengers_labels = passengers['Survived']
passengers_labels

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [147]:
train_features, test_features, train_labels, test_labels =\
train_test_split(passengers_features, passengers_labels)

In [148]:
print(len(train_features),len(train_labels))

668 668


#### Create and Evaluate the Model

In [149]:
classifier = DecisionTreeClassifier()

In [150]:
classifier.fit(train_features, train_labels)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [151]:
print("Training set score: {:.3f}".format(classifier.score(train_features, train_labels)))
print("Test set score: {:.3f}".format(classifier.score(test_features, test_labels)))

Training set score: 0.874
Test set score: 0.816


#### Predict with the Model

In [152]:
Jack = np.array([0.0,20.0,0.0,0.0])
Rose = np.array([1.0,17.0,1.0,0.0])
You  = np.array([0.0,33.0,0.0,0.0])

sample_passengers = np.array([Jack, Rose, You])

In [153]:
classifier.predict(sample_passengers)

array([0, 1, 0], dtype=int64)

In [154]:
classifier.predict_proba(sample_passengers)

array([[0.76923077, 0.23076923],
       [0.        , 1.        ],
       [1.        , 0.        ]])