In [1]:
import numpy as np 
import matplotlib.pyplot as plt
import matplotlib.image as pltimg
import pandas as pd 

In [2]:
from sklearn.tree import DecisionTreeClassifier
import pydotplus
from sklearn.tree import export_graphviz

# Decision Tree Classifier

In [3]:
#load the data 
titanic = pd.read_csv("Titanic.csv")

shuffled = titanic 
shuffled = shuffled.sample(frac=1)

#map data and fill NaN values
shuffled['age'] = shuffled['age'].fillna(30)
pclass = {'1st': 1, '2nd': 2, '3rd': 3}
sex = {'female': 0, 'male': 1}

shuffled['pclass'] = shuffled['pclass'].map(pclass)
shuffled['sex'] = shuffled['sex'].map(sex)

#split the data into training and testing 
split = int(.8*len(shuffled))
train = shuffled[:split]
test = shuffled[split:]

train.head()
#x is independent and y is dependent 
features = ['pclass', 'sex', 'age', 'sibsp']
target = ['survived']

X = train[features]
Y = train['survived']

x_test = test[features]
y_test = test['survived']

dtree = DecisionTreeClassifier()
dtree = dtree.fit(X, Y)


export_graphviz(
            dtree,
            out_file=("final_tree.dot"),
            feature_names=features,
            class_names= ['no','yes'],
            rounded=True,
            filled=True
        )

# GridSearchCV

In [4]:
from sklearn.model_selection import GridSearchCV

params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), params, verbose=1, cv=3)
grid_search_cv.fit(X, Y)

Fitting 3 folds for each of 294 candidates, totalling 882 fits


GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                            13, 14, 15, 16, 17, 18, 19, 20, 21,
                                            22, 23, 24, 25, 26, 27, 28, 29, 30,
                                            31, ...],
                         'min_samples_split': [2, 3, 4]},
             verbose=1)

In [5]:
print(grid_search_cv.best_estimator_)
print('the optimmum number of the max_leaf_nodes parameter is 27')

DecisionTreeClassifier(max_leaf_nodes=10, random_state=42)
the optimmum number of the max_leaf_nodes parameter is 27


In [6]:
export_graphviz(
            grid_search_cv.best_estimator_,
            out_file=("gridsearch.dot"),
            feature_names=features,
            class_names= ['no','yes'],
            rounded=True,
            filled=True
        )

In [7]:
from sklearn.metrics import accuracy_score
yhat_test = grid_search_cv.best_estimator_.predict(x_test)
# Compute accuracy based on test samples
acc = accuracy_score(y_test, yhat_test)
print(acc)

0.8091603053435115


In [8]:
results = pd.DataFrame(data=y_test)
results = results.rename(columns={"survived": "y_test"})
results['yhat_test'] = yhat_test
display(results)
#percent survivors correctly predicted (on test set)
#results['y_test']==0
# fin = pd.DataFrame(results['y_test']==0)
# display(fin)

zeros = results[(results["y_test"] == 0) & (results["yhat_test"] == 0)]
print('Accuracy of Pruned Tree:')
print('fatalities correctly predicted:',len(zeros)/len(yhat_test))
ones = results[(results["y_test"] == 1) & (results["yhat_test"] == 1)]
print('survivals correctly predicted:',len(ones)/len(yhat_test))
#percent fatalities correctly predicted (on test set)

Unnamed: 0,y_test,yhat_test
160,1,1
450,0,0
228,0,1
360,1,0
288,1,1
...,...,...
674,1,0
247,1,1
1044,1,1
1221,0,0


Accuracy of Pruned Tree:
fatalities correctly predicted: 0.5114503816793893
survivals correctly predicted: 0.29770992366412213


In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
rnd_clf = RandomForestClassifier(n_estimators=50, max_leaf_nodes=27, n_jobs=-1)
rnd_clf.fit(X, Y)

y_pred_rf = rnd_clf.predict(x_test)
 
will = pd.DataFrame(columns=['pred','real'])
will['real'] = y_test
will['pred'] = y_pred_rf
will

zeros = will[(will["pred"] == 0) & (will["real"] == 0)]
print('Accuracy of random forest ')
print('fatalities correctly predicted:',len(zeros)/len(y_test))
ones = will[(will["pred"] == 1) & (will["real"] == 1)]
print('survivals correctly predicted:',len(ones)/len(y_test))

Accuracy of random forest 
fatalities correctly predicted: 0.5190839694656488
survivals correctly predicted: 0.29389312977099236
