In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, balanced_accuracy_score, f1_score, classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score

In [2]:
df = pd.read_csv('Titanic.csv')
df = df.drop(columns=['Unnamed: 0'])

In [3]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.2500,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.9250,2
3,4,1,1,0,35.0,1,0,53.1000,2
4,5,0,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...,...
1304,1305,0,3,1,28.0,0,0,8.0500,2
1305,1306,1,1,0,39.0,0,0,108.9000,0
1306,1307,0,3,1,38.5,0,0,7.2500,2
1307,1308,0,3,1,28.0,0,0,8.0500,2


In [6]:
xtr, xts, ytr, yts = train_test_split(df[['Age','Sex','Pclass','SibSp','Parch','Embarked','Fare']],df['Survived'],test_size=0.1)

# Tree Decision

In [12]:
model = tree.DecisionTreeClassifier()
model.fit(xtr,ytr)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [15]:
print(model.score(xts,yts))
print(cross_val_score(model,xtr,ytr,cv=5))
print(np.mean(cross_val_score(model,xtr,ytr)))

0.7633587786259542
[0.78813559 0.79237288 0.82627119 0.83404255 0.83404255]
0.8115578795528309


# Parameter Tuning

In [16]:
criterion = ['gini','entropy']
splitter = ['best','random']
param = {'criterion':criterion,'splitter':splitter}

# Randomized Search CV

In [18]:
model = tree.DecisionTreeClassifier()
modelrs = RandomizedSearchCV(estimator=model, param_distributions=param, cv=5)

In [20]:
modelrs.fit(xtr,ytr)
print(modelrs.score(xts,yts))

0.7709923664122137




In [21]:
modelrs.best_params_

{'splitter': 'best', 'criterion': 'gini'}

In [23]:
modelbaru = tree.DecisionTreeClassifier(splitter='best',criterion='gini')
modelbaru.fit(xtr,ytr)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [25]:
predict = modelbaru.predict(df[['Age','Sex','Pclass','SibSp','Parch','Embarked','Fare']])
df['Predict'] = predict
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Predict
0,1,0,3,1,22.0,1,0,7.2500,2,0
1,2,1,1,0,38.0,1,0,71.2833,0,1
2,3,1,3,0,26.0,0,0,7.9250,2,0
3,4,1,1,0,35.0,1,0,53.1000,2,1
4,5,0,3,1,35.0,0,0,8.0500,2,0
...,...,...,...,...,...,...,...,...,...,...
1304,1305,0,3,1,28.0,0,0,8.0500,2,0
1305,1306,1,1,0,39.0,0,0,108.9000,0,1
1306,1307,0,3,1,38.5,0,0,7.2500,2,0
1307,1308,0,3,1,28.0,0,0,8.0500,2,0


In [24]:
modelbaru.score(xts,yts)

0.7862595419847328

In [28]:
tp,fn,fp,tn = confusion_matrix(df['Survived'],df['Predict']).ravel()
tp,fn,fp,tn

(802, 13, 38, 456)

In [29]:
print('Akurasi :',(tp+tn)/(tp+tn+fp+fn))
print('Error Rate :',(fp+fn)/(tp+tn+fp+fn))
print('TP Rate / Recall (+) :',tp/(tp+fn))
print('FP Rate:',fp/(fp+tn))
print('TN Rate / Recall (-) :',tn/(fp+tn))
print('FN Rate :',fn/(tp+fn))
print('Precision (+) :',tp/(tp+fp))
print('Precision (-) :',tn/(tn+fn))
print('Prevalence :',(tp+fn)/(tp+tn+fp+fn))
print('Null Error Rate :',(fp+tn)/(tp+tn+fp+fn))
Precision = (tp/(tp+fp)) #Precision Positif
Recall = (tp/(tp+fn)) # Recall Positif
print('F1 Score :',2*((Precision*Recall)/(Precision+Recall)))

Akurasi : 0.961038961038961
Error Rate : 0.03896103896103896
TP Rate / Recall (+) : 0.9840490797546012
FP Rate: 0.07692307692307693
TN Rate / Recall (-) : 0.9230769230769231
FN Rate : 0.015950920245398775
Precision (+) : 0.9547619047619048
Precision (-) : 0.9722814498933902
Prevalence : 0.6226126814362108
Null Error Rate : 0.37738731856378915
F1 Score : 0.9691842900302116


In [30]:
print('Model Score :',modelbaru.score(df[['Age','Sex','Pclass','SibSp','Parch','Embarked','Fare']],df['Survived']))
print('Accuracy Score :',accuracy_score(df['Survived'],df['Predict']))
print('Recall + :',recall_score(df['Survived'],df['Predict'],pos_label=1))
print('Recall - :',recall_score(df['Survived'],df['Predict'],pos_label=0))
print('Precision + :',precision_score(df['Survived'],df['Predict'],pos_label=1))
print('Precision - :',precision_score(df['Survived'],df['Predict'],pos_label=0))
print('Balanced Accuracy Score :',balanced_accuracy_score(df['Survived'],df['Predict']))
print('F1 Score + :',f1_score(df['Survived'],df['Predict'],pos_label=1))
print('F1 Score - :',f1_score(df['Survived'],df['Predict'],pos_label=0))

Model Score : 0.961038961038961
Accuracy Score : 0.961038961038961
Recall + : 0.9230769230769231
Recall - : 0.9840490797546012
Precision + : 0.9722814498933902
Precision - : 0.9547619047619048
Balanced Accuracy Score : 0.9535630014157621
F1 Score + : 0.9470404984423677
F1 Score - : 0.9691842900302116


In [31]:
roc_auc_score(df['Survived'],df['Predict'])

0.9535630014157621

In [32]:
print(classification_report(df['Survived'],df['Predict']))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97       815
           1       0.97      0.92      0.95       494

    accuracy                           0.96      1309
   macro avg       0.96      0.95      0.96      1309
weighted avg       0.96      0.96      0.96      1309

