In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, balanced_accuracy_score, f1_score, classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score

In [2]:
df = pd.read_csv('Titanic.csv')
df = df.drop(columns=['Unnamed: 0'])
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.2500,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.9250,2
3,4,1,1,0,35.0,1,0,53.1000,2
4,5,0,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...,...
1304,1305,0,3,1,30.0,0,0,8.0500,2
1305,1306,1,1,0,39.0,0,0,108.9000,0
1306,1307,0,3,1,38.5,0,0,7.2500,2
1307,1308,0,3,1,30.0,0,0,8.0500,2


### Data Splitting Untuk Train & Test
- Train 90% & Test 10%

In [3]:
xtr, xts, ytr, yts = train_test_split(df[['Age','Sex','Pclass','SibSp','Parch','Embarked','Fare']],df['Survived'],test_size=0.1)

# Random Forest

In [4]:
model = RandomForestClassifier()
model.fit(xtr,ytr)
model.score(xts,yts)

0.8396946564885496

# Parameter Tuning

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
- n_estimator berupa angka, menentukan jumlah pohonnya
- Criterion yang Tersedia : Gini & Entropy

In [5]:
n_estimators = list(range(0,101))
criterion = ['gini','entropy']
param = {'n_estimators':n_estimators,'criterion':criterion}

# Randomized Search CV

- Mencari Parameter Terbaik

In [6]:
model = RandomForestClassifier()
modelrs = RandomizedSearchCV(estimator=model, param_distributions=param, cv=5)
modelrs.fit(xtr,ytr)

ValueError: n_estimators must be greater than zero, got 0.

ValueError: n_estimators must be greater than zero, got 0.

ValueError: n_estimators must be greater than zero, got 0.

ValueError: n_estimators must be greater than zero, got 0.

ValueError: n_estimators must be greater than zero, got 0.



RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [17]:
print(modelrs.score(xts,yts))

0.8473282442748091


In [16]:
print(modelrs.best_params_)
est = modelrs.best_params_['n_estimators']
crit = modelrs.best_params_['criterion']

{'n_estimators': 17, 'criterion': 'entropy'}


- Setelah Mendapat Parameter Terbaik, Gunakan Parameter Tersebut Supaya Hasil Dari Model Menjadi Maksimal

In [9]:
modelbaru = RandomForestClassifier(n_estimators=est,criterion=crit)
modelbaru.fit(xtr,ytr)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=17,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [10]:
print(modelbaru.score(xts,yts))
df['Predict'] = modelbaru.predict(df[['Age','Sex','Pclass','SibSp','Parch','Embarked','Fare']])

0.8549618320610687


## Confusion Matrix

- TP = True Positive : Model = 1, Prediksi = 1
- FN = False Negative : Model = 1 , Prediksi = 0
- FP = False Positive = Model = 0 , Prediksi = 1
- TN = True Negative = Model = 0 , Prediksi = 0

In [11]:
tp,fn,fp,tn = confusion_matrix(df['Survived'],df['Predict']).ravel()
tp,fn,fp,tn

(793, 22, 33, 461)

In [12]:
print('Akurasi :',(tp+tn)/(tp+tn+fp+fn))
print('Error Rate :',(fp+fn)/(tp+tn+fp+fn))
print('TP Rate / Recall (+) :',tp/(tp+fn))
print('FP Rate:',fp/(fp+tn))
print('TN Rate / Recall (-) :',tn/(fp+tn))
print('FN Rate :',fn/(tp+fn))
print('Precision (+) :',tp/(tp+fp))
print('Precision (-) :',tn/(tn+fn))
print('Prevalence :',(tp+fn)/(tp+tn+fp+fn))
print('Null Error Rate :',(fp+tn)/(tp+tn+fp+fn))
Precision = (tp/(tp+fp)) #Precision Positif
Recall = (tp/(tp+fn)) # Recall Positif
print('F1 Score :',2*((Precision*Recall)/(Precision+Recall)))

Akurasi : 0.957983193277311
Error Rate : 0.04201680672268908
TP Rate / Recall (+) : 0.9730061349693252
FP Rate: 0.06680161943319839
TN Rate / Recall (-) : 0.9331983805668016
FN Rate : 0.026993865030674847
Precision (+) : 0.9600484261501211
Precision (-) : 0.9544513457556936
Prevalence : 0.6226126814362108
Null Error Rate : 0.37738731856378915
F1 Score : 0.9664838513101768


In [19]:
print('Model Score :',modelbaru.score(df[['Age','Sex','Pclass','SibSp','Parch','Embarked','Fare']],df['Survived']))
print('Accuracy Score :',accuracy_score(df['Survived'],df['Predict']))
print('Recall + :',recall_score(df['Survived'],df['Predict'],pos_label=0))
print('Recall - :',recall_score(df['Survived'],df['Predict'],pos_label=1))
print('Precision + :',precision_score(df['Survived'],df['Predict'],pos_label=0))
print('Precision - :',precision_score(df['Survived'],df['Predict'],pos_label=1))
print('Balanced Accuracy Score :',balanced_accuracy_score(df['Survived'],df['Predict']))
print('F1 Score + :',f1_score(df['Survived'],df['Predict'],pos_label=0))
print('F1 Score - :',f1_score(df['Survived'],df['Predict'],pos_label=1))

Model Score : 0.957983193277311
Accuracy Score : 0.957983193277311
Recall + : 0.9730061349693252
Recall - : 0.9331983805668016
Precision + : 0.9600484261501211
Precision - : 0.9544513457556936
Balanced Accuracy Score : 0.9531022577680635
F1 Score + : 0.9664838513101768
F1 Score - : 0.9437052200614124


In [14]:
roc_auc_score(df['Survived'],df['Predict'])

0.9531022577680635

In [15]:
print(classification_report(df['Survived'],df['Predict']))

              precision    recall  f1-score   support

           0       0.96      0.97      0.97       815
           1       0.95      0.93      0.94       494

    accuracy                           0.96      1309
   macro avg       0.96      0.95      0.96      1309
weighted avg       0.96      0.96      0.96      1309

