In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, balanced_accuracy_score, f1_score, classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('Titanic.csv')
df = df.drop(columns=['Unnamed: 0'])

In [3]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.2500,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.9250,2
3,4,1,1,0,35.0,1,0,53.1000,2
4,5,0,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...,...
1304,1305,0,3,1,30.0,0,0,8.0500,2
1305,1306,1,1,0,39.0,0,0,108.9000,0
1306,1307,0,3,1,38.5,0,0,7.2500,2
1307,1308,0,3,1,30.0,0,0,8.0500,2


## Splitting Dataset Untuk Train dan Test

- Train 90% Dari Data & Test 10% Dari Data

In [4]:
xtr, xts, ytr, yts = train_test_split(df[['Age','Sex','Pclass','SibSp','Parch','Embarked','Fare']],df['Survived'],test_size=0.1)

# K-Nearest Neighbors

In [22]:
model = KNeighborsClassifier()
model.fit(xtr,ytr)
df['Predict'] = model.predict(df[['Age','Sex','Pclass','SibSp','Parch','Embarked','Fare']])

In [23]:
model.score(xts,yts)

0.7099236641221374

# Parameter Tuning

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
- Leaf Size dan n_neighrbors berupa angka

In [8]:
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p=[1,2]

In [9]:
param = {'leaf_size':leaf_size, 'n_neighbors':n_neighbors, 'p':p}

# Randomized Search CV

- Mencari Parameter Terbaik

In [10]:
model = KNeighborsClassifier()
modelrs = RandomizedSearchCV(estimator=model, param_distributions=param, cv=5)

In [11]:
modelrs.fit(xtr,ytr)
modelrs.score(xts,yts)

0.6870229007633588

In [24]:
print(modelrs.best_params_)
p = modelrs.best_params_['p']
n = modelrs.best_params_['n_neighbors']
leaf = modelrs.best_params_['leaf_size']

{'p': 1, 'n_neighbors': 3, 'leaf_size': 5}


- Setelah Mendapat Parameter Terbaik, Gunakan Parameter Tersebut Supaya Hasil Model Menjadi Maksimal

In [25]:
modelbaru = KNeighborsClassifier(p=p,n_neighbors=n,leaf_size=leaf)
modelbaru.fit(xtr,ytr)
modelbaru.score(xts,yts)

0.6870229007633588

In [14]:
predict = modelbaru.predict(df[['Age','Sex','Pclass','SibSp','Parch','Embarked','Fare']])
df['Predict'] = predict
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Predict
0,1,0,3,1,22.0,1,0,7.2500,2,0
1,2,1,1,0,38.0,1,0,71.2833,0,1
2,3,1,3,0,26.0,0,0,7.9250,2,1
3,4,1,1,0,35.0,1,0,53.1000,2,1
4,5,0,3,1,35.0,0,0,8.0500,2,0
...,...,...,...,...,...,...,...,...,...,...
1304,1305,0,3,1,30.0,0,0,8.0500,2,0
1305,1306,1,1,0,39.0,0,0,108.9000,0,1
1306,1307,0,3,1,38.5,0,0,7.2500,2,0
1307,1308,0,3,1,30.0,0,0,8.0500,2,0


In [15]:
tp,fn,fp,tn = confusion_matrix(df['Survived'],df['Predict']).ravel()
tp,fn,fp,tn

(737, 78, 129, 365)

## Confusion Matrix

- TP = True Positive : Model = 1, Prediksi = 1
- FN = False Negative : Model = 1 , Prediksi = 0
- FP = False Positive = Model = 0 , Prediksi = 1
- TN = True Negative = Model = 0 , Prediksi = 0

In [16]:
print('Akurasi :',(tp+tn)/(tp+tn+fp+fn))
print('Error Rate :',(fp+fn)/(tp+tn+fp+fn))
print('TP Rate / Recall (+) :',tp/(tp+fn))
print('FP Rate:',fp/(fp+tn))
print('TN Rate / Recall (-) :',tn/(fp+tn))
print('FN Rate :',fn/(tp+fn))
print('Precision (+) :',tp/(tp+fp))
print('Precision (-) :',tn/(tn+fn))
print('Prevalence :',(tp+fn)/(tp+tn+fp+fn))
print('Null Error Rate :',(fp+tn)/(tp+tn+fp+fn))
Precision = (tp/(tp+fp)) #Precision Positif
Recall = (tp/(tp+fn)) # Recall Positif
print('F1 Score :',2*((Precision*Recall)/(Precision+Recall)))

Akurasi : 0.8418640183346066
Error Rate : 0.15813598166539344
TP Rate / Recall (+) : 0.9042944785276074
FP Rate: 0.2611336032388664
TN Rate / Recall (-) : 0.7388663967611336
FN Rate : 0.09570552147239264
Precision (+) : 0.851039260969977
Precision (-) : 0.8239277652370203
Prevalence : 0.6226126814362108
Null Error Rate : 0.37738731856378915
F1 Score : 0.876859012492564


In [26]:
print('Model Score :',modelbaru.score(df[['Age','Sex','Pclass','SibSp','Parch','Embarked','Fare']],df['Survived']))
print('Accuracy Score :',accuracy_score(df['Survived'],df['Predict']))
print('Recall + :',recall_score(df['Survived'],df['Predict'],pos_label=0))
print('Recall - :',recall_score(df['Survived'],df['Predict'],pos_label=1))
print('Precision + :',precision_score(df['Survived'],df['Predict'],pos_label=0))
print('Precision - :',precision_score(df['Survived'],df['Predict'],pos_label=1))
print('Balanced Accuracy Score :',balanced_accuracy_score(df['Survived'],df['Predict']))
print('F1 Score + :',f1_score(df['Survived'],df['Predict'],pos_label=0))
print('F1 Score - :',f1_score(df['Survived'],df['Predict'],pos_label=1))

Model Score : 0.8418640183346066
Accuracy Score : 0.7891520244461421
Recall + : 0.8809815950920246
Recall - : 0.6376518218623481
Precision + : 0.8004459308807135
Precision - : 0.7645631067961165
Balanced Accuracy Score : 0.7593167084771864
F1 Score + : 0.838785046728972
F1 Score - : 0.695364238410596


In [18]:
roc_auc_score(df['Survived'],df['Predict'])

0.8215804376443705

In [19]:
print(classification_report(df['Survived'],df['Predict']))

              precision    recall  f1-score   support

           0       0.85      0.90      0.88       815
           1       0.82      0.74      0.78       494

    accuracy                           0.84      1309
   macro avg       0.84      0.82      0.83      1309
weighted avg       0.84      0.84      0.84      1309

