In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, balanced_accuracy_score, f1_score, classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score

In [2]:
df = pd.read_csv('Titanic.csv')
df = df.drop(columns=['Unnamed: 0'])

In [3]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.2500,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.9250,2
3,4,1,1,0,35.0,1,0,53.1000,2
4,5,0,3,1,35.0,0,0,8.0500,2
...,...,...,...,...,...,...,...,...,...
1304,1305,0,3,1,30.0,0,0,8.0500,2
1305,1306,1,1,0,39.0,0,0,108.9000,0
1306,1307,0,3,1,38.5,0,0,7.2500,2
1307,1308,0,3,1,30.0,0,0,8.0500,2


## Splitting Dataset

- Train 90%
- Test 10%

In [4]:
xtr, xts, ytr, yts = train_test_split(df[['Age','Sex','Pclass','SibSp','Parch','Embarked','Fare']],df['Survived'],test_size=0.1)

# Logistic Regression

In [5]:
model = LogisticRegression()
model.fit(xtr,ytr)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [6]:
print(model.score(xtr,ytr))

0.8531409168081494


# Parameter Tuning

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
- Penalty yang Tersedia = none, elastic-net, l1, l2
- Solver yang tersedia = lbfgs, liblinear, sag, saga, newton-cg
- Max iter berupa angka = menentukan model berapa kali jalan

In [7]:
penalty = ['l2','l1','elastic-net','none']
solver = ['lbfgs','liblinear','sag','saga','newton-cg']
max_iter = [1,10,100,1000,10000]
param = {'penalty':penalty,'solver':solver,'max_iter':max_iter}

# Randomized Search CV

- Mencari Parameter Terbaik

In [8]:
model = LogisticRegression()
modelrs = RandomizedSearchCV(estimator=model, param_distributions=param, cv=5)

In [9]:
modelrs.fit(xtr,ytr)
modelrs.score(xts,yts)

ValueError: Logistic Regression supports only penalties in ['l1', 'l2', 'elasticnet', 'none'], got elastic-net.

ValueError: Logistic Regression supports only penalties in ['l1', 'l2', 'elasticnet', 'none'], got elastic-net.



0.8625954198473282

In [18]:
print(modelrs.best_params_)
solve = modelrs.best_params_['solver']
penal = modelrs.best_params_['penalty']
iter = modelrs.best_params_['max_iter']

{'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 10000}


- Setelah Mendapat Parameter Terbaik, Gunakan Parameter Tersebut Agar Hasil Model Menjadi Maksimal

In [11]:
modelbaru = LogisticRegression(
    solver=solve,penalty=penal,max_iter=iter)
modelbaru.fit(xtr,ytr)
modelbaru.score(xts,yts)

0.8625954198473282

In [12]:
predict = modelbaru.predict(df[['Age','Sex','Pclass','SibSp','Parch','Embarked','Fare']])
df['Predict'] = predict
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Predict
0,1,0,3,1,22.0,1,0,7.2500,2,0
1,2,1,1,0,38.0,1,0,71.2833,0,1
2,3,1,3,0,26.0,0,0,7.9250,2,1
3,4,1,1,0,35.0,1,0,53.1000,2,1
4,5,0,3,1,35.0,0,0,8.0500,2,0
...,...,...,...,...,...,...,...,...,...,...
1304,1305,0,3,1,30.0,0,0,8.0500,2,0
1305,1306,1,1,0,39.0,0,0,108.9000,0,1
1306,1307,0,3,1,38.5,0,0,7.2500,2,0
1307,1308,0,3,1,30.0,0,0,8.0500,2,0


In [13]:
tp,fn,fp,tn = confusion_matrix(df['Survived'],df['Predict']).ravel()
tp,fn,fp,tn

(735, 80, 110, 384)

## Confusion Matrix

- TP = True Positive : Model = 1, Prediksi = 1
- FN = False Negative : Model = 1 , Prediksi = 0
- FP = False Positive = Model = 0 , Prediksi = 1
- TN = True Negative = Model = 0 , Prediksi = 0

In [14]:
print('Akurasi :',(tp+tn)/(tp+tn+fp+fn))
print('Error Rate :',(fp+fn)/(tp+tn+fp+fn))
print('TP Rate / Recall (+) :',tp/(tp+fn))
print('FP Rate:',fp/(fp+tn))
print('TN Rate / Recall (-) :',tn/(fp+tn))
print('FN Rate :',fn/(tp+fn))
print('Precision (+) :',tp/(tp+fp))
print('Precision (-) :',tn/(tn+fn))
print('Prevalence :',(tp+fn)/(tp+tn+fp+fn))
print('Null Error Rate :',(fp+tn)/(tp+tn+fp+fn))
Precision = (tp/(tp+fp)) #Precision Positif
Recall = (tp/(tp+fn)) # Recall Positif
print('F1 Score :',2*((Precision*Recall)/(Precision+Recall)))

Akurasi : 0.8548510313216195
Error Rate : 0.14514896867838045
TP Rate / Recall (+) : 0.901840490797546
FP Rate: 0.22267206477732793
TN Rate / Recall (-) : 0.7773279352226721
FN Rate : 0.09815950920245399
Precision (+) : 0.8698224852071006
Precision (-) : 0.8275862068965517
Prevalence : 0.6226126814362108
Null Error Rate : 0.37738731856378915
F1 Score : 0.8855421686746988


In [19]:
print('Model Score :',modelbaru.score(df[['Age','Sex','Pclass','SibSp','Parch','Embarked','Fare']],df['Survived']))
print('Accuracy Score :',accuracy_score(df['Survived'],df['Predict']))
print('Recall + :',recall_score(df['Survived'],df['Predict'],pos_label=0))
print('Recall - :',recall_score(df['Survived'],df['Predict'],pos_label=1))
print('Precision + :',precision_score(df['Survived'],df['Predict'],pos_label=0))
print('Precision - :',precision_score(df['Survived'],df['Predict'],pos_label=1))
print('Balanced Accuracy Score :',balanced_accuracy_score(df['Survived'],df['Predict']))
print('F1 Score + :',f1_score(df['Survived'],df['Predict'],pos_label=0))
print('F1 Score - :',f1_score(df['Survived'],df['Predict'],pos_label=1))

Model Score : 0.8548510313216195
Accuracy Score : 0.8548510313216195
Recall + : 0.901840490797546
Recall - : 0.7773279352226721
Precision + : 0.8698224852071006
Precision - : 0.8275862068965517
Balanced Accuracy Score : 0.839584213010109
F1 Score + : 0.8855421686746988
F1 Score - : 0.801670146137787


In [16]:
roc_auc_score(df['Survived'],df['Predict'])

0.8395842130101091

In [17]:
print(classification_report(df['Survived'],df['Predict']))

              precision    recall  f1-score   support

           0       0.87      0.90      0.89       815
           1       0.83      0.78      0.80       494

    accuracy                           0.85      1309
   macro avg       0.85      0.84      0.84      1309
weighted avg       0.85      0.85      0.85      1309

