In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Note for KY — Jon and Zhiyong, you can get the context here too
Once the dataset is fully cleaned, let's test it against different types of k-means clustering.
- Potential suggestion: explore the dataset to see whether any form of PCA (dimensionality reduction) is needed.

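Compute the WSS (within-cluster sum of squares) for a range of k values, so that we're able to cluster the data accordingly. Note that WSS keeps decreasing as k increases, so the k with the lowest WSS is not automatically the best choice.
After that, we can use the elbow method — picking the k beyond which WSS stops dropping sharply — to find the best k value for the dataset and cluster the data accordingly.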

After that, we manually analyse the clustered data, and from there we can further group the data into two groups.

In [None]:
# Load the application data. (pandas has no `read_cv`; the reader is `read_csv`.)
app = pd.read_csv("application.csv")

# Check if the data is balanced or not; if it is not, we will have to do something about it.
# 'insert_column_here' is a placeholder -- replace it with the target column name.
app['insert_column_here'].value_counts()

In [None]:
# Separate the target column from the feature columns.
# 'insert_column_here' is a placeholder for the target column name.
y = app['insert_column_here']
X = app.drop(columns='insert_column_here')

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): if the classes turn out to be imbalanced, consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [None]:
# Shared helper applied to each model, so the fit/predict step is written once.
def result(X_train, X_test, y_train, model):
    """Fit ``model`` on the training data and score the test rows.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed straight to ``model.fit``.
    X_test
        Features to score, passed straight to ``model.predict_proba``.
    model
        Any object exposing ``fit(X, y)`` and ``predict_proba(X)``.
        It is fitted in place.

    Returns
    -------
    Column 1 of the ``predict_proba`` output (for a binary classifier this
    is presumably the positive-class probability -- confirm against
    ``model.classes_``).
    """
    model.fit(X_train, y_train)
    class_probabilities = model.predict_proba(X_test)
    return class_probabilities[:, 1]
    

In [None]:
# Worked example: baseline Logistic Regression
from sklearn.linear_model import LogisticRegression

# Default hyperparameters for the baseline.
log_reg = LogisticRegression()
# Fit on the training split and keep the column-1 probabilities for X_test.
log_reg_result = result(X_train, X_test, y_train, model=log_reg)

In [None]:
# After we have the result for the few models we're using, we can optimise the hyperparameters.
# For example, we can use GridSearchCV to optimise the hyperparameters for Logistic Regression.
# How do you find the parameters? Read the scikit-learn documentation for the estimator.

from sklearn.model_selection import GridSearchCV

# THIS IS AN EXAMPLE. ADD HOW YOU DEEM FIT.
# Values shared by every sub-grid below.
common_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'max_iter': [100, 1000, 2500, 5000],
}

# Not every solver supports every penalty. A single flat grid would cross all
# penalties with all solvers, and the invalid combinations fail to fit and are
# scored as NaN. Splitting the grid keeps only valid solver/penalty pairs.
param_grid = [
    # These solvers only support the l2 penalty.
    {**common_grid, 'solver': ['newton-cg', 'lbfgs', 'sag'], 'penalty': ['l2']},
    # liblinear and saga support both l1 and l2.
    {**common_grid, 'solver': ['liblinear', 'saga'], 'penalty': ['l1', 'l2']},
    # elasticnet is only supported by saga and needs an explicit l1_ratio
    # (the ratios here are example values -- adjust as needed).
    {
        **common_grid,
        'solver': ['saga'],
        'penalty': ['elasticnet'],
        'l1_ratio': [0.25, 0.5, 0.75],
    },
]

# Select hyperparameters by ROC AUC instead of the default accuracy, which is
# misleading when the classes are imbalanced.
clf = GridSearchCV(log_reg, param_grid=param_grid, scoring='roc_auc')

# After that, run result() again with the search object; it refits the best
# estimator on the full training split and predicts with it.
y_pred = result(X_train, X_test, y_train, clf)


After this, we use the ROC AUC score if the dataset is imbalanced. We try not to use accuracy, because it can look high on an imbalanced dataset even when the minority class is predicted poorly.

There are also other scores, such as precision and recall, which are usable as well.

However, let's look at the formulas for precision and recall:

$$\text{Precision} = \frac{TP}{TP + FP}$$

$$\text{Recall} = \frac{TP}{TP + FN}$$

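The cost of a false positive is higher than the cost of a false negative: we can afford to deny a loan to a person who is not a defaulter, but we cannot afford to give a loan to a person who is a defaulter. (This assumes the positive class means "approve the loan", so a false positive is a loan given to a defaulter. If the positive class is "defaulter" instead, the roles of FP and FN — and of precision and recall — are swapped.)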

Therefore we should prioritise minimising FP over minimising FN, which means that the higher the precision, the better.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Insert your model here. It must already be fitted and expose predict_proba.
# Defaults to `clf`, the grid-searched logistic regression fitted earlier in
# this notebook, so the cell runs on Restart & Run All instead of failing on
# `None.predict_proba`. Swap in any other fitted classifier as needed.
model = clf

# Column 1 of predict_proba, i.e. the probability of the second class in
# model.classes_ (presumably the positive class -- confirm for your target).
y_pred = model.predict_proba(X_test)[:, 1]

In [None]:
def _confusion_counts(threshold):
    """Return (threshold, tp, fp, fn, tn) for y_pred cut at ``threshold``.

    Reads the notebook-level ``y_test`` (labels compared against 1 and 0)
    and ``y_pred`` (scores; a row counts as predicted positive when its
    score is >= threshold).
    """
    actual_pos = (y_test == 1)
    actual_neg = (y_test == 0)

    flagged = (y_pred >= threshold)
    cleared = (y_pred < threshold)

    return (
        threshold,
        (actual_pos & flagged).sum(),   # true positives
        (flagged & actual_neg).sum(),   # false positives
        (cleared & actual_pos).sum(),   # false negatives
        (actual_neg & cleared).sum(),   # true negatives
    )


# Sweep 101 evenly spaced thresholds from 0 to 1 and tabulate the counts.
thresholds = np.linspace(0,1,101)
scores = [_confusion_counts(t) for t in thresholds]

columns = ['threshold', 'tp', 'fp', 'fn', 'tn']
scores_data = pd.DataFrame(scores, columns = columns)

In [None]:
# True-positive rate = TP / (TP + FN); false-positive rate = FP / (FP + TN).
scores_data['tpr'] = scores_data['tp'].div(scores_data['tp'] + scores_data['fn'])
scores_data['fpr'] = scores_data['fp'].div(scores_data['fp'] + scores_data['tn'])

# Draw both rates against the decision threshold on the same axes.
for rate in ('tpr', 'fpr'):
    plt.plot(scores_data['threshold'], scores_data[rate], label = rate)
plt.legend()

In [None]:
# TODO: make conclusions on the graph below.

# Precision = TP / (TP + FP); recall = TP / (TP + FN), one value per threshold.
# Precision is NaN at thresholds where no row is predicted positive (0 / 0).
scores_data['precision'] = scores_data['tp'].div(scores_data['tp'] + scores_data['fp'])
scores_data['recall'] = scores_data['tp'].div(scores_data['tp'] + scores_data['fn'])

# Draw both metrics against the decision threshold on the same axes.
for metric in ('precision', 'recall'):
    plt.plot(scores_data['threshold'], scores_data[metric], label = metric)
plt.legend()