<a href="https://colab.research.google.com/github/johntanas/it1244project/blob/main/code/model_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [92]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split,RepeatedStratifiedKFold,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [93]:
# Input CSV locations (Colab session storage).
path_to_app = "/content/cleaned_application.csv"
path_to_max_label = "/content/max_label.csv"

# Load both tables, using the first CSV column as the row index.
cleaned_app_df = pd.read_csv(path_to_app, index_col=0)
max_df = pd.read_csv(path_to_max_label, index_col=0)

In [94]:
# Seed shared by every estimator and CV splitter that accepts one.
random_state = 42
# Name of the target (label) column.
dep_var = "status"

Sort by the median income of each job to give an ordering to the jobs

In [95]:
# Median income per job, sorted ascending (lowest-paid job first).
order = (
    cleaned_app_df
    .groupby("job")["income"]
    .median()
    .sort_values()
)

In [96]:
# Use `order` (indexed by job, valued by median income) as the replacement
# lookup, so each job label is swapped for its group's median income.
cleaned_app_df["job"] = cleaned_app_df["job"].replace(to_replace=order)

In [97]:
# Inner-join labels with application features on "id", then discard the key
# so it is not used as a feature.
train_df = (
    max_df
    .merge(cleaned_app_df, how="inner", on="id")
    .drop(columns="id")
)

In [98]:
# Unfitted preprocessing steps, one per column group.
ord_transformer = OrdinalEncoder()
categorical_transformer = OneHotEncoder()
numerical_transformer = StandardScaler()

In [99]:
# Target column, and every remaining column as the feature frame.
y = train_df[dep_var]
X = train_df.drop(columns=dep_var)

In [100]:
# Text columns with fewer than 10 distinct values are one-hot encoded;
# text columns with 10 or more distinct values are selected by neither list.
categorical_cols = [
    name
    for name, col_dtype in X.dtypes.items()
    if col_dtype == "object" and X[name].nunique() < 10
]
# int64 / float64 columns are standard-scaled.
numerical_cols = [
    name
    for name, col_dtype in X.dtypes.items()
    if col_dtype in ("int64", "float64")
]


In [101]:
# Route each column group to its transformer.
# NOTE(review): if "job" is numeric at this point it is also listed in
# numerical_cols, so it would be fed to both 'num' and 'ord' — confirm intended.
column_steps = [
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols),
    ("ord", ord_transformer, ["job"]),
]
preprocessor = ColumnTransformer(transformers=column_steps)

In [102]:
# Fit the preprocessing on the full feature frame and keep the encoded matrix.
# NOTE(review): the transformers are fitted on all rows before cross-validation,
# so held-out folds contribute to the fitted statistics.
xs = preprocessor.fit_transform(X)

In [103]:
# Empty frame that will collect one column of CV scores per model.
score_df = pd.DataFrame(data=None)

We will be doing repeated stratified k-fold cross-validation with k = 10 folds and 5 repeats

In [104]:
# Cross-validation layout: folds per repeat, and number of repeats.
n_splits = 10
n_repeats = 5

In [105]:
# Seeded, class-stratified splitter shared by the cross_val_score calls.
cv = RepeatedStratifiedKFold(
    n_splits=n_splits,
    n_repeats=n_repeats,
    random_state=random_state,
)

In [106]:
# Per-model CV scores, one column per model; converted into a plot later.
score_df = pd.DataFrame(data=None)


* AUC is classification-threshold-invariant. It measures the quality of the model's predictions irrespective of what classification threshold is chosen. 
https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc
* Thus we choose AUC, as the banks that use the models can determine their own risk-probability threshold for bad users, calculate their expected value/loss for these bad users, and give them an appropriate interest rate when extending credit to them


In [None]:
# Baseline: logistic regression scored by ROC AUC over the shared CV splits.
classifier = LogisticRegression(max_iter=1000, random_state=random_state)
scores = cross_val_score(
    estimator=classifier,
    X=xs,
    y=y,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
)
print(scores)
print(np.mean(scores))
score_df["LogisticRegression"] = scores

There is very little linearity in the imbalanced dataset for the linear dividing line of logistic regression to work on. This can be seen from the low level of correlation in the table below, where all variables have a correlation between -0.03 and 0.03

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Heatmap of each numeric feature's correlation with the target.
fig, ax = plt.subplots(figsize=(12, 5))
# Correlate numeric/boolean columns only: on pandas >= 2.0 DataFrame.corr()
# raises when the frame still contains text columns, whereas older versions
# dropped them silently. Selecting dtypes first behaves the same everywhere.
numeric_df = train_df.select_dtypes(include=["number", "bool"])
status_correlation = numeric_df.corr()[["status"]]
# Draw on the explicit Axes so title and tick rotation cannot land elsewhere.
sns.heatmap(status_correlation, annot=True, cmap='RdPu', ax=ax)
ax.set_title('Correlation between the variables')
plt.setp(ax.get_xticklabels(), rotation=45);

In [None]:
# Support vector classifier (default settings) on the shared CV splits.
classifier = SVC()
scores = cross_val_score(
    estimator=classifier,
    X=xs,
    y=y,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
)
print(scores)
print(np.mean(scores))
score_df["SVC"] = scores

K-nearest neighbours

In [None]:
# Sweep the neighbour count K from 1 to 9, storing one score column per K.
for k in range(1, 10):
    label = "KNN" + str(k)
    classifier = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(
        estimator=classifier,
        X=xs,
        y=y,
        scoring="roc_auc",
        cv=cv,
        n_jobs=-1,
    )
    score_df[label] = scores
    print(label + "score")
    print(np.mean(scores))

KNN with a small K (3) is better able to detect outliers in an imbalanced dataset

In [None]:
# Single seeded decision tree on the shared CV splits.
classifier = DecisionTreeClassifier(random_state=random_state)
scores = cross_val_score(
    estimator=classifier,
    X=xs,
    y=y,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
)
print(scores)
print(np.mean(scores))
score_df["DecisionTreeClassifier"] = scores

In [None]:
# Seeded random forest on the shared CV splits.
classifier = RandomForestClassifier(random_state=random_state)
scores = cross_val_score(
    estimator=classifier,
    X=xs,
    y=y,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
)
print(scores)
print(np.mean(scores))
score_df["RandomForestClassifier"] = scores

Random forests are an ensemble of decision trees and typically do better than decision trees

In [None]:
# Gradient-boosted trees, scored by ROC AUC over the shared CV splits.
classifier=XGBClassifier(random_state=random_state)
scores = cross_val_score(classifier, xs, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# Record the fold scores like every other model; without this line XGBoost
# is missing from score_df and therefore from any comparison built on it.
score_df["XGBClassifier"]=scores

In [None]:
import keras
import tensorflow as tf
from keras.utils.np_utils import to_categorical
from keras import optimizers
from keras.models import Sequential
import keras.layers as layers

In [None]:
#https://stackoverflow.com/questions/61622760/sklearn-model-selection-cross-val-score-in-ann-regression
def create_network():
    """Build and compile the feed-forward binary classifier.

    Architecture: Dense(256, relu) -> Dense(32, relu) -> Dense(1, sigmoid),
    trained with SGD (learning rate 0.01) and binary focal cross-entropy.

    Returns:
        A compiled ``keras`` Sequential model. The input size is not fixed
        here; the layers are built on the first batch they receive.
    """
    model = keras.models.Sequential()
    model.add(layers.Dense(256, activation='relu'))
    model.add(layers.Dense(32, activation='relu'))
    # One sigmoid unit: BinaryFocalCrossentropy defaults to from_logits=False,
    # so it expects a probability per sample. The previous Dense(2) with no
    # activation emitted two unbounded values against a single binary label.
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer=tf.optimizers.SGD(learning_rate=0.01),
                  loss=tf.keras.losses.BinaryFocalCrossentropy())
    return model

from keras.wrappers.scikit_learn import KerasClassifier
# scikit-learn compatible wrapper: builds a fresh network via create_network
# and trains it for 10 epochs in batches of 32 with progress output.
classifier = KerasClassifier(
    build_fn=create_network,
    epochs=10,
    batch_size=32,
    verbose=1,
)


In [None]:
#scores = cross_val_score(classifier, xs, y, scoring='roc_auc', cv=cv, n_jobs=-1)

In [None]:
# Show the per-fold scores of the most recent evaluation, then their mean.
print(scores)
print(np.mean(scores))

SMOTE works by selecting examples that are close in the feature space, drawing a line between the examples in the feature space and drawing a new sample at a point along that line.
https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
https://arxiv.org/abs/1106.1813

In [None]:
# Per-model scores for the SMOTE-augmented runs, one column per model.
score_df_smote = pd.DataFrame(data=None)

In [None]:
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score

In [None]:
#https://stackoverflow.com/questions/55591063/how-to-perform-smote-with-cross-validation-in-sklearn-in-python
# Logistic regression with SMOTE applied to the training part of each fold
# only, so the held-out part keeps the original class balance.
#
# Changes from the previous version of this cell:
# * scores start as a list (np.array() with no argument raises TypeError);
# * splits come from the shared repeated stratified `cv`, because an
#   unshuffled KFold yields the same splits on every repeat and is not
#   stratified on this imbalanced target;
# * AUC is computed from positive-class probabilities rather than hard labels,
#   matching what scoring='roc_auc' does in cross_val_score;
# * SMOTE is seeded and y is indexed positionally with .iloc.
fold_scores = []
for train_index, test_index in cv.split(X, y):
    X_train, X_test = xs[train_index], xs[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    sm = SMOTE(random_state=random_state)
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)
    model = LogisticRegression(max_iter=1000,random_state=random_state)
    model.fit(X_train_oversampled, y_train_oversampled)
    # Column 1 holds the probability of the larger class label.
    y_score = model.predict_proba(X_test)[:, 1]
    fold_scores.append(roc_auc_score(y_test, y_score))
scores = np.array(fold_scores)

In [None]:
# Mean ROC AUC across all folds of the run above.
print(np.mean(scores))

In [None]:
# Keep the SMOTE logistic-regression fold scores alongside the other models.
score_df_smote["LogisticRegression"] = scores

SMOTE does nothing to help improve the linearity of the dataset for logistic regression to work

In [None]:
# K-nearest neighbours with SMOTE applied to the training part of each fold,
# for K = 1..9. One score column is stored per K.
#
# Changes from the previous version of this cell:
# * the fold body is indented under its `for` (it was an IndentationError);
# * `scores` is reset for every K, so each stored column has one entry per
#   fold and the printed mean describes that K alone;
# * splits come from the shared repeated stratified `cv` (an unshuffled KFold
#   yields the same splits on every repeat);
# * AUC is computed from positive-class probabilities rather than hard labels;
# * SMOTE is seeded and y is indexed positionally with .iloc.
for i in range(1,10):
    scores=[]
    for train_index, test_index in cv.split(X, y):
        X_train, X_test = xs[train_index], xs[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        sm = SMOTE(random_state=random_state)
        X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)
        model = KNeighborsClassifier(i)
        model.fit(X_train_oversampled, y_train_oversampled)
        # Column 1 holds the probability of the larger class label.
        y_score = model.predict_proba(X_test)[:, 1]
        scores.append(roc_auc_score(y_test,y_score))
    score_df_smote["KNN"+str(i)]=scores
    print("KNN"+str(i)+"score")
    print(np.array(scores).mean())

In [None]:
# Decision tree with SMOTE applied to the training part of each fold only.
#
# Changes from the previous version of this cell:
# * splits come from the shared repeated stratified `cv`, because an
#   unshuffled KFold yields the same splits on every repeat and is not
#   stratified on this imbalanced target;
# * AUC is computed from positive-class probabilities rather than hard labels,
#   matching what scoring='roc_auc' does in cross_val_score;
# * SMOTE is seeded and y is indexed positionally with .iloc.
scores=[]
for train_index, test_index in cv.split(X, y):
    X_train, X_test = xs[train_index], xs[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    sm = SMOTE(random_state=random_state)
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)
    model = DecisionTreeClassifier(random_state=random_state)
    model.fit(X_train_oversampled, y_train_oversampled)
    # Column 1 holds the probability of the larger class label.
    y_score = model.predict_proba(X_test)[:, 1]
    scores.append(roc_auc_score(y_test,y_score))
score_df_smote["DecisionTreeClassifier"]=scores

In [None]:
# Random forest with SMOTE applied to the training part of each fold only.
#
# Changes from the previous version of this cell:
# * splits come from the shared repeated stratified `cv` instead of the
#   hard-coded 5 x KFold(10); the unshuffled KFold yielded the same splits on
#   every repeat and ignored the configured n_repeats / n_splits;
# * AUC is computed from positive-class probabilities rather than hard labels,
#   matching what scoring='roc_auc' does in cross_val_score;
# * SMOTE is seeded and y is indexed positionally with .iloc.
scores=[]
for train_index, test_index in cv.split(X, y):
    X_train, X_test = xs[train_index], xs[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    sm = SMOTE(random_state=random_state)
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train_oversampled, y_train_oversampled)
    # Column 1 holds the probability of the larger class label.
    y_score = model.predict_proba(X_test)[:, 1]
    scores.append(roc_auc_score(y_test,y_score))
score_df_smote["RandomForestClassifier"]=scores