<a href="https://colab.research.google.com/github/johntanas/it1244project/blob/main/code/model_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RepeatedStratifiedKFold,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score,average_precision_score,SCORERS

In [19]:
# Cross-validation settings consumed by the RepeatedStratifiedKFold splitter below.
n_splits = 5    # folds per repeat
n_repeats = 1   # number of times the whole k-fold split is repeated

In [4]:
# Locations of the two input CSV files.
path_to_max_label = "/content/max_label.csv"
path_to_app = "/content/cleaned_application.csv"

# Read both files, using the first CSV column as the DataFrame index.
cleaned_app_df = pd.read_csv(path_to_app, index_col=0)
max_df = pd.read_csv(path_to_max_label, index_col=0)

In [5]:
# Name of the target column that is split off from the features.
dep_var = "status"
# Seed shared by the CV splitter and the seeded estimators.
random_state = 42

Sort the jobs by their median income to give the job categories an ordering.

In [6]:
# Median income for each job, sorted ascending; indexed by job.
order = (
    cleaned_app_df
    .groupby("job")["income"]
    .median()
    .sort_values()
)

In [7]:
# Swap every job for its median income, using `order` as the job -> value mapping.
# NOTE(review): recent pandas versions may keep object dtype after replacing strings
# with numbers -- confirm the column ends up numeric, since the dtype-based column
# selection below depends on it.
cleaned_app_df["job"] = cleaned_app_df["job"].replace(to_replace=order)

In [8]:
# Inner-join the labels with the application features on "id", then discard the key
# so it is not used as a feature.
train_df = (
    max_df
    .merge(cleaned_app_df, how="inner", on="id")
    .drop(columns="id")
)

In [9]:
# One encoder/scaler per column family; they are wired to columns in the ColumnTransformer.
numerical_transformer = StandardScaler()      # zero-mean / unit-variance scaling
categorical_transformer = OneHotEncoder()     # one indicator column per category
ord_transformer = OrdinalEncoder()            # integer code per distinct value

In [10]:
# Separate the feature frame from the target column.
X = train_df.drop(columns=dep_var)
y = train_df[dep_var]

In [11]:
# Object-dtype columns with fewer than 10 distinct values are treated as categorical.
categorical_cols = [
    col for col in X.columns
    if X[col].nunique() < 10 and X[col].dtype == "object"
]

# int64 / float64 columns are treated as numerical.
# NOTE(review): if "job" was converted to numbers upstream it is selected here too,
# and it is also passed to the dedicated 'ord' transformer -- confirm that encoding
# the same column twice is intended.
numerical_cols = [
    col for col in X.columns
    if X[col].dtype in ("int64", "float64")
]


In [12]:
# Route each column group to its transformer; the outputs are concatenated column-wise.
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols),
    ("ord", ord_transformer, ["job"]),
])

In [13]:
# Fit the column transformer on the full feature frame and encode it in one step.
# NOTE(review): the preprocessor is fitted on all rows before the CV split, so the
# fitted statistics also see the validation folds -- confirm this is acceptable.
xs=preprocessor.fit_transform(X)

In [14]:
# Empty frame that collects one column of per-fold scores per model.
score_df = pd.DataFrame()

We will be doing cross-validation with RepeatedStratifiedKFold, using `n_splits` folds and `n_repeats` repeats (5 folds and 1 repeat as configured above).
We use StratifiedKFold instead of KFold to preserve the percentage of samples for each class in every fold.

In [15]:
# Stratified k-fold splitter; seeded so every model is evaluated on the same folds.
cv = RepeatedStratifiedKFold(
    n_splits=n_splits,
    n_repeats=n_repeats,
    random_state=random_state,
)

In [16]:
# Per-fold scores, one column per model; intended to be turned into a plot later.
score_df = pd.DataFrame()

In [24]:
import keras
import tensorflow as tf
from keras.utils.np_utils import to_categorical
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from sklearn.utils.class_weight import compute_class_weight

In [21]:
def scheduler(epoch, lr, warmup_epochs=10, decay_rate=0.1):
    """Learning-rate schedule: hold the rate, then decay it exponentially.

    Args:
        epoch: Zero-based index of the epoch that is about to start.
        lr: Learning rate currently in use.
        warmup_epochs: Number of initial epochs during which ``lr`` is
            returned unchanged (default 10).
        decay_rate: Per-epoch exponential decay applied afterwards; the rate
            is multiplied by ``exp(-decay_rate)`` each epoch (default 0.1).

    Returns:
        float: The learning rate to use for this epoch.
    """
    # Local import keeps this cell runnable on its own.
    import math

    if epoch < warmup_epochs:
        return lr
    # Return a plain Python float rather than a tf.Tensor: newer Keras versions
    # reject non-float outputs from a LearningRateScheduler schedule.
    return float(lr * math.exp(-decay_rate))
# Optimizer: Adam was preferred over SGD.
opt = tf.optimizers.Adam(learning_rate=0.001)

# Loss: plain binary cross-entropy is what is used here.
# TODO: BinaryFocalCrossentropy was noted as an alternative to try instead.
loss = tf.keras.losses.BinaryCrossentropy()

# Callbacks: stop after 5 epochs without improvement (restoring the best weights)
# and apply the learning-rate schedule defined by `scheduler`.
earlystop = keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)
schedule = tf.keras.callbacks.LearningRateScheduler(scheduler)
callbacks = [schedule, earlystop]
def createModel():
    """Build a new, uncompiled feed-forward binary classifier.

    The network stacks ReLU Dense layers of 1024, 512, 256 and 128 units,
    with Dropout(0.2) after each of the first two, and ends in a single
    sigmoid unit.

    Returns:
        keras.Sequential: a freshly constructed model (not compiled).
    """
    network = keras.Sequential()
    network.add(keras.layers.Dense(1024, activation="relu"))
    network.add(keras.layers.Dropout(0.2))
    network.add(keras.layers.Dense(512, activation="relu"))
    network.add(keras.layers.Dropout(0.2))
    network.add(keras.layers.Dense(256, activation="relu"))
    network.add(keras.layers.Dense(128, activation="relu"))
    # Single sigmoid output: predictions have shape (n_samples, 1).
    network.add(keras.layers.Dense(1, activation="sigmoid"))
    return network

In [None]:
# Cross-validate the neural network: a new model and optimizer per fold, scored by ROC AUC.
bestscore = 0
bestmodel = None
bestmodel_val_data = None   # (y_test, X_test) of the fold on which `bestmodel` was scored
scores = []
for train_index, test_index in cv.split(xs, y):
    X_train = xs[train_index]
    X_test = xs[test_index]
    # cv.split yields positional indices, so index the Series positionally.
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    model = createModel()
    # Clone the configured optimizer for every fold: reusing one optimizer instance
    # would carry its decayed learning rate and slot variables over from the
    # previous fold's model (and is rejected outright by recent Keras versions).
    fold_opt = type(opt).from_config(opt.get_config())

    # Balanced class weights computed from this fold's training labels.
    classes = np.unique(y_train)
    class_w = dict(zip(classes, compute_class_weight("balanced", classes=classes, y=y_train)))

    model.compile(loss=loss, optimizer=fold_opt, metrics=[tf.keras.metrics.AUC(curve='ROC')])
    # NOTE(review): the held-out fold doubles as the early-stopping validation set,
    # so the reported score is optimistic -- confirm this is acceptable.
    history = model.fit(X_train, y_train, batch_size=64, epochs=100,
                        validation_data=(X_test, y_test), verbose=1,
                        callbacks=callbacks, class_weight=class_w)

    # The sigmoid head outputs shape (n_samples, 1); flatten it instead of indexing
    # a non-existent second column.
    y_pred = model.predict(X_test, verbose=0)
    score = roc_auc_score(y_test, np.ravel(y_pred))
    scores.append(score)

    # Keep the best fold's model and its held-out data for the inspection cells below.
    if score > bestscore:
        bestmodel = model
        bestscore = score
        bestmodel_val_data = (y_test, X_test)
        print(f"NN with best score of {score}")
score_df["NN"] = scores
print(f"NN with avg score of {np.array(scores).mean()}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
 66/456 [===>..........................] - ETA: 8s - loss: 0.6795 - auc_8: 0.5769

In [None]:
# Unpack the held-out data stored for the best model (labels first, then features)
# and predict on it.
# NOTE(review): requires `bestmodel` and `bestmodel_val_data` to have been populated
# by the training cell.
held_out_labels, held_out_features = bestmodel_val_data[0], bestmodel_val_data[1]
y_test = held_out_labels
X_test = held_out_features
preds = bestmodel.predict(X_test)

In [None]:
# Show the summary of the selected best model.
bestmodel.summary()

In [None]:
# Display the column-wise minimum and maximum of the predictions as a sanity check.
preds.min(axis=0),preds.max(axis=0)

In [None]:
# Per-fold scores for the models trained on SMOTE-oversampled data (one column per model).
# Created here, at first use; re-running this cell starts the frame afresh.
score_df_smote = pd.DataFrame()

bestscore = 0
bestmodel = None
# k is swept over the odd values 1..11; the original analysis observed the KNN
# ROC AUC dropping off around k=7, so larger k are not explored.
for i in range(1, 12, 2):
    scores = []
    for train_index, test_index in cv.split(xs, y):
        X_train = xs[train_index]
        X_test = xs[test_index]
        # cv.split yields positional indices, so index the Series positionally.
        y_train = y.iloc[train_index]
        y_test = y.iloc[test_index]

        # Oversample the training fold only; seeded so results are reproducible.
        sm = SMOTE(random_state=random_state)
        X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)

        model = KNeighborsClassifier(i)
        model.fit(X_train_oversampled, y_train_oversampled)

        # ROC AUC needs a ranking score, not hard labels: use the probability of the
        # positive (greater-label) class, which is the second predict_proba column.
        y_score = model.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_test, y_score)
        scores.append(score)
        if score > bestscore:
            bestmodel = model
            bestscore = score
            print(f"KNN {i} with best score of {score}")
    score_df_smote["KNN" + str(i)] = scores
    # Report the mean over folds for this k (not the last fold's score).
    print(f"KNN {i} with avg score of {np.array(scores).mean()}")

SMOTE works by first choosing a random example from the minority class. Then the k nearest neighbors of that example are found (k=5 is the default value, which is used here). One of those neighbors is selected at random, and a synthetic example is created at a randomly selected point between the two examples in feature space. Because the synthetic points lie right next to existing minority examples, SMOTE is essentially causing our KNN classifier to overfit.
https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/

In [None]:
# Cross-validated decision tree trained on SMOTE-oversampled training folds.
bestscore = 0
bestmodel = None
scores = []
for train_index, test_index in cv.split(xs, y):
    X_train = xs[train_index]
    X_test = xs[test_index]
    # cv.split yields positional indices, so index the Series positionally.
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    # Oversample the training fold only; seeded so results are reproducible.
    sm = SMOTE(random_state=random_state)
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)

    model = DecisionTreeClassifier(random_state=random_state)
    model.fit(X_train_oversampled, y_train_oversampled)

    # ROC AUC needs a ranking score, not hard labels: use the probability of the
    # positive (greater-label) class, which is the second predict_proba column.
    y_score = model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, y_score)
    scores.append(score)
    if score > bestscore:
        bestmodel = model
        bestscore = score
        print(f"DecisionTreeClassifier with best score of {score}")
score_df_smote["DecisionTreeClassifier"] = scores
print(f"DecisionTreeClassifier with avg score of {np.array(scores).mean()}")

In [None]:
# Cross-validated random forest trained on SMOTE-oversampled training folds.
bestscore = 0
bestmodel = None
scores = []
for train_index, test_index in cv.split(xs, y):
    X_train = xs[train_index]
    X_test = xs[test_index]
    # cv.split yields positional indices, so index the Series positionally.
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    # Oversample the training fold only; seeded so results are reproducible.
    sm = SMOTE(random_state=random_state)
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)

    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train_oversampled, y_train_oversampled)

    # ROC AUC needs a ranking score, not hard labels: use the probability of the
    # positive (greater-label) class, which is the second predict_proba column.
    y_score = model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, y_score)
    scores.append(score)
    if score > bestscore:
        bestmodel = model
        bestscore = score
        print(f"RandomForestClassifier with best score of {score}")
score_df_smote["RandomForestClassifier"] = scores
print(f"RandomForestClassifier with avg score of {np.array(scores).mean()}")

Surprisingly, SMOTE actually worsens the AUC score for the tree-based models but improves it for KNN, suggesting that the original data is quite noisy and that creating synthetic data only adds to the noise.

In [None]:
# Write the SMOTE score table to "score_smote.csv" in the working directory.
score_df_smote.to_csv("score_smote.csv")

Thus, we could also try Tomek links (an under-sampling technique) on KNN and RF.

In [None]:
from imblearn.under_sampling import TomekLinks

In [None]:
# Per-fold scores for the models trained on Tomek-links under-sampled data.
score_df_tl=pd.DataFrame()

In [None]:
# Cross-validated KNN trained on Tomek-links under-sampled training folds.
bestscore = 0
bestmodel = None
# k is swept over the odd values 1..9; the original analysis observed the KNN
# ROC AUC dropping off around k=7.
for i in range(1, 10, 2):
    scores = []
    for train_index, test_index in cv.split(xs, y):
        X_train = xs[train_index]
        X_test = xs[test_index]
        # cv.split yields positional indices, so index the Series positionally.
        y_train = y.iloc[train_index]
        y_test = y.iloc[test_index]

        # Under-sample the training fold only by removing Tomek links.
        tl = TomekLinks()
        X_train_undersampled, y_train_undersampled = tl.fit_resample(X_train, y_train)

        model = KNeighborsClassifier(i)
        model.fit(X_train_undersampled, y_train_undersampled)

        # ROC AUC needs a ranking score, not hard labels: use the probability of the
        # positive (greater-label) class, which is the second predict_proba column.
        y_score = model.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_test, y_score)
        scores.append(score)
        if score > bestscore:
            bestmodel = model
            bestscore = score
            print(f"KNN {i} with best score of {score}")
    score_df_tl["KNN" + str(i)] = scores
    # Report the mean over folds for this k (not the last fold's score).
    print(f"KNN {i} with avg score of {np.array(scores).mean()}")

In [None]:
# Cross-validated random forest trained on Tomek-links under-sampled training folds.
bestscore = 0
bestmodel = None
scores = []
for train_index, test_index in cv.split(xs, y):
    X_train = xs[train_index]
    X_test = xs[test_index]
    # cv.split yields positional indices, so index the Series positionally.
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    # Build the under-sampler in this cell instead of relying on a `tl` object
    # left over from another cell (the original created an unused SMOTE here).
    tl = TomekLinks()
    X_train_undersampled, y_train_undersampled = tl.fit_resample(X_train, y_train)

    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train_undersampled, y_train_undersampled)

    # ROC AUC needs a ranking score, not hard labels: use the probability of the
    # positive (greater-label) class, which is the second predict_proba column.
    y_score = model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, y_score)
    scores.append(score)
    if score > bestscore:
        bestmodel = model
        bestscore = score
        print(f"RandomForestClassifier with best score of {score}")
score_df_tl["RandomForestClassifier"] = scores
print(f"RandomForestClassifier with avg score of {np.array(scores).mean()}")

In [None]:
# Write the Tomek-links score table to "score_tl.csv" in the working directory.
score_df_tl.to_csv("score_tl.csv")