<a href="https://colab.research.google.com/github/johntanas/it1244project/blob/main/code/model_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [82]:
path_to_max_label="/content/max_label.csv"
path_to_app="/content/cleaned_application_no_drop.csv"
cleaned_app_df=pd.read_csv(path_to_app,index_col=0)
max_df=pd.read_csv(path_to_max_label,index_col=0)

In [83]:
dep_var="status"
random_state=42

Sort by median income of job to give a ordering to the jobs

In [86]:
order=cleaned_app_df.groupby("job")["income"].median().sort_values()

In [87]:
cleaned_app_df["job"]=cleaned_app_df["job"].replace(order)

In [9]:
train_df=max_df.merge(cleaned_app_df,how="inner",on="id").drop("id",axis=1)

In [88]:
numerical_transformer = StandardScaler()
categorical_transformer=OneHotEncoder()
ord_transformer=OrdinalEncoder()

In [89]:
X,y=train_df.drop(columns=dep_var),train_df[dep_var]

In [90]:
categorical_cols = [cname for cname in X.columns if X[cname].nunique() < 10 and 
                        X[cname].dtype == "object"]
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]


In [91]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        ('ord',ord_transformer,['job'])
    ])

In [93]:
xs=preprocessor.fit_transform(X)

In [94]:
X_train, X_test, y_train, y_test = train_test_split(xs, y,random_state=random_state)

In [95]:
classifiers = {
    "LogisticRegression" : LogisticRegression(max_iter=1000,random_state=random_state),
    "KNeighbors" : KNeighborsClassifier(3),
    "SVC" : SVC(random_state=random_state),
    "DecisionTree" : DecisionTreeClassifier(random_state=random_state),
    "RandomForest" : RandomForestClassifier(random_state=random_state),
    "XGBoost" : XGBClassifier(random_state=random_state)
}

In [96]:
for key, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    prediction = classifier.predict(X_test)
    print("Model "+key)
    print(classification_report(y_test, prediction,zero_division=0))

Model LogisticRegression
              precision    recall  f1-score   support

           0       0.88      1.00      0.94      8045
           1       0.00      0.00      0.00      1070

    accuracy                           0.88      9115
   macro avg       0.44      0.50      0.47      9115
weighted avg       0.78      0.88      0.83      9115

Model KNeighbors
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      8045
           1       0.41      0.28      0.34      1070

    accuracy                           0.87      9115
   macro avg       0.66      0.61      0.63      9115
weighted avg       0.85      0.87      0.86      9115

Model SVC
              precision    recall  f1-score   support

           0       0.88      1.00      0.94      8045
           1       0.00      0.00      0.00      1070

    accuracy                           0.88      9115
   macro avg       0.44      0.50      0.47      9115
weighted avg       0.7

In [97]:
import keras
import tensorflow as tf
from keras.utils.np_utils import to_categorical
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten


In [98]:
model=keras.Sequential([
keras.layers.Dense(1024),
keras.layers.Activation("sigmoid"),
keras.layers.Dense(512),
keras.layers.Activation(activation="relu"),
keras.layers.Dense(2),
keras.layers.Activation(activation="softmax")])

In [99]:
one_hot_y=pd.get_dummies(y_train)
one_hot_valid_y=pd.get_dummies(y_test)

In [101]:
opt = tf.optimizers.SGD(learning_rate=0.01)
model.compile(loss=tf.keras.losses.BinaryFocalCrossentropy(), optimizer=opt, metrics=['accuracy',tf.keras.metrics.Recall()])
epochs = 10
history=model.fit(X_train, one_hot_y,batch_size=32, epochs=epochs,validation_data=(X_test,one_hot_valid_y))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: ignored

In [None]:
y_prob = model.predict(X_test)
prediction = y_prob.argmax(axis=-1)
print("Model NN")
print(classification_report(y_test,prediction,zero_division=0))

In [102]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X_balanced, y_balanced = oversample.fit_resample(X_train, y_train)

In [103]:
for key, classifier in classifiers.items():
    classifier.fit(X_balanced, y_balanced)
    prediction = classifier.predict(X_test)
    print("Model "+key)
    print(classification_report(y_test, prediction))

Model LogisticRegression
              precision    recall  f1-score   support

           0       0.90      0.54      0.67      8045
           1       0.14      0.54      0.22      1070

    accuracy                           0.54      9115
   macro avg       0.52      0.54      0.45      9115
weighted avg       0.81      0.54      0.62      9115

Model KNeighbors
              precision    recall  f1-score   support

           0       0.92      0.88      0.90      8045
           1       0.33      0.45      0.38      1070

    accuracy                           0.83      9115
   macro avg       0.63      0.66      0.64      9115
weighted avg       0.85      0.83      0.84      9115

Model SVC
              precision    recall  f1-score   support

           0       0.90      0.59      0.72      8045
           1       0.15      0.52      0.23      1070

    accuracy                           0.59      9115
   macro avg       0.52      0.56      0.47      9115
weighted avg       0.8

In [104]:
one_hot_y_balanced=pd.get_dummies(y_balanced)

In [107]:
model2=keras.Sequential([
keras.layers.Dense(1024),
keras.layers.Activation("sigmoid"),
keras.layers.Dense(512),
keras.layers.Activation(activation="relu"),
keras.layers.Dense(2),
keras.layers.Activation(activation="softmax")])

In [None]:
opt = tf.optimizers.SGD(learning_rate=0.01)
model2.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=opt, metrics=['accuracy',tf.keras.metrics.Recall()])
epochs = 10
history=model2.fit(X_balanced,one_hot_y_balanced,batch_size=32, epochs=epochs,validation_data=(X_test,one_hot_valid_y))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

In [None]:
y_prob=model2.predict(X_test)
prediction = y_prob.argmax(axis=-1)
print("Model Neural Net")
print(classification_report(y_test,prediction,zero_division=0))