# Importing libraries

In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder

from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

import tensorflow
tensorflow.autograph.set_verbosity(0)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Recall, AUC, Precision
import tensorflow_addons as tfa

# Reading file and tidying

In [2]:
df = pd.read_csv("../dataset/loanprediction.csv")
df.drop("Id", axis=1, inplace=True)
df.columns = ["income", "age", "experience", "marital_status", "house_ownership", "car_ownership", "profession", "city", "state", "current_job_years", "current_house_years", "risk_flag"]
numerical_cols = ["income", "age", "experience", "current_job_years", "current_house_years"]

# Train test split

In [3]:
y = df["risk_flag"]
x = df.drop("risk_flag", axis=1)
x.head()
print(len(x.columns))

11


In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 2021)

# Data preprocessing - Encoding categorical columns and scaling numerical columns

In [5]:
def data_preprocessing(df_x, df_y):
    
    # Label encoding categorical columns with 2 types of categories
    x = df_x.copy()
    label_enc = LabelEncoder()
    x["marital_status"] = label_enc.fit_transform(x["marital_status"])
    x["car_ownership"] = label_enc.fit_transform(x["car_ownership"])
    
    # One Hot Encoding house_ownership column & Combining back to dataframe
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    
    house_ownership_values = onehot_encoder.fit_transform( x[['house_ownership']] )
    house_ownership_labels = np.array(["norent_noown", "owned", "rented"]).ravel()
    house_ownership_df = pd.DataFrame(house_ownership_values, columns=house_ownership_labels)

    x.reset_index(drop=True, inplace=True)
    house_ownership_df.reset_index(drop=True, inplace=True)
    x = pd.concat([ x, house_ownership_df], axis=1)
    
    x.drop("house_ownership", axis=1, inplace=True)
    
    # Target Encoding the high cardinality categorical columns: profession, city, state
    # https://medium.com/analytics-vidhya/target-encoding-vs-one-hot-encoding-with-simple-examples-276a7e7b3e64
    profession_target_enc = TargetEncoder()
    x["profession_encoded"] = profession_target_enc.fit_transform(x["profession"], df_y)
    city_target_enc = TargetEncoder()
    x["city_encoded"] = city_target_enc.fit_transform(x["city"], df_y)
    state_target_enc = TargetEncoder()
    x["state_encoded"] = state_target_enc.fit_transform(x["state"], df_y)
    x.drop("profession", axis=1, inplace=True)
    x.drop("city", axis=1, inplace=True)
    x.drop("state", axis=1, inplace=True)
    
    # https://stackoverflow.com/questions/51237635/difference-between-standard-scaler-and-minmaxscaler
    # https://www.geeksforgeeks.org/standardscaler-minmaxscaler-and-robustscaler-techniques-ml/
    min_max_scaler = MinMaxScaler()
    x[numerical_cols] = min_max_scaler.fit_transform(x[numerical_cols])
    # need to scale the encoded columns?
    
    return x

In [6]:
x_train = data_preprocessing(x_train, y_train)
x_test = data_preprocessing(x_test, y_test)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [13]:
def run_variation_model(x_train, y_train, x_test, y_test, variation):
    
    # Feature columns
    input_shape = (13,)
    
    if "pca" in variation:
        pca = PCA(0.9, random_state=2021)
        x_train = pca.fit_transform(x_train)
        x_test = pca.fit_transform(x_test)
        
        new_features_len = len(x_train[0])
        input_shape = (new_features_len,)       
        
    if "chi_square" in variation:
        chi_scores = chi2(x_train, y_train)
        p_values = pd.Series(chi_scores[1], index = x_train.columns)
        p_values.sort_values(ascending = False , inplace = True)
#         p_values.plot.bar()
        x_train = x_train.drop(["profession_encoded", "state_encoded", "city_encoded", "income"], axis=1)
        x_test = x_test.drop(["profession_encoded", "state_encoded", "city_encoded", "income"], axis=1)
        
        new_features_len = len(x_train.columns)
        input_shape = (9,)
        
    if "smote" in variation:
        oversampler = SMOTE(random_state=2021)
        x_train, y_train = oversampler.fit_resample(x_train, y_train)        
        
    # Changes shape to 3D
    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)    
    
    # Create the model
    model = Sequential()
    model.add(Dense(45, input_shape=input_shape, activation='relu'))
    model.add(Dense(25, activation='relu'))
    model.add(Dense(12, activation='relu'))
    model.add(Dense(6, activation='relu'))
    model.add(Flatten())
    model.add(Dense(2, activation='softmax'))

    # Configure the model and start training
    model.compile(loss='binary_crossentropy', 
                  optimizer='adam', 
                  metrics=['accuracy', AUC(), Precision(), Recall(), tfa.metrics.FBetaScore(num_classes=2, beta=2.0)])
    model_fit = model.fit(x_train, y_train, epochs=1, batch_size=256, verbose=1, validation_split=0.2)
    test_results = model.evaluate(x_test, y_test, verbose=1)
    
    return f"-------------------------TEST SCORES for {variation}-----------------------\nRecall: {test_results[4]}\nPrecision: {test_results[3]}\nF2-Score: {test_results[5][0]}\nAccuracy: {test_results[1]}\nAUC Score: {test_results[2]}\n"

In [14]:
variations = ["base", "smote", "chi_square", "pca", "smote, chi_square", "smote, pca"]
results = ""

for variation in variations:
    result = run_variation_model(x_train, y_train, x_test, y_test, variation)
    results += result

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export A

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export A

In [15]:
print(results)

-------------------------TEST SCORES for base-----------------------
Recall: 0.8770039677619934
Precision: 0.8770039677619934
F2-Score: 0.972716212272644
Accuracy: 0.8770039677619934
AUC Score: 0.888079047203064
-------------------------TEST SCORES for smote-----------------------
Recall: 0.8770039677619934
Precision: 0.8770039677619934
F2-Score: 0.972716212272644
Accuracy: 0.8770039677619934
AUC Score: 0.8904247283935547
-------------------------TEST SCORES for chi_square-----------------------
Recall: 0.8770039677619934
Precision: 0.8770039677619934
F2-Score: 0.972716212272644
Accuracy: 0.8770039677619934
AUC Score: 0.8860371708869934
-------------------------TEST SCORES for pca-----------------------
Recall: 0.8770039677619934
Precision: 0.8770039677619934
F2-Score: 0.972716212272644
Accuracy: 0.8770039677619934
AUC Score: 0.8881022930145264
-------------------------TEST SCORES for smote, chi_square-----------------------
Recall: 0.8770039677619934
Precision: 0.8770039677619934
F2-S