In [2]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [3]:
#import dependencies
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

pd.set_option('display.max_columns', None)

In [7]:
#load in the data
application_df = pd.read_csv('application_record.csv')
credit_record_df = pd.read_csv("credit_record.csv")

In [8]:
application_df.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [9]:
credit_record_df.head()

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [10]:
credit_df = pd.merge(application_df, credit_record_df, on='ID', how='inner')

In [9]:
# 0: 1-29 days past due
# 1: 30-59 days past due
# 2: 60-89 days overdue
# 3: 90-119 days overdue
# 4: 120-149 days overdue
# 5: Overdue or bad debts, write-offs for more than 150 days
# C: paid off that month
# X: No loan for the month

In [12]:
credit_df['target'] = credit_df['STATUS'].apply(lambda x: 0 if x in ['0','1','2', '3', '4', '5'] else 1)

In [13]:
credit_df.drop(columns=['ID',"STATUS"], inplace=True)
credit_df.head()

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,MONTHS_BALANCE,target
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,0,1
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,-1,1
2,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,-2,1
3,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,-3,1
4,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,-4,1


begin model training

In [14]:
df = credit_df.copy()

df = pd.get_dummies(df, columns=['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE'], drop_first=True)

df.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,MONTHS_BALANCE,target,CODE_GENDER_M,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_Y,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Lower secondary,NAME_EDUCATION_TYPE_Secondary / secondary special,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Cleaning staff,OCCUPATION_TYPE_Cooking staff,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_HR staff,OCCUPATION_TYPE_High skill tech staff,OCCUPATION_TYPE_IT staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff
0,0,427500.0,-12005,-4542,1,1,0,0,2.0,0,1,True,True,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,0,427500.0,-12005,-4542,1,1,0,0,2.0,-1,1,True,True,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,0,427500.0,-12005,-4542,1,1,0,0,2.0,-2,1,True,True,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,0,427500.0,-12005,-4542,1,1,0,0,2.0,-3,1,True,True,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,0,427500.0,-12005,-4542,1,1,0,0,2.0,-4,1,True,True,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [15]:
y = df['target']

X = df.drop(columns=['target'])

In [16]:
y.head()

Unnamed: 0,target
0,1
1,1
2,1
3,1
4,1


In [17]:
X.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,MONTHS_BALANCE,CODE_GENDER_M,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_Y,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Lower secondary,NAME_EDUCATION_TYPE_Secondary / secondary special,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Cleaning staff,OCCUPATION_TYPE_Cooking staff,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_HR staff,OCCUPATION_TYPE_High skill tech staff,OCCUPATION_TYPE_IT staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff
0,0,427500.0,-12005,-4542,1,1,0,0,2.0,0,True,True,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,0,427500.0,-12005,-4542,1,1,0,0,2.0,-1,True,True,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,0,427500.0,-12005,-4542,1,1,0,0,2.0,-2,True,True,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,0,427500.0,-12005,-4542,1,1,0,0,2.0,-3,True,True,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,0,427500.0,-12005,-4542,1,1,0,0,2.0,-4,True,True,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


train_test_split

In [18]:
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

Linear Regression Model

In [None]:
LogRegModel = LogisticRegression(random_state=1)

LogRegModel.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
test_predictions = LogRegModel.predict(x_test)

In [None]:
LogRegCM = confusion_matrix(y_test, test_predictions)

df_LogRegDM = pd.DataFrame(
    LogRegCM,
    columns=['Predicted 0', 'Predicted 1'],
    index=['Actual 0', 'Actual 1']
)

df_LogRegDM.head()

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,62369,13188
Actual 1,9494,109378


In [None]:
LogRegCR = classification_report(y_test, test_predictions)

print('Confusion Matrix')
display(df_LogRegDM)
print('Classification Report')
print(LogRegCR)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,62369,13188
Actual 1,9494,109378


Classification Report
              precision    recall  f1-score   support

           0       0.87      0.83      0.85     75557
           1       0.89      0.92      0.91    118872

    accuracy                           0.88    194429
   macro avg       0.88      0.87      0.88    194429
weighted avg       0.88      0.88      0.88    194429



The logistic regression model does a good job predicting both 0 (bad) consumers and 1 (good) consumers.

Good Consumers (1): The model is 83% accurate here.
Bad Consumers (0): The model is 92% accurate here

The model correctly identifies 92% of all good consumers. Out of the consumers it predicts as bad, 87% are actually bad. It balances these two measures well, with an overall score of 88% for bad consumers. Overall, the model is 88% accurate, and it handles both consumer types well. There is room for improvement to train the model to achieve a higher accuracy score.

# Neural Network


preprocessing

In [19]:
y = df.target.values
X = df.drop(columns=["target"])

In [20]:
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y
)

In [21]:
scaler = StandardScaler()

x_scaler = scaler.fit(x_train)


x_train_scaled = x_scaler.transform(x_train)
x_test_sclaed = x_scaler.transform(x_test)

In [22]:
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=16, activation="relu", input_dim=len(df.columns)-1))
nn_model.add(tf.keras.layers.Dense(units=16, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [23]:
nn_model.summary()

In [24]:
fit_model = nn_model.fit(x_train_scaled, y_train, epochs=10)

Epoch 1/10
[1m18228/18228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 2ms/step - accuracy: 0.6218 - loss: 0.6581
Epoch 2/10
[1m18228/18228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 2ms/step - accuracy: 0.6359 - loss: 0.6456
Epoch 3/10
[1m18228/18228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 2ms/step - accuracy: 0.6380 - loss: 0.6418
Epoch 4/10
[1m18228/18228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 2ms/step - accuracy: 0.6419 - loss: 0.6379
Epoch 5/10
[1m18228/18228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 2ms/step - accuracy: 0.6431 - loss: 0.6368
Epoch 6/10
[1m18228/18228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 2ms/step - accuracy: 0.6446 - loss: 0.6352
Epoch 7/10
[1m18228/18228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 1ms/step - accuracy: 0.6465 - loss: 0.6339
Epoch 8/10
[1m18228/18228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 2ms/step - accuracy: 0.6460 - loss: 0.6335


In [25]:
model_loss, model_accuracy = nn_model.evaluate(x_test_sclaed,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

6076/6076 - 9s - 1ms/step - accuracy: 0.6465 - loss: 0.6331
Loss: 0.6331110596656799, Accuracy: 0.6465033292770386


In [26]:
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])

    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=10,
        step=2), activation=activation, input_dim=len(df.columns)-1))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation=activation))

    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])

    return nn_model

In [27]:
# Import the kerastuner library
import keras_tuner as kt

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2)

In [None]:
tuner.search(x_train_scaled,y_train,epochs=20,validation_data=(x_test_sclaed,y_test))

Trial 18 Complete [00h 11m 07s]
val_accuracy: 0.6366437077522278

Best val_accuracy So Far: 0.6400485634803772
Total elapsed time: 01h 20m 16s

Search: Running Trial #19

Value             |Best Value So Far |Hyperparameter
sigmoid           |tanh              |activation
3                 |7                 |first_units
5                 |6                 |num_layers
5                 |3                 |units_0
7                 |1                 |units_1
3                 |1                 |units_2
9                 |1                 |units_3
7                 |1                 |units_4
7                 |1                 |units_5
7                 |20                |tuner/epochs
0                 |7                 |tuner/initial_epoch
1                 |2                 |tuner/bracket
0                 |2                 |tuner/round

Epoch 1/7
[1m18228/18228[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 3ms/step - accuracy: 0.6133 - loss: 0.6638 - val_accuracy: 