In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from numpy import loadtxt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

In [8]:
# Reading data
loan_data = pd.read_csv("/content/loan_data_ADA_assignment.csv")
loan_data = loan_data.replace(r'^\s*$', np.nan, regex=True)

In [9]:
# Dropping non-required columns
loan_data.drop(columns=['id', 'member_id', 'sub_grade', 'emp_title', 'emp_length', 'issue_d', 'pymnt_plan', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'inq_last_6mths', 'earliest_cr_line', 'mths_since_last_delinq', 'mths_since_last_record', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'policy_code', 'last_credit_pull_d', 'acc_now_delinq', 'next_pymnt_d', 'home_ownership', 'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'total_credit_rv'], inplace=True)

In [10]:
# Converting categorical variables to factors
loan_data['grade'] = loan_data['grade'].map({'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7})

In [11]:
# One-hot encoding for categorical variables
loan_data = pd.get_dummies(loan_data, columns=['verification_status', 'loan_status'])

In [12]:
# Handling missing values by replacing with mode
loan_data['tot_coll_amt'].fillna(loan_data['tot_coll_amt'].mode()[0], inplace=True)
loan_data['tot_cur_bal'].fillna(loan_data['tot_cur_bal'].mode()[0], inplace=True)

# Converting and replacing %
loan_data['revol_util'] = loan_data['revol_util'].fillna(0)
loan_data['revol_util'] = loan_data['revol_util'].astype(str).str.replace('%', '').astype(float)
loan_data['revol_util'] = loan_data['revol_util'].astype(str).str.replace('%', '').astype(float).astype(int)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  loan_data['tot_coll_amt'].fillna(loan_data['tot_coll_amt'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  loan_data['tot_cur_bal'].fillna(loan_data['tot_cur_bal'].mode()[0], inplace=True)


In [13]:
#Standardizing using z-scores
columns_to_standardize = [
    'loan_amnt', 'funded_amnt', 'funded_amnt_inv','int_rate', 'installment',
    'annual_inc', 'dti', 'revol_bal', 'total_pymnt', 'total_pymnt_inv',
    'total_rec_prncp', 'total_rec_int', 'last_pymnt_amnt', 'tot_cur_bal',
    'term', 'open_acc', 'revol_util', 'total_acc', 'tot_coll_amt'
]

# Initialize the StandardScaler
scaler = StandardScaler()

# Loop through columns and replace each one with standardized values
for col in columns_to_standardize:
    col_values = loan_data[[col]]
    standardized_values = scaler.fit_transform(col_values)
    loan_data[col] = standardized_values.flatten()  # Overwrite original column with standardized values


In [14]:
# Splitting data into training and test sets
X = loan_data.drop(columns=['loan_is_bad']).astype('float')
y = loan_data['loan_is_bad'].astype('float')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

over_sampler = SMOTE(sampling_strategy=0.5, random_state=1)
under_sampler = RandomUnderSampler(sampling_strategy=0.5, random_state=1)
X_train_over, y_train_over = over_sampler.fit_resample(X_train, y_train)
X_train_balanced, y_train_balanced = under_sampler.fit_resample(X_train_over, y_train_over)
X_train_balanced, y_train_balanced = under_sampler.fit_resample(X_train, y_train)

train_dataset = tf.data.Dataset.from_tensor_slices((X_train_balanced, y_train_balanced))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

# Encoding target variable
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [15]:
# One-hot encoding for target variable
#one_hot_encoder = OneHotEncoder()
#y_train_balanced_encoded_one_hot = one_hot_encoder.fit_transform(y_train.reshape(-1, 1)).toarray()
#y_test_encoded_one_hot = one_hot_encoder.transform(y_test.reshape(-1, 1)).toarray()

train_batch = train_dataset.batch(500)
features, labels = next(iter(train_batch))
#features

model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation=tf.nn.relu, input_shape=(32,)),
    tf.keras.layers.Dense(16, activation=tf.nn.relu),
    tf.keras.layers.Dense(16, activation=tf.nn.relu),
    tf.keras.layers.Dense(8, activation=tf.nn.relu),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.01,  weight_decay=0.5, momentum = 0.9),
              loss=tf.keras.losses.BinaryCrossentropy(), #loss function as cross entropy
              metrics=['accuracy', 'precision', 'recall'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [16]:
#Model training
model.fit(features, labels, epochs=100)

test_batch = test_dataset.batch(1000)  # the whole dataset
test_features, test_labels = next(iter(test_batch))

y_pred = model.predict(test_features)


Epoch 1/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 48ms/step - accuracy: 0.2854 - loss: 0.8381 - precision: 0.0000e+00 - recall: 0.0000e+00
Epoch 2/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.4368 - precision: 0.0000e+00 - recall: 0.0000e+00 
Epoch 3/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.2779 - precision: 0.0000e+00 - recall: 0.0000e+00 
Epoch 4/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.1939 - precision: 0.0000e+00 - recall: 0.0000e+00 
Epoch 5/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.1456 - precision: 0.0000e+00 - recall: 0.0000e+00 
Epoch 6/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.1134 - precision: 0.0000e+00 - recall: 0.0000e+00 
Epoch 7/10

In [17]:
# Convert probabilities to binary predictions
y_pred_binary = (y_pred > 0.1).astype(int)

# Flatten the true labels if they are not already flattened
test_labels_flat = np.ravel(test_labels)

# Compute precision, recall, and F1-score
precision = precision_score(test_labels_flat, y_pred_binary)
recall = recall_score(test_labels_flat, y_pred_binary)
#f1 = f1_score(test_labels_flat, y_pred_binary)

print('Precision:', precision)
print('Recall:', recall)
#print('F1-score:', f1)

for pred, real in zip(y_pred, test_labels):
   print(f"Predicted: {pred[0]};    Real: {real}")

#print confusion matrix
conf_matrix = confusion_matrix(test_labels_flat, y_pred_binary)
print('Confusion Matrix:')
print(conf_matrix)

# Evaluate the model on the test set
test_loss, test_accuracy, test_precision, test_recall = model.evaluate(test_features, test_labels, verbose=2)

print('\nTest Accuracy:', test_accuracy)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision: 0.0
Recall: 0.0
Predicted: 0.04858645051717758;    Real: 0.0
Predicted: 0.03714752569794655;    Real: 0.0
Predicted: 0.029822418466210365;    Real: 0.0
Predicted: 0.02150643989443779;    Real: 0.0
Predicted: 0.006924881599843502;    Real: 0.0
Predicted: 0.031906746327877045;    Real: 0.0
Predicted: 0.020370911806821823;    Real: 0.0
Predicted: 0.008991843089461327;    Real: 1.0
Predicted: 0.0120429378002882;    Real: 0.0
Predicted: 0.02139572985470295;    Real: 0.0
Predicted: 0.011918429285287857;    Real: 0.0
Predicted: 0.012234261259436607;    Real: 0.0
Predicted: 0.006330637726932764;    Real: 0.0
Predicted: 0.010951547883450985;    Real: 0.0
Predicted: 0.021037571132183075;    Real: 0.0
Predicted: 0.03131885826587677;    Real: 0.0
Predicted: 0.021650806069374084;    Real: 0.0
Predicted: 0.020500175654888153;    Real: 0.0
Predicted: 0.003269077045843005;    Real: 0.0
Predicted: 0.011495153419673443;    Real: 0.0
Predicted: 0.020113039761781693;    Real: 0.0
Predicted: 0.0