# Loan Prediction Project: Neural Network Model

Some basic notes about the NN we are creating here
 - We are splitting the train and test set 80:20
 - This is a with a 22x3 NN with a learning rate of 5

## Libraries used

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score

dataset = pd.read_csv("train_u6lujuX_CVtuZ9i.csv")
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


## Data preprocessing

In [14]:
# removing all the datapoints that have null values
nan_value = float("NaN")
dataset.replace('', nan_value, inplace=True)
dataset.dropna(inplace=True)
print("Dimensions of the dataset : ", dataset.shape)

# one-hot encoding all the categorical values
g = dataset.select_dtypes(include=['object'])
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
encodedFeatures = onehot_encoder.fit_transform(g).toarray()
encodedFeatures.shape

Dimensions of the dataset :  (480, 13)


(480, 497)

## Neural Network

In [15]:
X = dataset.drop(columns=['Loan_Status'])
Y = pd.get_dummies(dataset['Loan_Status'])

In [16]:
#get numerical features and scale them
X_nums = dataset.select_dtypes(exclude=['object'])
minmax_scaler = MinMaxScaler()

X_nums_scaled = minmax_scaler.fit_transform(X_nums)

print(X_nums.shape)
print(X_nums_scaled.shape)
print(X_nums_scaled[0])

(480, 5)
(480, 5)
[0.05482993 0.0445666  0.20135364 0.72972973 1.        ]


In [17]:
left = pd.DataFrame(encodedFeatures)
right = pd.DataFrame(X_nums_scaled)

# Join categorical and numerical Data Frames
X_scaled = left.join(right, lsuffix='_left', rsuffix='_right')

# Split and shuffle data 80:20
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, 
                                                    test_size=0.3, random_state=21)

# y_test.apply(pd.to_numeric, errors='coerce')
# y_test.fillna(0, inplace=True)

print(X_train.shape)
print(y_test.shape)

(336, 502)
(144, 2)


In [20]:
# fit to MLP classifier
mlp = MLPClassifier(solver='sgd', random_state=21, activation='relu',
                   learning_rate_init=0.05, batch_size = 24, hidden_layer_sizes=(12,1), max_iter=200)
mlp

mlp.fit(X_train, y_train)
predictions = mlp.predict(X_test)

In [21]:
print("Accuracy : ", accuracy_score(y_test, predictions))
print("Mean Square Error : ", mean_squared_error(y_test, predictions))

print("Confusion Matrix for each label : ")
print(multilabel_confusion_matrix(y_test, predictions))

print("Classification Report : ")
print(classification_report(y_test, predictions))

Accuracy :  1.0
Mean Square Error :  0.0
Confusion Matrix for each label : 
[[[92  0]
  [ 0 52]]

 [[52  0]
  [ 0 92]]]
Classification Report : 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        52
           1       1.00      1.00      1.00        92

   micro avg       1.00      1.00      1.00       144
   macro avg       1.00      1.00      1.00       144
weighted avg       1.00      1.00      1.00       144
 samples avg       1.00      1.00      1.00       144



In [None]:
# Grid Search to optimize algorithm

max_iter_test = [200, 500, 800]
hidden_layer_sizes_test = [(a, b) for a in 12 * np.arange(1,5) for b in 1 * np.arange(1, 5)]

learning_rates = 0.05 * np.arange(1,5)

param_grid_test = dict(learning_rate_init=learning_rates, hidden_layer_sizes=hidden_layer_sizes_test,
                 max_iter=max_iter_test)

grid = GridSearchCV(estimator=mlp, param_grid=param_grid_test)

grid.fit(X_scaled, Y)