# Team 8: Loan Prediction Project

# About the data set
1. **Loan ID (ID)**
2. Gender (G)
3. Dependents (D)
4. Education (E)
5. Self-Emloyed (SF)
6. Applicant Income (AI)
7. Co-applicant Income (CI)
8. Loan Amount (LA)
9. Loan Amount Term (LT)
10. Credit History (CH)
11. Property Area (PA)
12. Loan Status (LS)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder

In [4]:
dataset = pd.read_csv("loan_data.csv")
dataset.info()

print("Dataset :")
print(dataset.head())
print("Loan Status : ")
print(dataset['Loan_Status'].unique())

print("Dimensions of the dataset : ", dataset.shape)
print("Features of the dataset :")
print(dataset.describe(include = 'all'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB
Dataset :
    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No       

In [12]:
dataset.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y


## Cleaning the data

In [61]:
# removing all the datapoints that have null values
# this would leave us with ~78.18% of the orginal data
nan_value = float("NaN")
dataset.replace('', nan_value, inplace=True)
dataset.dropna(inplace=True)

# one-hot encoding the categorical values
g = dataset.select_dtypes(include=['object'])
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
encodedFeatures = onehot_encoder.fit_transform(g).toarray()
encodedFeatures.shape

#encoded = pd.DataFrame().toarray())


# one hot encoder sklearn
# scaler 

#dataset[['Gender','Married','Dependents','Education','Self_Employed','Property_Area','Loan_Status']] = pd.DataFrame(onehot_encoder.fit_transform(dataset[['Gender','Married','Dependents','Education','Self_Employed','Property_Area','Loan_Status']]).toarray())

(480, 17)

In [62]:
encodedVals[0]

array([0., 1., 0., 1., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 1., 0.])

# Neural Network

In [85]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

X = dataset.drop(columns=['Loan_Status'])
Y = pd.get_dummies(dataset['Loan_Status'])



# splitting the data 80:20
train, test = train_test_split(dataset, test_size=0.2, random_state=21)


In [86]:

#get numerical features and scale them
X_nums = dataset.select_dtypes(exclude=['object'])
minmax_scaler = MinMaxScaler()

X_nums_scaled = minmax_scaler.fit_transform(X_nums)

X_nums.shape

(480, 5)

In [87]:
X_nums_scaled.shape

(480, 5)

In [88]:
X_nums_scaled[0]


array([0.05482993, 0.0445666 , 0.20135364, 0.72972973, 1.        ])

In [89]:
left = pd.DataFrame(encodedFeatures)
right = pd.DataFrame(X_nums_scaled)

#Join categorical and numerical Data Frames
X_scaled = left.join(right, lsuffix='_left', rsuffix='_right')

#Split and shuffle data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, 
                                                    test_size=0.2, random_state=21)

#y_test.apply(pd.to_numeric, errors='coerce')
#y_test.fillna(0, inplace=True)

print(X_train.shape)
print(y_test.shape)

(384, 22)
(96, 2)


In [95]:
#fit to MLP classifier
mlp = MLPClassifier(solver='sgd', random_state=21, activation='relu',
                   learning_rate_init=0.05, batch_size = 24, hidden_layer_sizes=(12,1), max_iter=200)
mlp

MLPClassifier(batch_size=24, hidden_layer_sizes=(12, 1),
              learning_rate_init=0.05, random_state=21, solver='sgd')

In [96]:
mlp.fit(X_train, y_train)

predictions = mlp.predict(X_test)

In [97]:
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score

print("Accuracy : ", accuracy_score(y_test, predictions))
print("Mean Square Error : ", mean_squared_error(y_test, predictions))

print("Confusion Matrix for each label : ")
print(multilabel_confusion_matrix(y_test, predictions))

print("Classification Report : ")
print(classification_report(y_test, predictions))

Accuracy :  1.0
Mean Square Error :  0.0
Confusion Matrix for each label : 
[[[61  0]
  [ 0 35]]

 [[35  0]
  [ 0 61]]]
Classification Report : 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      1.00      1.00        61

   micro avg       1.00      1.00      1.00        96
   macro avg       1.00      1.00      1.00        96
weighted avg       1.00      1.00      1.00        96
 samples avg       1.00      1.00      1.00        96



In [93]:
#Grid Search to optimize algorithm

max_iter_test = [200, 500, 800]
hidden_layer_sizes_test = [(a, b) for a in 12 * np.arange(1,5) for b in 1 * np.arange(1, 5)]

learning_rates = 0.05 * np.arange(1,5)

param_grid_test = dict(learning_rate_init=learning_rates, hidden_layer_sizes=hidden_layer_sizes_test,
                 max_iter=max_iter_test)

grid = GridSearchCV(estimator=mlp, param_grid=param_grid_test)

grid.fit(X_scaled, Y)

GridSearchCV(estimator=MLPClassifier(batch_size=24, hidden_layer_sizes=(12, 3),
                                     learning_rate_init=0.03, max_iter=500,
                                     random_state=21, solver='sgd'),
             param_grid={'hidden_layer_sizes': [(12, 1), (12, 2), (12, 3),
                                                (12, 4), (24, 1), (24, 2),
                                                (24, 3), (24, 4), (36, 1),
                                                (36, 2), (36, 3), (36, 4),
                                                (48, 1), (48, 2), (48, 3),
                                                (48, 4)],
                         'learning_rate_init': array([0.05, 0.1 , 0.15, 0.2 ]),
                         'max_iter': [200, 500, 800]})

In [94]:
print("Optimal Hyper-parameters : ", grid.best_params_)
print("Optimal Accuracy : ", grid.best_score_)

Optimal Hyper-parameters :  {'hidden_layer_sizes': (12, 1), 'learning_rate_init': 0.05, 'max_iter': 200}
Optimal Accuracy :  1.0
