In [57]:
import pandas as pd
import numpy as np

df_train = pd.read_csv('train_data_loan.csv')
df_test = pd.read_csv('test_data_loan.csv')

In [58]:
df_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [59]:
df_test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


-------

**Data Cleaning Train Set**

In [60]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [61]:
df_train.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [62]:
df_train.drop(columns=['Loan_ID'], inplace=True)
df_train['Gender'] = df_train['Gender'].map({
    'Male': 1,
    'Female': 0
})
df_train['Married'] = df_train['Married'].map({
    'Yes': 1,
    'No': 0
})
df_train['Dependents'] = df_train['Dependents'].map({
    '0': 0,
    '1': 1,
    '2': 2,
    '3+': 3
})
df_train['Education'] = df_train['Education'].map({
    'Graduate': 1,
    'Not Graduate': 0
})
df_train['Self_Employed'] = df_train['Self_Employed'].map({
    'Yes': 1,
    'No': 0
})
df_train['Property_Area'] = df_train['Property_Area'].map({
    'Urban': 0,
    'Rural': 1,
    'Semiurban': 2
})
df_train['Loan_Status'] = df_train['Loan_Status'].map({
    'Y': 1,
    'N': 0
})
df_train.rename(columns={
    'Gender': 'gender',
    'Married': 'married',
    'Dependents': 'dependents',
    'Education': 'education',
    'Self_Employed': 'self_employed',
    'ApplicantIncome': 'applicant_income',
    'CoapplicantIncome': 'coapplicant_income',
    'LoanAmount': 'loan_amount',
    'Loan_Amount_Term': 'loan_amount_term',
    'Credit_History': 'credit_history',
    'Property_Area': 'property_area',
    'Loan_Status': 'loan_status'
}, inplace=True)


In [63]:
df_train.isnull().sum()

gender                13
married                3
dependents            15
education              0
self_employed         32
applicant_income       0
coapplicant_income     0
loan_amount           22
loan_amount_term      14
credit_history        50
property_area          0
loan_status            0
dtype: int64

In [64]:
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()
imputed_df_train = pd.DataFrame(my_imputer.fit_transform(df_train))
imputed_df_train.columns = df_train.columns
df_train = imputed_df_train

In [65]:
df_train['gender'] = df_train['gender'].astype('int64')
df_train['married'] = df_train['married'].astype('int64')
df_train['dependents'] = df_train['dependents'].astype('int64')
df_train['education'] = df_train['education'].astype('int64')
df_train['self_employed'] = df_train['self_employed'].astype('int64')
df_train['applicant_income'] = df_train['applicant_income'].astype('int64')
df_train['loan_amount'] = df_train['loan_amount'].astype('int64')
df_train['loan_amount_term'] = df_train['loan_amount_term'].astype('int64')
df_train['credit_history'] = df_train['credit_history'].astype('int64')
df_train['property_area'] = df_train['property_area'].astype('int64')
df_train['loan_status'] = df_train['self_employed'].astype('int64')

In [66]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_train.iloc[:, 5:9] = scaler.fit_transform(df_train.iloc[:, 5:9])
df_train.head()

Unnamed: 0,gender,married,dependents,education,self_employed,applicant_income,coapplicant_income,loan_amount,loan_amount_term,credit_history,property_area,loan_status
0,1,0,0,1,0,0.072991,-0.554487,-0.004733,0.279851,1,0,0
1,1,1,1,1,0,-0.134412,-0.038732,-0.219097,0.279851,1,1,0
2,1,1,0,1,1,-0.393747,-0.554487,-0.957465,0.279851,1,0,1
3,1,1,0,0,0,-0.462062,0.25198,-0.314371,0.279851,1,0,0
4,1,0,0,1,0,0.097728,-0.554487,-0.064278,0.279851,1,0,0


------

**Data Cleaning Test Set**

In [67]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            367 non-null    object 
 1   Gender             356 non-null    object 
 2   Married            367 non-null    object 
 3   Dependents         357 non-null    object 
 4   Education          367 non-null    object 
 5   Self_Employed      344 non-null    object 
 6   ApplicantIncome    367 non-null    int64  
 7   CoapplicantIncome  367 non-null    int64  
 8   LoanAmount         362 non-null    float64
 9   Loan_Amount_Term   361 non-null    float64
 10  Credit_History     338 non-null    float64
 11  Property_Area      367 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 34.5+ KB


In [68]:
df_test.drop(columns=['Loan_ID'], inplace=True)
df_test['Gender'] = df_test['Gender'].map({
    'Male': 1,
    'Female': 0
})
df_test['Married'] = df_test['Married'].map({
    'Yes': 1,
    'No': 0
})
df_test['Dependents'] = df_test['Dependents'].map({
    '0': 0,
    '1': 1,
    '2': 2,
    '3+': 3
})
df_test['Education'] = df_test['Education'].map({
    'Graduate': 1,
    'Not Graduate': 0
})
df_test['Self_Employed'] = df_test['Self_Employed'].map({
    'Yes': 1,
    'No': 0
})
df_test['Property_Area'] = df_test['Property_Area'].map({
    'Urban': 0,
    'Rural': 1,
    'Semiurban': 2
})
df_test.rename(columns={
    'Gender': 'gender',
    'Married': 'married',
    'Dependents': 'dependents',
    'Education': 'education',
    'Self_Employed': 'self_employed',
    'ApplicantIncome': 'applicant_income',
    'CoapplicantIncome': 'coapplicant_income',
    'LoanAmount': 'loan_amount',
    'Loan_Amount_Term': 'loan_amount_term',
    'Credit_History': 'credit_history',
    'Property_Area': 'property_area'
}, inplace=True)


In [69]:
df_test.isnull().sum()

gender                11
married                0
dependents            10
education              0
self_employed         23
applicant_income       0
coapplicant_income     0
loan_amount            5
loan_amount_term       6
credit_history        29
property_area          0
dtype: int64

In [70]:
my_imputer_test = SimpleImputer()
imputed_df_test = pd.DataFrame(my_imputer_test.fit_transform(df_test))
imputed_df_test.columns = df_test.columns
df_test = imputed_df_test

In [71]:
df_test['gender'] = df_test['gender'].astype('int64')
df_test['married'] = df_test['married'].astype('int64')
df_test['dependents'] = df_test['dependents'].astype('int64')
df_test['education'] = df_test['education'].astype('int64')
df_test['self_employed'] = df_test['self_employed'].astype('int64')
df_test['applicant_income'] = df_test['applicant_income'].astype('int64')
df_test['loan_amount'] = df_test['loan_amount'].astype('int64')
df_test['loan_amount_term'] = df_test['loan_amount_term'].astype('int64')
df_test['credit_history'] = df_test['credit_history'].astype('int64')
df_test['property_area'] = df_test['property_area'].astype('int64')

In [72]:
df_test.iloc[:, 5:9] = scaler.transform(df_test.iloc[:, 5:9])
df_test.head()

Unnamed: 0,gender,married,dependents,education,self_employed,applicant_income,coapplicant_income,loan_amount,loan_amount_term,credit_history,property_area
0,1,1,0,1,0,0.051857,-0.554487,-0.433462,0.279851,1,0
1,1,1,1,1,0,-0.381297,-0.041468,-0.242916,0.279851,1,0
2,1,1,2,1,0,-0.066097,0.061136,0.733635,0.279851,1,0
3,1,1,2,1,0,-0.501872,0.316278,-0.552554,0.279851,0,0
4,1,0,0,0,0,-0.348532,-0.554487,-0.814555,0.279851,1,0


--------------

**Machine Learning**

In [73]:
x = df_train.iloc[:, 0:-1]
y = df_train['loan_status']

from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.3, random_state=3)

In [74]:
from tensorflow import keras
from tensorflow.keras import layers

model = keras.Sequential([
    layers.Dense(units=512, activation='relu', input_shape=[11]),
    layers.Dense(units=512, activation='relu'),
    layers.Dense(units=512, activation='relu'),
    layers.Dense(units=2, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

history = model.fit(
    x_train, y_train,
    validation_data=(x_valid, y_valid),
    batch_size=512,
    epochs=50,
    verbose=0
)

In [80]:
history_df_train = pd.DataFrame(history.history)
print(history_df_train['accuracy'].max())
print(history_df_train['val_accuracy'].max())

1.0
1.0


In [76]:
y_pred_test = model.predict_classes(df_test)

In [77]:
df_test_csv = pd.read_csv('test_data_loan.csv')
df_pred_test = pd.DataFrame(df_test_csv['Loan_ID'])
df_pred_test['Loan_Status'] = y_pred_test
df_pred_test['Loan_Status'] = df_pred_test['Loan_Status'].map({
    0: 'N',
    1: 'Y'
})
df_pred_test.set_index('Loan_ID', inplace=True)
df_pred_test.to_csv('train_data_predict.csv')

In [78]:
df_pred_test.head()

Unnamed: 0_level_0,Loan_Status
Loan_ID,Unnamed: 1_level_1
LP001015,N
LP001022,N
LP001031,N
LP001035,N
LP001051,N


In [79]:
model.save('loan_model.h5')

In [81]:
import pickle

with open('scaler', 'wb') as f:
    pickle.dump(scaler, f)

pandas.core.frame.DataFrame