In [1]:
# Import Lib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
sns.set_style("whitegrid")

# นำเข้าข้อมูล Loan

In [2]:
df_train = pd.read_csv('Loan/Loan_Train.csv')

In [3]:
df_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
df_train.tail()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y
613,LP002990,Female,No,0,Graduate,Yes,4583,0.0,133.0,360.0,0.0,Semiurban,N


In [5]:
df_train.shape

(614, 13)

# Variable	Description
- Loan_ID	 = Unique Loan ID
- Gender	 = Male/ Female
- Married	= Applicant married (Y/N)
- Dependents	= Number of dependents
- Education	Applicant Education = (Graduate/ Under Graduate)
- Self_Employed	= Self employed (Y/N)
- ApplicantIncome	= Applicant income
- CoapplicantIncome	= Coapplicant income
- LoanAmount	= Loan amount in thousands
- Loan_Amount_Term	= Term of loan in months
- Credit_History	= credit history meets guidelines
- Property_Area	= Urban/ Semi Urban/ Rural
- Loan_Status	= Loan approved (Y/N)

# ประเภทของ data

In [6]:
df_train.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

# การทำ Feature Engineering

In [7]:
df_pre = df_train.copy()

In [8]:
df_pre.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


# หา Null หรือ NA

In [9]:
df_pre.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

# Fill NA ด้วยค่า mean

In [10]:
df_train.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [11]:
df_pre['LoanAmount'].fillna(df_pre['LoanAmount'].mean(),inplace=True)
df_pre['Loan_Amount_Term'].fillna(df_pre['Loan_Amount_Term'].mean(),inplace=True)
df_pre['Credit_History'].fillna(df_pre['Credit_History'].mean(),inplace=True)
df_pre.dropna(inplace=True)
df_pre.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [12]:
df_pre.shape

(554, 13)

# Convert String feture to Numerical

## Gender 
Male = 0
Female = 1

In [13]:
df_pre.loc[(df_pre['Gender']=='Male'),'Gender'] = 0
df_pre.loc[(df_pre['Gender']=='Female'),'Gender'] = 1

## Married
No = 0
Yes = 1

In [14]:
df_pre.loc[(df_pre['Married']=='No'),'Married'] = 0.0
df_pre.loc[(df_pre['Married']=='Yes'),'Married'] = 1.0

## Dependents
Dependents 0 = 0
Dependents 1 = 1
Dependents 2 = 2
Dependents 3+ = 3

In [15]:
df_pre.loc[(df_pre['Dependents']=='0'),'Dependents'] = 0.0
df_pre.loc[(df_pre['Dependents']=='1'),'Dependents'] = 1.0
df_pre.loc[(df_pre['Dependents']=='2'),'Dependents'] = 2.0
df_pre.loc[(df_pre['Dependents']=='3+'),'Dependents'] = 3.0

## Self_Employed
No = 0
Yes = 1

In [16]:
df_pre.loc[(df_pre['Self_Employed']=='No'),'Self_Employed'] = 0.0
df_pre.loc[(df_pre['Self_Employed']=='Yes'),'Self_Employed'] = 1.0

## Education
Not Graduate = 0
Graduate = 1

In [17]:
df_pre.loc[(df_pre['Education']=='Not Graduate'),'Education'] = 0.0
df_pre.loc[(df_pre['Education']=='Graduate'),'Education'] = 1.0

## Property_Area
Urban = 0
Rural = 1
Semiurban = 2

In [18]:
df_pre.loc[(df_pre['Property_Area']=='Urban'),'Property_Area'] = 0.0
df_pre.loc[(df_pre['Property_Area']=='Rural'),'Property_Area'] = 1.0
df_pre.loc[(df_pre['Property_Area']=='Semiurban'),'Property_Area'] = 2.0

## Loan_Status
N = 0
Y = 1

In [19]:
df_pre.loc[(df_pre['Loan_Status']=='N'),'Loan_Status'] = 0.0
df_pre.loc[(df_pre['Loan_Status']=='Y'),'Loan_Status'] = 1.0

In [20]:
df_pre.drop('Loan_ID',axis=1,inplace=True)

In [21]:
df_pre.head(10)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,0,0,1,0,5849,0.0,146.412162,360.0,1.0,0,1
1,0,1,1,1,0,4583,1508.0,128.0,360.0,1.0,1,0
2,0,1,0,1,1,3000,0.0,66.0,360.0,1.0,0,1
3,0,1,0,0,0,2583,2358.0,120.0,360.0,1.0,0,1
4,0,0,0,1,0,6000,0.0,141.0,360.0,1.0,0,1
5,0,1,2,1,1,5417,4196.0,267.0,360.0,1.0,0,1
6,0,1,0,0,0,2333,1516.0,95.0,360.0,1.0,0,1
7,0,1,3,1,0,3036,2504.0,158.0,360.0,0.0,2,0
8,0,1,2,1,0,4006,1526.0,168.0,360.0,1.0,0,1
9,0,1,1,1,0,12841,10968.0,349.0,360.0,1.0,2,0


# Neural Network

In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix,mean_squared_error,r2_score,accuracy_score
import sklearn.metrics as metric
from sklearn.linear_model import LogisticRegression

In [23]:
X = df_pre.drop('Loan_Status',axis=1)
y = df_pre['Loan_Status']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=42)

In [25]:
X_train.head(10)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
31,0,0,0,1,0,3167,0.0,74.0,360.0,1.0,0
516,1,1,2,1,0,2031,1632.0,113.0,480.0,1.0,2
48,1,1,0,1,0,2645,3440.0,120.0,360.0,0.0,0
393,0,1,2,0,0,1993,1625.0,113.0,180.0,1.0,2
210,1,0,0,1,0,10000,0.0,214.0,360.0,1.0,2
202,0,1,3,0,0,3992,0.0,146.412162,180.0,1.0,0
233,1,0,0,1,0,8333,0.0,280.0,360.0,1.0,2
359,0,1,3,1,0,5167,3167.0,200.0,360.0,1.0,2
169,0,1,2,1,0,8000,0.0,200.0,360.0,1.0,2
140,0,1,2,1,0,5042,2083.0,185.0,360.0,1.0,1


In [26]:
df_pre = df_pre.astype(float)

scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler(copy=True, with_mean=True, with_std=True)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

  return self.partial_fit(X, y)
  
  import sys


In [27]:
X_train

array([[-0.47809144, -1.35085807, -0.7363235 , ...,  0.29741168,
         0.44359981, -1.28180538],
       [ 2.09165007,  0.74027022,  1.26337611, ...,  2.11427544,
         0.44359981,  1.12041584],
       [ 2.09165007,  0.74027022, -0.7363235 , ...,  0.29741168,
        -2.46366791, -1.28180538],
       ...,
       [-0.47809144,  0.74027022,  0.2635263 , ...,  0.29741168,
         0.44359981,  1.12041584],
       [-0.47809144,  0.74027022, -0.7363235 , ...,  0.29741168,
         0.44359981,  1.12041584],
       [ 2.09165007, -1.35085807,  0.2635263 , ...,  0.29741168,
         0.44359981,  1.12041584]])

## Multilayer perceptron

### Sigmoid

In [28]:
mlp = MLPClassifier(hidden_layer_sizes=(11,18,1),max_iter=1000,activation='logistic')

In [29]:
mlp.fit(X_train,y_train)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(11, 18, 1), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [30]:
predictions = mlp.predict(X_test)

In [31]:
accuracy = accuracy_score(y_test,predictions)
err = mean_squared_error(y_test,predictions)
print("Accuracy: %.4f%%" % (accuracy * 100.0))
print("Mean Square Error: %.4f%%" % (err * 100.0))

Accuracy: 84.4311%
Mean Square Error: 15.5689%


In [32]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[ 24  25]
 [  1 117]]
              precision    recall  f1-score   support

           0       0.96      0.49      0.65        49
           1       0.82      0.99      0.90       118

   micro avg       0.84      0.84      0.84       167
   macro avg       0.89      0.74      0.77       167
weighted avg       0.86      0.84      0.83       167



### Tanh

In [33]:
mlp = MLPClassifier(hidden_layer_sizes=(11,50,1),max_iter=1000,activation='tanh')
mlp.fit(X_train,y_train)



MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(11, 50, 1), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [34]:
predictions = mlp.predict(X_test)

In [35]:
accuracy = accuracy_score(y_test,predictions)
err = mean_squared_error(y_test,predictions)
print("Accuracy: %.4f%%" % (accuracy * 100.0))
print("Mean Square Error: %.4f%%" % (err * 100.0))

Accuracy: 74.2515%
Mean Square Error: 25.7485%


In [36]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[34 15]
 [28 90]]
              precision    recall  f1-score   support

           0       0.55      0.69      0.61        49
           1       0.86      0.76      0.81       118

   micro avg       0.74      0.74      0.74       167
   macro avg       0.70      0.73      0.71       167
weighted avg       0.77      0.74      0.75       167



### ReLu

In [37]:
mlp = MLPClassifier(hidden_layer_sizes=(11,50,1),max_iter=1000,activation='relu')
mlp.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(11, 50, 1), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [38]:
predictions = mlp.predict(X_test)

In [39]:
accuracy = accuracy_score(y_test,predictions)
err = mean_squared_error(y_test,predictions)
print("Accuracy: %.4f%%" % (accuracy * 100.0))
print("Mean Square Error: %.4f%%" % (err * 100.0))

Accuracy: 70.6587%
Mean Square Error: 29.3413%


In [40]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[  0  49]
 [  0 118]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        49
           1       0.71      1.00      0.83       118

   micro avg       0.71      0.71      0.71       167
   macro avg       0.35      0.50      0.41       167
weighted avg       0.50      0.71      0.59       167



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# Machine learning

## Decision Tree Classifier

In [41]:
from sklearn.tree import DecisionTreeClassifier

In [42]:
dtree=DecisionTreeClassifier()
dtree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [43]:
predictions = dtree.predict(X_test)

In [44]:
accuracy = accuracy_score(y_test, predictions)
err = mean_squared_error(y_test,predictions)
print("Accuracy: %.4f%%" % (accuracy * 100.0))
print("Mean Square Error: %.4f%%" % (err * 100.0))

Accuracy: 77.2455%
Mean Square Error: 22.7545%


In [45]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[32 17]
 [21 97]]
              precision    recall  f1-score   support

           0       0.60      0.65      0.63        49
           1       0.85      0.82      0.84       118

   micro avg       0.77      0.77      0.77       167
   macro avg       0.73      0.74      0.73       167
weighted avg       0.78      0.77      0.77       167



# Xgboost Tree

In [46]:
from numpy import loadtxt
from xgboost import XGBClassifier

In [47]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [48]:
predictions = xgb.predict(X_test)

In [49]:
accuracy = accuracy_score(y_test, predictions)
err = mean_squared_error(y_test,predictions)
print("Accuracy: %.4f%%" % (accuracy * 100.0))
print("Mean Square Error: %.4f%%" % (err * 100.0))

Accuracy: 84.4311%
Mean Square Error: 15.5689%


In [50]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[ 30  19]
 [  7 111]]
              precision    recall  f1-score   support

           0       0.81      0.61      0.70        49
           1       0.85      0.94      0.90       118

   micro avg       0.84      0.84      0.84       167
   macro avg       0.83      0.78      0.80       167
weighted avg       0.84      0.84      0.84       167



In [51]:
# df_pre.loc[(df_pre['Loan_Status']==1),'Loan_Status'] = 'Y'
# df_pre.loc[(df_pre['Loan_Status']==0),'Loan_Status'] = 'N'

In [52]:
# df_pre.head()

In [53]:
# df_pre.to_csv('LoneMATLAB', sep='\t', encoding='utf-8')