In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# to ignore warnings
import warnings
warnings.filterwarnings("ignore")

import os

from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn import metrics
import xgboost as xgb

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

import json
import pickle

In [2]:
# Read loan_train.csv directly from the public GitHub-hosted dataset repository.
loan_csv_url = "https://raw.githubusercontent.com/dphi-official/Datasets/master/Loan_Data/loan_train.csv"
df = pd.read_csv(loan_csv_url)
# Preview the first rows to check which columns arrived.
df.head()

Unnamed: 0.1,Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,LP002305,Female,No,0,Graduate,No,4547,0.0,115.0,360.0,1.0,Semiurban,1
1,1,LP001715,Male,Yes,3+,Not Graduate,Yes,5703,0.0,130.0,360.0,1.0,Rural,1
2,2,LP002086,Female,Yes,0,Graduate,No,4333,2451.0,110.0,360.0,1.0,Urban,0
3,3,LP001136,Male,Yes,0,Not Graduate,Yes,4695,0.0,96.0,,1.0,Urban,1
4,4,LP002529,Male,Yes,2,Graduate,No,6700,1750.0,230.0,300.0,1.0,Semiurban,1


In [3]:
# The CSV carries a leftover index column named 'Unnamed: 0'; remove it before
# any feature selection happens.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been dropped.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP002305,Female,No,0,Graduate,No,4547,0.0,115.0,360.0,1.0,Semiurban,1
1,LP001715,Male,Yes,3+,Not Graduate,Yes,5703,0.0,130.0,360.0,1.0,Rural,1
2,LP002086,Female,Yes,0,Graduate,No,4333,2451.0,110.0,360.0,1.0,Urban,0
3,LP001136,Male,Yes,0,Not Graduate,Yes,4695,0.0,96.0,,1.0,Urban,1
4,LP002529,Male,Yes,2,Graduate,No,6700,1750.0,230.0,300.0,1.0,Semiurban,1


In [4]:
# Impute missing values column by column: a fixed default for the categorical /
# discrete fields and the column mean for the loan amount.
# NOTE(review): 'Yes' is the default used for Self_Employed -- confirm this is
# the intended fill value for that column.
default_fills = {
    'Gender': 'Male',
    'Married': 'Yes',
    'Dependents': '0',
    'Self_Employed': 'Yes',
    'LoanAmount': df['LoanAmount'].dropna().mean(),
    'Loan_Amount_Term': 360,
    'Credit_History': 1,
}
for col_name, fill_with in default_fills.items():
    df[col_name] = df[col_name].fillna(fill_with)

In [5]:
# Turn the string-valued columns into integer codes.
# The builtin `int` is used for the casts: `np.int` was only an alias for it,
# was deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.

# Dependents holds strings such as '0', '2' and '3+'; strip the trailing '+'
# so the value can be cast to an integer further down.
df['Dependents'] = df['Dependents'].str.rstrip('+')

# Binary categories -> 0/1. A value missing from a mapping becomes NaN, which
# makes the following astype(int) fail loudly instead of passing through.
df['Gender'] = df['Gender'].map({'Female': 0, 'Male': 1}).astype(int)
df['Married'] = df['Married'].map({'No': 0, 'Yes': 1}).astype(int)
df['Education'] = df['Education'].map({'Not Graduate': 0, 'Graduate': 1}).astype(int)
df['Self_Employed'] = df['Self_Employed'].map({'No': 0, 'Yes': 1}).astype(int)
df['Dependents'] = df['Dependents'].astype(int)

In [6]:
# Split the frame into features and label by column NAME rather than by position,
# so the selection cannot silently shift if a column is added, dropped or
# reordered upstream.
# Features: every column except the row identifier and the label.
X = df.drop(columns=['Loan_ID', 'Loan_Status'])
# Label: the Loan_Status column.
y = df['Loan_Status']

In [7]:
# One-hot encode the feature columns that are still non-numeric.
X = pd.get_dummies(data=X)

In [8]:
# Names of the label column and the row-identifier column.
target = 'Loan_Status'
IDcol = 'Loan_ID'

# One-hot encoded copy of the full frame; `train` and `dtrain` refer to the same object.
# NOTE(review): Loan_ID is still a string column at this point, so it is expanded
# into dummy columns as well -- confirm that is intended.
train = dtrain = pd.get_dummies(df)

In [9]:
#slc= StandardScaler()
#X_train_std = slc.fit_transform(X)

In [13]:
# Random forest member of the ensemble: 400 trees split on entropy, with the
# out-of-bag score enabled, a fixed seed, and n_jobs=-1 for parallel fitting.
forest = RandomForestClassifier(
    n_estimators=400,
    criterion='entropy',
    oob_score=True,
    random_state=1,
    n_jobs=-1,
)

In [14]:
# Gradient-boosted tree member of the ensemble (binary logistic objective).
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=8,
    min_child_weight=6,
    gamma=0.1,
    subsample=0.9,
    colsample_bytree=0.95,
    reg_alpha=2,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

In [15]:
# Weak learner for boosting: a depth-1 decision tree (a "stump") split on entropy.
tree = DecisionTreeClassifier(criterion='entropy', max_depth=1)

# The weak learner is passed positionally on purpose: scikit-learn renamed the
# keyword from `base_estimator` to `estimator` in 1.2 and removed the old name in
# 1.4, while it has stayed the first positional parameter throughout, so this
# form works on both older and current releases.
ada = AdaBoostClassifier(tree, n_estimators=500, learning_rate=0.1, random_state=0)

In [16]:
#eclf = VotingClassifier(estimators=[('forest', forest), ('xgb', xgb1), ('adaboost', ada)], voting='hard')

In [17]:
# Hard (majority) vote across the three classifiers defined in the cells above.
eclf = VotingClassifier(
    estimators=[
        ('forest', forest),
        ('xgb', xgb1),
        ('adaboost', ada),
    ],
    voting='hard',
)

# Full model: standardise the features, then hand them to the voting ensemble.
model = Pipeline(steps=[('scaler', StandardScaler()), ('eclf', eclf)])

In [18]:
# Fit the scaler and the voting ensemble on the complete feature matrix and label
# vector; the fitted pipeline is the cell's displayed value.
model.fit(X=X, y=y)

Pipeline(steps=[('scaler', StandardScaler()),
                ('eclf',
                 VotingClassifier(estimators=[('forest',
                                               RandomForestClassifier(criterion='entropy',
                                                                      n_estimators=400,
                                                                      n_jobs=-1,
                                                                      oob_score=True,
                                                                      random_state=1)),
                                              ('xgb',
                                               XGBClassifier(base_score=None,
                                                             booster=None,
                                                             colsample_bylevel=None,
                                                             colsample_bynode=None,
                                                             c

-------------------------------------------------------------------------------------------------------------------------

In [23]:
def predict_loan(gender,married,dependents,education,self_employed,app_income,coapp_income,loan_amt,loan_amt_term,credit_his,prop_area):
    """Predict the loan status for one applicant with the module-level ``model``.

    The arguments are laid out as a single 13-value feature row: the ten
    numeric inputs in the order given, followed by a three-slot one-hot block
    for the property area.

    Parameters
    ----------
    gender : 0 for Female, 1 for Male (the encoding applied to the training frame).
    married : 0 for No, 1 for Yes.
    dependents : number of dependents as an integer ('3+' was encoded as 3).
    education : 0 for Not Graduate, 1 for Graduate.
    self_employed : 0 for No, 1 for Yes.
    app_income, coapp_income : applicant and co-applicant income.
    loan_amt, loan_amt_term : loan amount and loan term.
    credit_his : credit-history flag.
    prop_area : 1 sets the first one-hot slot, 2 the second, and any other
        value the third. Presumably the slots are Rural / Semiurban / Urban,
        following the dummy-column order of the training features -- confirm
        against ``X.columns``.

    Returns
    -------
    The first element of ``model.predict`` for the assembled row.
    """
    # Choose the one-hot pattern for the property area; anything other than
    # 1 or 2 falls into the last slot.
    if prop_area == 1:
        area_flags = (1, 0, 0)
    elif prop_area == 2:
        area_flags = (0, 1, 0)
    else:
        area_flags = (0, 0, 1)

    # A single row of shape (1, 13), stored as floats.
    features = np.array(
        [[
            gender,
            married,
            dependents,
            education,
            self_employed,
            app_income,
            coapp_income,
            loan_amt,
            loan_amt_term,
            credit_his,
            *area_flags,
        ]],
        dtype=float,
    )

    return model.predict(features)[0]

In [25]:
# Example prediction for a single applicant, spelled out with keyword arguments.
predict_loan(
    gender=1,
    married=0,
    dependents=0,
    education=0,
    self_employed=0,
    app_income=6096,
    coapp_income=5000,
    loan_amt=218,
    loan_amt_term=360,
    credit_his=0,
    prop_area=1,
)

0

In [26]:
# Persist the fitted pipeline to disk.
filename = 'loan_approve.pkl'
# Use a context manager so the file handle is flushed and closed even if
# pickling fails; the previous inline open() never closed the handle.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)