# Import Packages and Load Datasets

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [2]:
# Read both CSVs once; the *_raw frames stay untouched and all later work uses the copies.
train_raw, test_raw = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))
train, test = train_raw.copy(), test_raw.copy()

In [3]:
# Preview the first three rows to check the column layout.
train.head(3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y


In [4]:
# (rows, columns) of the training frame.
train.shape

(614, 13)

# Split Data into Numpy Arrays for Features and Target

In [5]:
# Features: columns 1-11, i.e. everything after Loan_ID (column 0) up to Property_Area.
X = train.iloc[:, 1:12].values
# Target: column 12 (Loan_Status), still as 'Y'/'N' strings at this point.
y = train.iloc[:, 12].values

In [6]:
# Build the test feature matrix from the untouched `test_raw` frame rather than from
# `test` itself, so re-running this cell always yields the same array.
# The first column is dropped (presumably Loan_ID, as in train — verify against test.csv).
test = test_raw.iloc[:, 1:].values

In [7]:
# Inspect the feature matrix (NumPy elides the middle of large arrays).
X

array([['Male', 'No', '0', ..., 360.0, 1.0, 'Urban'],
       ['Male', 'Yes', '1', ..., 360.0, 1.0, 'Rural'],
       ['Male', 'Yes', '0', ..., 360.0, 1.0, 'Urban'],
       ...,
       ['Male', 'Yes', '1', ..., 360.0, 1.0, 'Urban'],
       ['Male', 'Yes', '2', ..., 360.0, 1.0, 'Urban'],
       ['Female', 'No', '0', ..., 360.0, 0.0, 'Semiurban']], dtype=object)

In [8]:
# Inspect the raw target labels before they are encoded.
# NOTE(review): this prints the whole array; a slice such as y[:10] would keep the output short.
y

array(['Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y',
       'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'Y', 'N', 'N', 'N', 'Y',
       'Y', 'Y', 'N', 'Y', 'N', 'N', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y',
       'N', 'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'N',
       'N', 'N', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'N',
       'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y',
       'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'N', 'N', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'Y', 'N',
       'Y', 'Y', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'N

# Encode and Impute Values

In [9]:
# Encode the target labels as integers (LabelEncoder orders classes, so 'N' -> 0, 'Y' -> 1).
# The encoder is fitted on the column read from `train`, not on `y` itself, so re-running
# this cell never refits it on already-encoded integers and `le.classes_` stays ['N', 'Y'].
le = LabelEncoder()
y = le.fit_transform(train.iloc[:, 12].values)

In [10]:
# Median-impute the numeric columns of X:
# 5 ApplicantIncome, 6 CoapplicantIncome, 7 LoanAmount, 8 Loan_Amount_Term.
se = SimpleImputer(missing_values=np.nan, strategy='median')

# One-hot encode the categorical columns of X:
# 0 Gender, 1 Married, 2 Dependents, 3 Education, 4 Self_Employed, 9 Credit_History, 10 Property_Area.
# handle_unknown='ignore' encodes a category that was absent at fit time as all zeros
# instead of raising, which matters once the transformer is refit on a smaller split.
# Dense output is requested on the column transformer (sparse_threshold=0) rather than via
# the encoder's `sparse` argument, which was renamed to `sparse_output` in scikit-learn 1.2.
ohe = OneHotEncoder(handle_unknown='ignore')

# Rescales every column to the default [0, 1] range after encoding/imputation.
mms = MinMaxScaler()

ct = make_column_transformer(
    (ohe, [0, 1, 2, 3, 4, 9, 10]),
    (se, [5, 6, 7, 8]),
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense ndarray
)

In [11]:
# Fit the column transformer on the full training matrix and encode/impute it for cross-validation.
# NOTE(review): fitting on all rows before cross-validating lets each fold's preprocessing
# see its validation rows; wrapping the preprocessing and the model in a Pipeline would avoid that.
x_cross = ct.fit_transform(X)

In [12]:
# Fit the min-max scaler on the encoded matrix and rescale it; `x_cross` is overwritten with the result.
x_cross = mms.fit_transform(x_cross)

# Tune Model Hyperparameters

In [13]:
# Stratified 5-fold splitter, shuffled with a fixed seed; shared by every
# cross_val_score call and both search helpers below so their scores are comparable.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)

In [39]:
# Candidate models, all seeded with random_state=7.
# Alternative gblinear configuration, kept for reference:
#xgb = XGBClassifier(booster='gblinear', eval_metric='error', objective='binary:hinge', updater='coord_descent', n_estimators=25, learning_rate=0.25, verbosity=0, random_state=7)
xgb = XGBClassifier(
    booster='gbtree',
    learning_rate=0.01,
    n_estimators=75,
    max_depth=1,
    min_child_weight=0,
    subsample=0.05,
    verbosity=0,
    random_state=7,
)
# Single-split decision stump.
dtf = DecisionTreeClassifier(
    max_depth=1,
    random_state=7,
)
# L1-penalised logistic regression.
lr = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    random_state=7,
)

In [40]:
# Cross-validated accuracy of the XGBoost model, using the shared `kfold` splitter.
scores = cross_val_score(estimator=xgb, X=x_cross, y=y, cv=kfold)
print('Accuracy scores:', scores.round(2))
print('Accuracy mean:', scores.mean().round(2))

Accuracy scores: [0.79 0.82 0.82 0.8  0.82]
Accuracy mean: 0.81


In [16]:
# Cross-validated accuracy of the decision stump, using the shared `kfold` splitter.
scores = cross_val_score(estimator=dtf, X=x_cross, y=y, cv=kfold)
print('Accuracy scores:', scores.round(2))
print('Accuracy mean:', scores.mean().round(2))

Accuracy scores: [0.79 0.82 0.82 0.8  0.82]
Accuracy mean: 0.81


In [17]:
# Cross-validated accuracy of the logistic regression, using the shared `kfold` splitter.
scores = cross_val_score(estimator=lr, X=x_cross, y=y, cv=kfold)
print('Accuracy scores:', scores.round(2))
print('Accuracy mean:', scores.mean().round(2))

Accuracy scores: [0.79 0.81 0.82 0.79 0.82]
Accuracy mean: 0.81


In [18]:
def random_grid_search(model, params, n_iter=20, random_state=7):
    """Run a randomized hyperparameter search and print the best result.

    Fits ``RandomizedSearchCV`` on the notebook-level ``x_cross`` / ``y`` arrays
    using the shared ``kfold`` splitter and all available cores, then prints the
    best parameter combination and its mean cross-validated score.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible estimator to tune.
    params : dict
        Parameter names mapped to lists or distributions to sample from.
    n_iter : int, default 20
        Number of parameter settings to sample.
    random_state : int or None, default 7
        Seed for sampling the candidates, so repeated runs evaluate the same
        settings. Pass ``None`` for unseeded sampling.

    Returns
    -------
    None
        Results are only printed.
    """
    r_grid = RandomizedSearchCV(
        model,
        params,
        n_iter=n_iter,
        cv=kfold,
        n_jobs=-1,
        random_state=random_state,
    )
    r_grid.fit(x_cross, y)
    print('Best params', r_grid.best_params_)
    print('Best score:', r_grid.best_score_)

def grid_search(model, params):
    """Exhaustively search ``params`` for ``model`` and print the winner.

    Fits ``GridSearchCV`` on the notebook-level ``x_cross`` / ``y`` arrays using
    the shared ``kfold`` splitter and all available cores, then prints the best
    parameter combination and its mean cross-validated score. Returns nothing.
    """
    search = GridSearchCV(model, params, cv=kfold, n_jobs=-1).fit(x_cross, y)
    print('Best params', search.best_params_)
    print('Best score:', search.best_score_)

In [20]:
#params={'penalty':['l1'],
#       'solver':['liblinear'],
#       'max_iter':[2]}

In [21]:
#grid_search(model=lr, params=params)

In [22]:
#params={'booster':['gbtree', 'gblinear', 'dart'],
#       'eval_metric':['error'], 
#        'objective':['binary:logistic', 'binary:hinge'], 
#       }

In [23]:
#grid_search(xgb, params)

In [24]:
#params={'criterion':['gini','entropy'],
#       'splitter':['best','random'],
#       'max_depth':[None,1,2,3,4,5,6,7,8,9,10],
#       'min_samples_split':[2,3,4,5],
#       'min_samples_leaf':[1,2,3,4,5],
#       'max_features':[None,'auto','sqrt','log2']}

In [25]:
#grid_search(dtf, params)

In [41]:
# Hold out a validation split from the raw (not yet encoded) features and the encoded target.
# No test_size is given, so train_test_split's default split proportion applies; the seed fixes the shuffle.
# NOTE(review): unlike `kfold`, this split is not stratified on y.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)

In [42]:
# Refit the column transformer on the training split only; this replaces whatever fit `ct` held before.
# NOTE(review): X_train is overwritten, so re-running this cell without re-running the split
# feeds already-encoded data back into the transformer.
X_train = ct.fit_transform(X_train)

In [43]:
# Refit the min-max scaler on the encoded training split and rescale it.
X_train = mms.fit_transform(X_train)

In [44]:
# Encode the hold-out split with the already-fitted transformer (transform only, no refit).
X_test = ct.transform(X_test)

In [45]:
# Rescale the hold-out split with the already-fitted scaler (transform only, no refit).
# NOTE(review): X_test is overwritten, so running this cell twice rescales it twice.
X_test = mms.transform(X_test)

In [46]:
#lr.fit(X_train, y_train)

LogisticRegression(penalty='l1', random_state=7, solver='liblinear')

In [29]:
#xgb.fit(X_train, y_train, eval_metric='error', eval_set=[[X_test, y_test]], early_stopping_rounds=50)

In [47]:
#y_pred = lr.predict(X_test)
#accuracy = accuracy_score(y_test, y_pred)
#print('Accuracy: %.2f%%' % (accuracy * 100))

Accuracy: 82.47%


In [48]:
# Encode the competition test features with the already-fitted column transformer.
# NOTE(review): `test` is overwritten in place, so this cell is not safe to run twice.
test = ct.transform(test)

In [49]:
# Rescale the encoded test features with the already-fitted scaler.
# NOTE(review): `test` is overwritten in place, so running this cell twice rescales it twice.
test = mms.transform(test)

In [50]:
#y_sub = lr.predict(test)

In [51]:
#submission = pd.DataFrame(test_raw['Loan_ID'], index=test_raw.index)

In [52]:
#submission['Loan_Status'] = y_sub

In [53]:
#submission['Loan_Status'].replace({1:'Y', 0:'N'}, inplace=True)

In [54]:
#submission.to_csv('submission4.csv', index=False)