Importing libraries

In [None]:
import os
import seaborn as sns 
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
import matplotlib.pyplot as plt
from sklearn import ensemble, model_selection, metrics 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split , cross_val_score , RandomizedSearchCV
import xgboost as xgb

%pylab inline

In [None]:
# Directory that holds the competition CSV files. It can be overridden with the
# LOAN_DATA_DIR environment variable; the default is the original location.
DATA_DIR = Path(os.environ.get('LOAN_DATA_DIR', '/Users/olegmonahov/Downloads'))

df_test = pd.read_csv(DATA_DIR / 'test_lAUu6dG.csv')
df_train = pd.read_csv(DATA_DIR / 'train_ctrUa4K.csv')

<b>Exploratory Data analysis </b> <br>
At this stage, I'm visualizing different features to understand what might be a driver of application approvals

In [None]:
df_train.shape

In [None]:
# Density of ApplicantIncome, drawn separately for each Loan_Status value.
# Each curve gets a label so the two distributions can be told apart.
fig, ax = plt.subplots()

for status in ('N', 'Y'):
    sns.kdeplot(df_train.loc[df_train['Loan_Status'] == status, 'ApplicantIncome'],
                ax=ax, label='Loan_Status = ' + status)

ax.set_title('ApplicantIncome by Loan_Status')
ax.legend();

In [None]:
# Counts of each Married value among the rows with Loan_Status == 'Y'.
# The series is passed as the keyword `x`: in recent seaborn releases the first
# positional parameter of countplot is `data`, not `x`.
sns.countplot(x=df_train.loc[df_train['Loan_Status'] == 'Y', 'Married']);

At this stage I'm checking how the features correlate with each other, to understand what might be driving loan approvals.

In [None]:
# Summary statistics of the numeric columns of the raw training data.
df_train.describe()

In [None]:

def correlation_heatmap(df):
    """Draw an annotated heatmap of the pairwise correlations in ``df``.

    Only the numeric columns of ``df`` are used; non-numeric columns are
    ignored. The figure is drawn as a side effect and nothing is returned.
    """
    corr = df.select_dtypes(include='number').corr()
    _, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1,
                square=True, linewidths=0.5, ax=ax)
    ax.set_title('Correlation of numeric features')


correlation_heatmap(df_train)

The issue is that a lot of the data is in object format, and since the model can work with numerical data only, we need to translate it to int format.

<h3>Dealing with categorical data </h3>

In [None]:
# Work on the categorical columns separately: copy every object-dtype column of
# the training data into a stand-alone DataFrame.
obj_df = df_train.select_dtypes(include=['object']).copy()


In [None]:
obj_df['Self_Employed'].value_counts()

Also we need to decide how to deal with missing data. In this case, I'll be just replacing it with the most common values

In [None]:
# Fill the missing Self_Employed values with "No", the value chosen here as the most common option.

obj_df = obj_df.fillna({"Self_Employed": "No"})


In [None]:
obj_df['Gender'].value_counts()

In [None]:
# Fill the missing Gender values with "Male" (chosen from the value counts shown in the previous cell).
obj_df = obj_df.fillna({"Gender": "Male"})


In [None]:
# Fill the missing Dependents values with "0".
# NOTE: value_counts() is not the last expression of the cell, so its result is not displayed.
obj_df['Dependents'].value_counts()
obj_df = obj_df.fillna({"Dependents": "0"})


In [None]:
# Fill the missing Married values with the most frequent value of the column.
# A placeholder such as "0" must not be used here: presumably it is not one of
# the real Married values (TODO confirm against the value counts), so it would
# become an extra category and shift the integer codes produced later by
# .cat.codes, while the test set is encoded without any such placeholder.
obj_df['Married'].value_counts()
obj_df = obj_df.fillna({"Married": obj_df['Married'].mode()[0]})


In [None]:
# checking shape of a new object dataframe
obj_df.shape

In this section I'm using .cat.codes to translate Object data to int

In [None]:
# Integer-encode every categorical column of the training data: cast the column
# to the pandas 'category' dtype and keep only the category codes.
for column in ("Gender", "Married", "Dependents", "Education",
               "Self_Employed", "Property_Area", "Loan_Status"):
    obj_df[column] = obj_df[column].astype('category').cat.codes


Let's look at the resulting dataframe

In [None]:
obj_df.head()

In [None]:
obj_df.dtypes

In [None]:
obj_df.shape

So we achieved what was intended: all the categorical data is now represented by numerical values, and the NaNs have been dealt with successfully.

<h3>Cleaning numerical data </h3>
At this stage we need to deal with the missing values in the numerical part of the dataset. For this, let's build a standalone DataFrame with the numerical data only.

In [None]:
# Numeric part of the training data. 'number' selects every numeric dtype, so
# the selection does not depend on which integer width the 'int' alias maps to
# on the current platform.
int_df = df_train.select_dtypes(include='number').copy()

# Impute missing values: the column mean for LoanAmount and Loan_Amount_Term
# (Series.mean skips NaN), and 1 for a missing Credit_History.
int_df = int_df.fillna({
    "LoanAmount": int_df["LoanAmount"].mean(),
    "Loan_Amount_Term": int_df["Loan_Amount_Term"].mean(),
    "Credit_History": 1,
})

In [None]:
int_df.shape

Let's try to concatenate both numerical and object part of data set to get the resulting dataframe 

In [None]:
# Put the numeric and the encoded categorical columns side by side, keeping the
# rows whose index is present in both frames (join='inner').
result = pd.concat([int_df, obj_df], axis=1, join='inner')
result.head()

In [None]:

correlation_heatmap(result)

Dealing with outliers? 

It might be beneficial to get rid of outliers as they might be affecting results


In [None]:
result.describe()

In [None]:
# Outlier removal: keep only the rows whose incomes are below both cut-offs.
applicant_ok = result['ApplicantIncome'] < 34000
coapplicant_ok = result['CoapplicantIncome'] < 25300
result = result[applicant_ok & coapplicant_ok]

In [None]:
result.describe()

<h3>Dealing with multicollinearity</h3>

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):
    """Compute the variance inflation factor of every column of ``X``.

    X: DataFrame of features.
    Returns a DataFrame with one row per column of ``X`` and two columns:
    "variables" (the column names of ``X``) and "VIF".
    """
    design = X.values
    factors = [variance_inflation_factor(design, position)
               for position in range(X.shape[1])]
    return pd.DataFrame({"variables": X.columns, "VIF": factors})

In [None]:
result.columns

In [None]:
# Candidate feature set for the multicollinearity check: every column of
# `result` listed below, including LoanAmount.
Z=result[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term',
        'Credit_History', 'Gender', 'Married',
       'Dependents', 'Education', 'Self_Employed', 'Property_Area']]

In [None]:
sum(calc_vif(Z)['VIF'])

In [None]:
calc_vif(Z)

In [None]:
# Same VIF table as for Z, but with LoanAmount left out of the feature set.
calc_vif(result[['ApplicantIncome', 'CoapplicantIncome', 'Loan_Amount_Term',
        'Credit_History', 'Gender', 'Married',
       'Dependents', 'Education', 'Self_Employed', 'Property_Area']])

In [None]:
# Order the rows by Loan_ID, then take the integer-encoded Loan_Status column as the target.
result=result.sort_values(by='Loan_ID')
y = result['Loan_Status']

In [None]:
# Model features: the same columns as the second VIF table (LoanAmount is not included).
df_x=result[['ApplicantIncome', 'CoapplicantIncome', 'Loan_Amount_Term',
        'Credit_History', 'Gender', 'Married',
       'Dependents', 'Education', 'Self_Employed', 'Property_Area']]

<h3>Normalizing values in training set </h3>
The features come in very different numerical ranges. In the next step I'll scale them to a common range.

In [None]:
# Scale the training features with a MinMaxScaler that is fitted on them.
x = df_x.values  # the feature table as a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)


In [None]:
x.shape

<h2>Random forest</h2>
<br>
I'll use Random Forest as the estimator for this exercise. I approached ML algorithm selection in a separate exercise and decided that RF is good enough. Here I'll spend more time on understanding which parameters work best.

In [None]:
# Random forest with default parameters, fitted on the scaled training features.
# `rf` holds what fit() returns and is used as the base estimator of the search below.
estimator = ensemble.RandomForestClassifier()
rf=estimator.fit(x_scaled,y)

We have the task of selecting the best possible parameters for the algorithm. I'll use RandomizedSearchCV to look for the best parameters among those in the ranges given in rf_param.

In [None]:
# Search space for the random-forest hyperparameters.
rf_param = {
    'n_estimators': list(range(5, 270)),
    'min_samples_leaf': list(range(2, 30)),
    'criterion': ['gini', 'entropy'],
    'max_depth': list(range(1, 40)),
    'min_samples_split': list(range(2, 70)),
}

# Fixed seed so that the same 10 candidates are sampled after every
# Restart & Run All. (The forests themselves are still unseeded.)
SEARCH_SEED = 42

# 10 random candidates, each scored by 10-fold cross-validated accuracy.
rscv = RandomizedSearchCV(rf,
                          param_distributions=rf_param,
                          cv=10, n_iter=10, scoring='accuracy', n_jobs=-1, verbose=10,
                          random_state=SEARCH_SEED)

In [None]:
rscv.fit(x_scaled,y);

Here I'm retrieving the best parameter configuration that RandomizedSearchCV gave us, along with the best score. The idea is that by applying it to our estimator we'll get the best score on our training data.

In [None]:
print(rscv.best_score_)
print(rscv.best_estimator_)
print(rscv.best_index_)
print(rscv.best_params_)

Now, let's apply those parameters and see what score it gives on CV sets. We'll be looking at the model with different number of trees by wrapping estimator in for loop with the tree range

In [None]:
# Forest sizes to evaluate: 1, then 10 to 60 in steps of 5.
n_trees = [1] + list(range(10, 65, 5))

We'll be using CV with scoring aimed for accuracy. Let's store the score values in scoring list

In [None]:
%%time
scoring = []
for n_tree in n_trees:
    estimator = ensemble.RandomForestClassifier(n_estimators = n_tree, min_samples_split=15,
                                                criterion='entropy',max_depth=5, min_samples_leaf=23, warm_start=False)
    score = model_selection.cross_val_score(estimator, x_scaled, y, 
                                             scoring = 'accuracy', cv = 5)    
    scoring.append(score)
scoring = np.asmatrix(scoring)

Now let's plot and see how accuracy changes with the number of trees

In [None]:
# Mean cross-validated accuracy (averaged over the folds) for each forest size.
# The line carries a label so that the legend has an entry to show.
pylab.plot(n_trees, scoring.mean(axis = 1), marker='.', label='RandomForest')
pylab.grid(True)
pylab.xlabel('estimators')
pylab.ylabel('score')
pylab.title('Accuracy ')
pylab.legend(loc='lower right')

<h3>XGBClassifier </h3>
<br>
Though we decided to use Random Forest as classifier, let's try and see how XGBClassifier will work 

In [None]:
# XGBoost classifier with default parameters, fitted on the scaled training features.
# `xgb_est` holds what fit() returns and is used as the base estimator of the search below.
estimator=xgb.XGBClassifier()
xgb_est=estimator.fit(x_scaled,y)

Fitting RandomizedSearchCV to find the best parameters

In [None]:
# Candidate values for the XGBoost hyperparameters sampled by the randomized search.
xgb_param = {
        'min_child_weight': [1, 3, 21],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'colsample_bytree': [0.3, 0.8, 1.0],
        'learning_rate':[0.001, 0.1, 0.005],
        'max_depth': [3, 4, 5]
        }

In [None]:
# Randomized search over xgb_param: 10 candidates, 5-fold cross-validated accuracy.
# NOTE: this rebinds `rscv`, replacing the random-forest search result.
rscv = RandomizedSearchCV(xgb_est, param_distributions=xgb_param, cv=5, n_iter=10,
                          scoring='accuracy', n_jobs=-1, verbose=10)
# Fit on the scaled features, consistently with every other model fit in this notebook.
rscv.fit(x_scaled, y)


In [None]:
print(rscv.best_score_)
print(rscv.best_estimator_)
print(rscv.best_index_)
print(rscv.best_params_)

In [None]:
# 10-fold cross-validated accuracy of an XGBoost classifier with hand-set hyperparameters.
estimator = xgb.XGBClassifier(n_estimators=5, min_child_weight =3, max_depth=3, learning_rate=0.001,colsample_bytree=1.0)
score = model_selection.cross_val_score(estimator, x_scaled, y, 
                                             scoring = 'accuracy', cv = 10) 

In [None]:
score.mean()

In [None]:
%%time
xgb_scoring = []
for n_tree in n_trees:
    estimator = xgb.XGBClassifier(learning_rate=0.9, max_depth=10, booster='dart', n_estimators=n_tree, min_child_weight=5)
    score = model_selection.cross_val_score(estimator, x_scaled, y, 
                                             scoring = 'accuracy', cv = 10)    
    xgb_scoring.append(score)
xgb_scoring = np.asmatrix(xgb_scoring)

In [None]:
# Overlay the mean cross-validated accuracy of both models over the same n_trees grid.
for fold_scores, model_name in ((scoring, 'RandomForest'), (xgb_scoring, 'XGBoost')):
    pylab.plot(n_trees, fold_scores.mean(axis = 1), marker='.', label=model_name)
pylab.grid(True)
pylab.xlabel('n_trees')
pylab.ylabel('score')
pylab.title('Accuracy score')
pylab.legend(loc='lower right')

In [None]:
x.shape

Final estimator: the random forest fitted here is the model used to predict on the test set.

In [None]:
# Final model: a random forest with the same hand-set hyperparameters as in the
# cross-validation loop above and 40 trees, fitted on all scaled training features.
estimator_final=ensemble.RandomForestClassifier(n_estimators = 40, min_samples_split=15,
                                                criterion='entropy',max_depth=5, min_samples_leaf=23, warm_start=False)
estimator_final.fit(x_scaled,y)

<h2>Transforming test data set </h2>
We'll need to do the same manipulations (fitting category data, dealing with missing values) for test data set as we did for train dataset

In [None]:
df_test.head()

In [None]:
# Copy the object-dtype columns of the test data into a stand-alone DataFrame.
obj_df_test = df_test.select_dtypes(include=['object']).copy()


In [None]:
# Fill the missing Self_Employed values with "No", the same value used for the training data.
obj_df_test = obj_df_test.fillna({"Self_Employed": "No"})


In [None]:
obj_df_test['Gender'].value_counts()


In [None]:
# Fill the missing Gender values with "Male", the same value used for the training data.
# NOTE: value_counts() is not the last expression of the cell, so its result is not displayed.
obj_df_test['Gender'].value_counts()
obj_df_test = obj_df_test.fillna({"Gender": "Male"})


In [None]:
# Fill the missing Dependents values with "0", the same value used for the training data.
# NOTE: value_counts() is not the last expression of the cell, so its result is not displayed.
obj_df_test['Dependents'].value_counts()
obj_df_test = obj_df_test.fillna({"Dependents": "0"})


In [None]:
obj_df_test['Self_Employed'].value_counts()


In [None]:
# NOTE: repeats the Self_Employed fill already applied to obj_df_test earlier,
# so no missing values are left for it to change.
obj_df_test = obj_df_test.fillna({"Self_Employed": "No"})


In [None]:
obj_df_test[obj_df_test.isnull().any(axis=1)]


In [None]:

# Integer-encode the categorical columns of the test data: cast each column to
# the pandas 'category' dtype and keep only the category codes.
for column in ("Gender", "Married", "Dependents", "Education",
               "Self_Employed", "Property_Area"):
    obj_df_test[column] = obj_df_test[column].astype('category').cat.codes


In [None]:
obj_df_test.head()

In [None]:
obj_df_test.shape

In [None]:
# Numeric part of the test data. 'number' selects every numeric dtype, so the
# selection does not depend on which integer width the 'int' alias maps to on
# the current platform.
int_df_test = df_test.select_dtypes(include='number').copy()

# Impute missing values: the column mean for LoanAmount and Loan_Amount_Term
# (Series.mean skips NaN), and 1 for a missing Credit_History.
int_df_test = int_df_test.fillna({
    "LoanAmount": int_df_test["LoanAmount"].mean(),
    "Loan_Amount_Term": int_df_test["Loan_Amount_Term"].mean(),
    "Credit_History": 1,
})


In [None]:
# Rows of the numeric test data that still contain a missing value. The mask is
# built from int_df_test itself, not from the training frame.
int_df_test[int_df_test.isnull().any(axis=1)]


In [None]:
# Put the numeric and the encoded categorical test columns side by side,
# keeping the rows whose index is present in both frames (join='inner').
result_test = pd.concat([int_df_test, obj_df_test], axis=1, join='inner')

In [None]:
result_test[result_test.isnull().any(axis=1)]


In [None]:
result_test.shape

In [None]:
result_test.head()

In [None]:
# Test features: the same columns, in the same order, as the training feature table df_x.
X=result_test[['ApplicantIncome', 'CoapplicantIncome', 'Loan_Amount_Term',
        'Credit_History', 'Gender', 'Married',
       'Dependents', 'Education', 'Self_Employed', 'Property_Area']]




In [None]:
x = X.values  # the test feature table as a numpy array
# Reuse the scaler that was fitted on the training features. Fitting a new
# MinMaxScaler on the test set would map the test values with different
# minima and maxima than the ones the model was trained with.
# NOTE: `x` and `x_scaled` now refer to the test data, not the training data.
x_scaled = min_max_scaler.transform(x)


In [None]:
# Predict the encoded Loan_Status for the scaled test features with the final model.
y_predict=estimator_final.predict(x_scaled)

In [None]:
# Wrap the predictions in a single-column DataFrame so they can be concatenated with Loan_ID.
predict_df=pd.DataFrame(y_predict)

Preparing resulting data frame for the output acceptable by competition

In [None]:
# Pair every test Loan_ID with its prediction; the two are aligned on the row index.
res=pd.concat([df_test['Loan_ID'], predict_df], axis=1, join='inner')

In [None]:
# Name the columns as the competition expects and translate the integer codes
# back to the original labels (0 -> "N", 1 -> "Y").
# The cell is idempotent: running it again leaves `res` unchanged.
res.columns = ['Loan_ID', 'Loan_Status']
res['Loan_Status'] = res['Loan_Status'].replace({0: "N", 1: "Y"})

In [None]:
res.head()

In [None]:
# Write the submission file without the index column.
# NOTE(review): the file name mentions "xgb", but the predictions come from
# estimator_final, which is a RandomForestClassifier; confirm the intended name.
res.to_csv('submission_for_loan_competition_xgb12.csv',index=False)