In [None]:
# -------------------------------------import libraries and data--------------------------------

In [None]:
# import libraries
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [None]:
# Load the training data from CSV into the notebook-wide DataFrame `df`
df = pd.read_csv(filepath_or_buffer='train_data.csv')

In [None]:
# ------------------------------------quick data exploration-------------------------------------

In [None]:
# Preview the first 10 rows of the dataset
df.head(n=10)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numerical columns.
# The `count` row excludes missing values, which is what the notes below rely on.
df.describe()

# A few inferences from the summary of numerical variables (as recorded by the author):
# LoanAmount has (614 – 592) 22 missing values.
# Loan_Amount_Term has (614 – 600) 14 missing values.
# Credit_History has (614 – 564) 50 missing values.
# We can also see that about 84% of applicants have a credit history. {How?
#     The mean of the Credit_History field is 0.84 (remember, Credit_History
#     has value 1 for those who have a credit history and 0 otherwise)}
# The ApplicantIncome distribution seems to be in line with expectations. Same with CoapplicantIncome.

In [None]:
# Frequency distributions for the non-numerical variables.
# A notebook cell renders only its LAST bare expression, so every table is printed
# explicitly; otherwise only the Loan_Status counts would be shown.
categorical_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Credit_History',
    'Property_Area',
    'Loan_Status',
]
for column in categorical_columns:
    print('{} frequencies:'.format(column))
    print(df[column].value_counts())
    print()

In [None]:
# ---------------------------------------Distribution analysis----------------------------------------

In [None]:
# Histogram of ApplicantIncome. It gets its own figure: without an explicit `ax`, the
# histogram and the box plot below would both draw on the current axes and overlap.
fig_income_hist, ax_income_hist = plt.subplots()
df['ApplicantIncome'].hist(bins=50, ax=ax_income_hist)
ax_income_hist.set_title('ApplicantIncome histogram')

# Box plot of ApplicantIncome on a separate figure
fig_income_box, ax_income_box = plt.subplots()
df.boxplot(column='ApplicantIncome', ax=ax_income_box)
ax_income_box.set_title('ApplicantIncome box plot')

# This confirms the presence of a lot of outliers/extreme values.
# This can be attributed to the income disparity in society. Part
# of this can be driven by the fact that we are looking at people with
# different education levels. Let us segregate them by Education
# (grouping with `by=` makes pandas open a new figure by itself):
df.boxplot(column='ApplicantIncome', by='Education')

# We can see that there is no substantial difference between the median income
# (the centre line of each box) of graduates and non-graduates. But there is a
# higher number of graduates with very high incomes, which appear as outliers.

In [None]:
# Histogram of LoanAmount on its own figure (otherwise the box plot below would be
# drawn on top of it, because both default to the current axes)
fig_loan_hist, ax_loan_hist = plt.subplots()
df['LoanAmount'].hist(bins=50, ax=ax_loan_hist)
ax_loan_hist.set_title('LoanAmount histogram')

# Box plot of LoanAmount on a separate figure
fig_loan_box, ax_loan_box = plt.subplots()
df.boxplot(column='LoanAmount', ax=ax_loan_box)
ax_loan_box.set_title('LoanAmount box plot')

# extreme values present

In [None]:
# ------------------------------Categorical variable analysis-----------------------------

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Number of applicants in each Credit_History class (least common first)
Credit_History_freq = df['Credit_History'].value_counts(ascending=True)
print('Frequency Table for Credit History:')
print(Credit_History_freq)

# Approval rate per class: Loan_Status is mapped Y -> 1 / N -> 0 and averaged within each class
P_by_CreditHistory_class = df.pivot_table(values='Loan_Status', index=['Credit_History'], aggfunc=lambda x: x.map(
    {'Y': 1, 'N': 0}).mean())
print('\nProbability of getting loan for each Credit History class:')
print(P_by_CreditHistory_class)
# Collapse the one-column frame to a Series so that it plots as a single bar series
P_by_CreditHistory_class = P_by_CreditHistory_class.squeeze()

# Two bar charts side by side. Each Series is drawn on an explicit axis and the labels are
# set AFTER plotting: a pandas bar plot writes the index name (when it has one) into the
# x-label, which would overwrite a label set beforehand.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))
Credit_History_freq.plot(kind='bar', ax=ax1)
ax1.set_xlabel('Credit_History')
ax1.set_ylabel('Count of Applicants')
ax1.set_title("Applicants by Credit_History")

P_by_CreditHistory_class.plot(kind='bar', ax=ax2)
ax2.set_xlabel('Credit_History')
ax2.set_ylabel('Probability of getting loan')
ax2.set_title("Probability of getting loan by credit history")
fig.tight_layout()
# chances of getting a loan are eight-fold if the applicant has a valid credit history

# Stacked counts of Loan_Status per class. DataFrame.plot opens a new figure for each call,
# so the layout is tightened on the figure that owns the returned axes.
CreditHistory_LoanStatus_comb_stacked = pd.crosstab(df['Credit_History'], df['Loan_Status'])
stacked_ax = CreditHistory_LoanStatus_comb_stacked.plot(kind='bar', stacked=True, color=['red', 'blue'], grid=False)
stacked_ax.figure.tight_layout()

CreditHistoryGender_LoanStatus_comb_stacked = pd.crosstab([df['Credit_History'], df['Gender']], df['Loan_Status'])
stacked_ax = CreditHistoryGender_LoanStatus_comb_stacked.plot(kind='bar', stacked=True, color=['red', 'blue'], grid=False)
stacked_ax.figure.tight_layout()

In [None]:
# Number of applicants per marital status (least common first)
Married_freq = df['Married'].value_counts(ascending=True)
print('Frequency Table for Married:')
print(Married_freq)

# Approval rate per marital status: Loan_Status is mapped Y -> 1 / N -> 0 and averaged per group
P_by_MaritalStat = df.pivot_table(values='Loan_Status', index=['Married'], aggfunc=lambda x: x.map(
    {'Y': 1, 'N': 0}).mean())
print('\nProbability of getting loan for Marital statuses:')
print(P_by_MaritalStat)
# Collapse the one-column frame to a Series so that it plots as a single bar series
P_by_MaritalStat = P_by_MaritalStat.squeeze()

# Two bar charts side by side. Each Series is drawn on an explicit axis and the labels are
# set AFTER plotting: a pandas bar plot writes the index name (when it has one) into the
# x-label, which would overwrite a label set beforehand.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))
Married_freq.plot(kind='bar', ax=ax1)
ax1.set_xlabel('Marital status')
ax1.set_ylabel('Count of Applicants')
ax1.set_title("Applicants by Marital status")

P_by_MaritalStat.plot(kind='bar', ax=ax2)
ax2.set_xlabel('Married')
ax2.set_ylabel('Probability of getting loan')
ax2.set_title("Probability of getting loan by marital status")
fig.tight_layout()
# chances of getting a loan are mildly higher (10 %) if the applicant is married

# Stacked counts of Loan_Status per group. DataFrame.plot opens a new figure for each call,
# so the layout is tightened on the figure that owns the returned axes.
MaritalStat_LoanStatus_comb_stacked = pd.crosstab(df['Married'], df['Loan_Status'])
stacked_ax = MaritalStat_LoanStatus_comb_stacked.plot(kind='bar', stacked=True, color=['red', 'blue'], grid=False)
stacked_ax.figure.tight_layout()

MarriedGender_LoanStatus_comb_stacked = pd.crosstab([df['Married'], df['Gender']], df['Loan_Status'])
stacked_ax = MarriedGender_LoanStatus_comb_stacked.plot(kind='bar', stacked=True, color=['red', 'blue'], grid=False)
stacked_ax.figure.tight_layout()

In [None]:
# Number of applicants per self-employment status (least common first)
SelfEmployed_freq = df['Self_Employed'].value_counts(ascending=True)
print('Frequency Table for Self_Employed:')
print(SelfEmployed_freq)

# Approval rate per status: Loan_Status is mapped Y -> 1 / N -> 0 and averaged per group
P_by_SelfEmployed = df.pivot_table(values='Loan_Status', index=['Self_Employed'], aggfunc=lambda x: x.map(
    {'Y': 1, 'N': 0}).mean())
print('\nProbability of getting loan if Self-Employed:')
print(P_by_SelfEmployed)
# Collapse the one-column frame to a Series so that it plots as a single bar series
P_by_SelfEmployed = P_by_SelfEmployed.squeeze()

# Two bar charts side by side. Each Series is drawn on an explicit axis and the labels are
# set AFTER plotting: a pandas bar plot writes the index name (when it has one) into the
# x-label, which would overwrite a label set beforehand.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))
SelfEmployed_freq.plot(kind='bar', ax=ax1)
ax1.set_xlabel('Self-Employed')
ax1.set_ylabel('Count of Applicants')
ax1.set_title("Applicants by Self-Employed")

P_by_SelfEmployed.plot(kind='bar', ax=ax2)
ax2.set_xlabel('Self-Employed')
ax2.set_ylabel('Probability of getting loan')
ax2.set_title("Probability of getting loan if self-employed")
fig.tight_layout()
# identical chances of getting a loan for self-employed and non-self-employed applicants

# Stacked counts of Loan_Status per group. DataFrame.plot opens a new figure for each call,
# so the layout is tightened on the figure that owns the returned axes.
SelfEmpl_LoanStatus_comb_stacked = pd.crosstab(df['Self_Employed'], df['Loan_Status'])
stacked_ax = SelfEmpl_LoanStatus_comb_stacked.plot(kind='bar', stacked=True, color=['red', 'blue'], grid=False)
stacked_ax.figure.tight_layout()

SelfEmplGender_LoanStatus_comb_stacked = pd.crosstab([df['Self_Employed'], df['Gender']], df['Loan_Status'])
stacked_ax = SelfEmplGender_LoanStatus_comb_stacked.plot(kind='bar', stacked=True, color=['red', 'blue'], grid=False)
stacked_ax.figure.tight_layout()

In [None]:
# Number of applicants per property area (least common first)
PropertyArea_freq = df['Property_Area'].value_counts(ascending=True)
print('Frequency Table for property areas:')
print(PropertyArea_freq)

# Approval rate per area: Loan_Status is mapped Y -> 1 / N -> 0 and averaged per group
P_by_PropertyArea = df.pivot_table(values='Loan_Status', index=['Property_Area'], aggfunc=lambda x: x.map(
    {'Y': 1, 'N': 0}).mean())
print('\nProbability of getting loan by property area:')
print(P_by_PropertyArea)
# Collapse the one-column frame to a Series so that it plots as a single bar series
P_by_PropertyArea = P_by_PropertyArea.squeeze()

# Two bar charts side by side. Each Series is drawn on an explicit axis and the labels are
# set AFTER plotting: a pandas bar plot writes the index name (when it has one) into the
# x-label, which would overwrite a label set beforehand.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))
PropertyArea_freq.plot(kind='bar', ax=ax1)
ax1.set_xlabel('Property area')
ax1.set_ylabel('Count of Applicants')
ax1.set_title("Applicants by property area")

P_by_PropertyArea.plot(kind='bar', ax=ax2)
ax2.set_xlabel('Property area')
ax2.set_ylabel('Probability of getting loan')
ax2.set_title("Probability of getting loan by property area")
fig.tight_layout()
# chances of getting a loan are best for applicants living in semiurban areas

# Stacked counts of Loan_Status per group. DataFrame.plot opens a new figure for each call,
# so the layout is tightened on the figure that owns the returned axes.
PropertyArea_LoanStatus_comb_stacked = pd.crosstab(df['Property_Area'], df['Loan_Status'])
stacked_ax = PropertyArea_LoanStatus_comb_stacked.plot(kind='bar', stacked=True, color=['red', 'blue'], grid=False)
stacked_ax.figure.tight_layout()

PropertyAreaGender_LoanStatus_comb_stacked = pd.crosstab([df['Property_Area'], df['Gender']], df['Loan_Status'])
stacked_ax = PropertyAreaGender_LoanStatus_comb_stacked.plot(kind='bar', stacked=True, color=['red', 'blue'], grid=False)
stacked_ax.figure.tight_layout()

PropertyAreaMarried_LoanStatus_comb_stacked = pd.crosstab([df['Property_Area'], df['Married']], df['Loan_Status'])
stacked_ax = PropertyAreaMarried_LoanStatus_comb_stacked.plot(kind='bar', stacked=True, color=['red', 'blue'], grid=False)
stacked_ax.figure.tight_layout()
# semiurban + married is the best combo; rural + not married and urban + not married are the worst

PropertyAreaEducation_LoanStatus_comb_stacked = pd.crosstab([df['Property_Area'], df['Education']], df['Loan_Status'])
stacked_ax = PropertyAreaEducation_LoanStatus_comb_stacked.plot(kind='bar', stacked=True, color=['red', 'blue'], grid=False)
stacked_ax.figure.tight_layout()
# urban + not graduated and rural + not graduated are the worst combos for chances of getting a loan
# semiurban + graduated is the best combo

In [None]:
# Number of applicants per education level (least common first)
Education_freq = df['Education'].value_counts(ascending=True)
print('Frequency Table for education:')
print(Education_freq)

# Approval rate per education level: Loan_Status is mapped Y -> 1 / N -> 0 and averaged per group
P_by_Education = df.pivot_table(values='Loan_Status', index=['Education'], aggfunc=lambda x: x.map(
    {'Y': 1, 'N': 0}).mean())
print('\nProbability of getting loan by education:')
print(P_by_Education)
# Collapse the one-column frame to a Series so that it plots as a single bar series
P_by_Education = P_by_Education.squeeze()

# Two bar charts side by side. Each Series is drawn on an explicit axis and the labels are
# set AFTER plotting: a pandas bar plot writes the index name (when it has one) into the
# x-label, which would overwrite a label set beforehand.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))
Education_freq.plot(kind='bar', ax=ax1)
ax1.set_xlabel('Education')
ax1.set_ylabel('Count of Applicants')
ax1.set_title("Applicants by education")

P_by_Education.plot(kind='bar', ax=ax2)
ax2.set_xlabel('Education')
ax2.set_ylabel('Probability of getting loan')
ax2.set_title("Probability of getting loan by education")
fig.tight_layout()
# chances of getting a loan are better for more educated applicants

# Stacked counts of Loan_Status per group. DataFrame.plot opens a new figure for each call,
# so the layout is tightened on the figure that owns the returned axes.
Education_LoanStatus_comb_stacked = pd.crosstab(df['Education'], df['Loan_Status'])
stacked_ax = Education_LoanStatus_comb_stacked.plot(kind='bar', stacked=True, color=['red', 'blue'], grid=False)
stacked_ax.figure.tight_layout()

EducationGender_LoanStatus_comb_stacked = pd.crosstab([df['Education'], df['Gender']], df['Loan_Status'])
stacked_ax = EducationGender_LoanStatus_comb_stacked.plot(kind='bar', stacked=True, color=['red', 'blue'], grid=False)
stacked_ax.figure.tight_layout()
# graduated + male lifts chances of getting loan compared to other combos

EducationMarried_LoanStatus_comb_stacked = pd.crosstab([df['Education'], df['Married']], df['Loan_Status'])
stacked_ax = EducationMarried_LoanStatus_comb_stacked.plot(kind='bar', stacked=True, color=['red', 'blue'], grid=False)
stacked_ax.figure.tight_layout()
# graduated + married lifts chances of getting loan compared to other combos

In [None]:
# ---------------------------------------------Data munging-------------------------------------------------------

In [None]:
# --------------------------------Checking missing values---------------------------------------

In [None]:
# Count the missing (null) entries in every column of the dataframe
df.isnull().sum()

# Education, ApplicantIncome, CoapplicantIncome, PropertyArea, LoanStatus have 0 missing values

In [None]:
# Self_Employed

# 32 missing values
# 500 non-self-employed (86 %) and 82 self-employed (14 %)

# It seems that median incomes of self-employed persons are higher than those of non-self-employed people
print(df.pivot_table(values='ApplicantIncome', index='Credit_History', columns='Self_Employed', aggfunc='median'))

# Median ApplicantIncome per employment status. Rows are selected with a boolean mask and the
# income column by label; combining the income column itself with `&` would bitwise-AND the
# income values with the mask instead of filtering rows.
median_income_self_employed = df.loc[df['Self_Employed'] == 'Yes', 'ApplicantIncome'].median()
median_income_not_self_employed = df.loc[df['Self_Employed'] == 'No', 'ApplicantIncome'].median()
print('Median ApplicantIncome (self-employed):', median_income_self_employed)
print('Median ApplicantIncome (not self-employed):', median_income_not_self_employed)
# author's recorded values: 5677 for self-employed, 3588 for non-self-employed

# If a person's income exceeds the median income of the self-employed, the missing status
# becomes 'Yes'. Assigning through df.loc writes into `df` itself; calling
# `.loc[...].fillna(..., inplace=True)` on a column selection only modifies a temporary copy.
status_missing = df['Self_Employed'].isnull()
above_self_employed_median = df['ApplicantIncome'] > median_income_self_employed
df.loc[status_missing & above_self_employed_median, 'Self_Employed'] = 'Yes'
# The rest get the status of non-self-employed
df['Self_Employed'] = df['Self_Employed'].fillna('No')

In [None]:
# LoanAmount

# Simpler alternative (not used): replace missing values with the column mean,
# i.e. df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].mean())

# A key hypothesis is that whether a person is educated or self-employed
# can combine to give a good estimate of the loan amount.
df.boxplot(column='LoanAmount', by=['Education', 'Self_Employed'])
plt.tight_layout()
# some variation in the median loan amount between the groups; this can be used to impute the values

# Pivot table holding the median LoanAmount for every combination of
# Self_Employed (rows) and Education (columns)
table = df.pivot_table(values='LoanAmount', index='Self_Employed', columns='Education', aggfunc='median')


def fage(x):
    """Return the group-median LoanAmount for one dataframe row.

    Looks up the module-level pivot ``table`` with the row's ``Self_Employed`` and
    ``Education`` values. Raises ``KeyError`` if either value is not a label of
    ``table`` (e.g. a still-missing Self_Employed) -- this cell assumes Self_Employed
    was imputed beforehand.
    """
    return table.loc[x['Self_Employed'], x['Education']]


# Replace missing LoanAmounts with the median of the row's group. The assignment goes through
# df.loc so that `df` itself is updated, and the guard keeps the cell re-runnable: applying
# `fage` to zero rows yields an empty DataFrame, which cannot be used as a fill value.
loan_amount_missing = df['LoanAmount'].isnull()
if loan_amount_missing.any():
    df.loc[loan_amount_missing, 'LoanAmount'] = df.loc[loan_amount_missing].apply(fage, axis=1)

In [None]:
# Next Married

print(df['Married'].value_counts())
# Circa 65 % are married and circa 35 % are not. 3 missing values
# Let's replace 2 NaNs in the Married column with 'Yes' and 1 with 'No'.
# `limit=2` fills only the first two missing entries; the second fillna handles the rest.
# The result is assigned back to the column: an in-place fill on `df['Married']` is a chained
# assignment and is not guaranteed to reach `df` (it does not under pandas Copy-on-Write).
df['Married'] = df['Married'].fillna('Yes', limit=2).fillna('No')

In [None]:
# Gender

print(df['Gender'].value_counts())
# About 82 % are males and 18 % females, 13 missing values
# Let's replace the missing values in the same proportions: 11 'Male' and 2 'Female'.
# `limit=11` fills only the first eleven missing entries; the second fillna handles the rest.
# The result is assigned back to the column: an in-place fill on `df['Gender']` is a chained
# assignment and is not guaranteed to reach `df` (it does not under pandas Copy-on-Write).
df['Gender'] = df['Gender'].fillna('Male', limit=11).fillna('Female')

In [None]:
# Dependents

# Hypothesis: the number of dependents depends on marital status, education and area of property
print(pd.crosstab(df['Dependents'], [df['Married'], df['Education'], df['Property_Area']]))
# 0 dependents is most common in the dataset,
# but married graduates living in semiurban or urban
# areas are more likely to have 1 or 2 dependents than others

# ApplicantIncome might affect the number of dependents
print(df.pivot_table(values='ApplicantIncome', index='Dependents', columns='Education', aggfunc='median'))
# graduates living in rural areas whose income is over 5000 are more likely to have three or more dependents

# Now let's fill the missing values. Every step assigns through df.loc with a boolean mask so
# that `df` itself is modified; filling a `.loc[...]` selection in place only changes a copy.

# First: rural graduates with an income over 5000 get '3+'
rural_high_income_graduate = (
    (df['Property_Area'] == 'Rural')
    & (df['Education'] == 'Graduate')
    & (df['ApplicantIncome'] > 5000)
)
df.loc[df['Dependents'].isnull() & rural_high_income_graduate, 'Dependents'] = '3+'

# Second: married graduates in urban or semiurban areas get a value drawn at random from the
# observed (non-missing) Dependents values. Missing entries are excluded with dropna(), since a
# `!= np.nan` comparison is always True. The generator is seeded so the fill is reproducible.
married_nonrural_graduate = (
    (df['Property_Area'] != 'Rural')
    & (df['Education'] == 'Graduate')
    & (df['Married'] == 'Yes')
)
needs_random_fill = df['Dependents'].isnull() & married_nonrural_graduate
observed_dependents = df['Dependents'].dropna().to_numpy()
if needs_random_fill.any() and len(observed_dependents) > 0:
    dependents_rng = np.random.default_rng(42)
    df.loc[needs_random_fill, 'Dependents'] = dependents_rng.choice(
        observed_dependents, size=int(needs_random_fill.sum()))

# The rest of the missing values are replaced with the most common value (the mode)
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])

In [None]:
# Loan_Amount_Term

# Missing terms are replaced by the most common duration (mode). The filled column is assigned
# back: an in-place fill on `df['Loan_Amount_Term']` is a chained assignment and is not
# guaranteed to reach `df` (it does not under pandas Copy-on-Write).
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0])

In [None]:
# Credit_History

print(df['Credit_History'].value_counts())
# For circa 84 % the credit history meets the guidelines, for 16 % it doesn't
# 50 missing values

# Let's replace 8 (16 %) of the missing values with 0.0 and the other 42 with 1.0.
# `limit=8` fills only the first eight missing entries; the second fillna handles the rest.
# The result is assigned back to the column: an in-place fill on `df['Credit_History']` is a
# chained assignment and is not guaranteed to reach `df` (it does not under pandas Copy-on-Write).
df['Credit_History'] = df['Credit_History'].fillna(0.0, limit=8).fillna(1.0)

In [None]:
# ------------------------------------Handling extreme values--------------------------------------

In [None]:
# Loan Amount

# Each plot gets its own figure; without an explicit `ax` all three would be drawn on the
# same (current) axes and overlap.
fig_loan_hist, ax_loan_hist = plt.subplots()
df['LoanAmount'].hist(bins=50, ax=ax_loan_hist)
ax_loan_hist.set_title('LoanAmount histogram')

fig_loan_box, ax_loan_box = plt.subplots()
df.boxplot(column='LoanAmount', ax=ax_loan_box)
ax_loan_box.set_title('LoanAmount box plot')
# Obvious extreme values
# let's try a log transformation to nullify their effect
df['LoanAmount_log'] = np.log(df['LoanAmount'])

fig_loan_log, ax_loan_log = plt.subplots()
df['LoanAmount_log'].hist(bins=20, ax=ax_loan_log)
ax_loan_log.set_title('log(LoanAmount) histogram')

In [None]:
# Applicant Income

# Each plot gets its own figure; without an explicit `ax` all of them would be drawn on the
# same (current) axes and overlap.
fig_income_hist, ax_income_hist = plt.subplots()
df['ApplicantIncome'].hist(bins=50, ax=ax_income_hist)
ax_income_hist.set_title('ApplicantIncome histogram')

fig_income_box, ax_income_box = plt.subplots()
df.boxplot(column='ApplicantIncome', ax=ax_income_box)
ax_income_box.set_title('ApplicantIncome box plot')
# Obvious extreme values

# One intuition can be that some applicants have a lower income but strong supporting co-applicants.
# So it might be a good idea to combine both incomes as total income and take a log transformation of the same.
df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']

fig_total_hist, ax_total_hist = plt.subplots()
df['TotalIncome'].hist(bins=50, ax=ax_total_hist)
ax_total_hist.set_title('TotalIncome histogram')

df['TotalIncome_log'] = np.log(df['TotalIncome'])

fig_total_log, ax_total_log = plt.subplots()
df['TotalIncome_log'].hist(bins=20, ax=ax_total_log)
ax_total_log.set_title('log(TotalIncome) histogram')

In [None]:
# New feature: loan amount relative to total income, a rough measure of the
# applicant's capability of paying back the loan
df['LoanAmount/TotalIncome'] = df['LoanAmount'].div(df['TotalIncome'])

In [None]:
# -----------------------------Building predictive models-----------------------------------

In [None]:
# scikit-learn estimators need numeric inputs, so every categorical column (and the
# Loan_Status target) is replaced by integer codes
from sklearn.preprocessing import LabelEncoder
var_mod = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area','Loan_Status']
le = LabelEncoder()
# The single encoder is re-fitted for each column, so after the loop it only holds the
# classes of the last column processed
for column in var_mod:
    encoded_values = le.fit_transform(df[column])
    df[column] = encoded_values
df.dtypes

In [None]:
#Import models from scikit learn module:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics

In [None]:
#Generic function for making a classification model and assessing performance:
def classification_model(model, data, predictors, outcome, n_splits=5):
    """Fit ``model``, print its training accuracy and its k-fold cross-validation score.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``score``
        Fitted in place; on return it is fitted on the whole of ``data``.
    data : pandas.DataFrame
        Frame holding both the predictor columns and the outcome column.
    predictors : list of str
        Column names used as model inputs.
    outcome : str
        Column name of the target.
    n_splits : int, default 5
        Number of cross-validation folds passed to ``KFold``.

    Returns
    -------
    None
        The two scores are printed, not returned.
    """
    features = data[predictors]
    target = data[outcome]

    #Fit the model on the full data and predict the same rows (training accuracy):
    model.fit(features, target)
    predictions = model.predict(features)

    #Print accuracy; accuracy_score expects (y_true, y_pred) in that order
    accuracy = metrics.accuracy_score(target, predictions)
    print ("Accuracy : %s" % "{0:.3%}".format(accuracy))

    #Perform k-fold cross-validation; folds are taken in row order (no shuffling)
    kf = KFold(n_splits=n_splits)
    fold_scores = []
    for train, test in kf.split(data):
        #Train on the rows of this fold's training split only
        model.fit(features.iloc[train, :], target.iloc[train])

        #Record the accuracy (model.score) on the held-out rows of this fold
        fold_scores.append(model.score(features.iloc[test, :], target.iloc[test]))

    print ("Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(fold_scores)))

    #Fit the model again on all rows so that it can be referred to outside the function:
    model.fit(features, target)

In [None]:
# ------------------------------------Logistic regression---------------------------------------

In [None]:
# Hypotheses -- the chances of getting a loan should be higher for:
#   - applicants having a credit history (observed during exploration)
#   - applicants with higher applicant and co-applicant incomes
#   - applicants with a higher education level
#   - properties in urban areas with high growth perspectives

outcome_var = 'Loan_Status'
model = LogisticRegression()

# Predictor sets to evaluate, in order, with the scores the author recorded for each
logistic_predictor_sets = [
    # Accuracy : 80.945%, Cross-Validation Score : 80.946%
    ['Credit_History'],
    # Accuracy : 80.945%, Cross-Validation Score : 80.946%
    ['Credit_History','Education','Married','Self_Employed','Property_Area'],
]
for predictor_var in logistic_predictor_sets:
    classification_model(model, df, predictor_var, outcome_var)

# Neither accuracy nor cross-validation score improved with additional or alternative variables

In [None]:
# -----------------------------------Decision tree-------------------------------------------------

In [None]:
# Fixed random_state so that repeated runs build the same tree (the scores quoted in the
# comments below were recorded by the author from earlier runs without a fixed seed).
model = DecisionTreeClassifier(random_state=42)
predictor_var = ['Credit_History', 'Gender','Married','Education']
classification_model(model, df,predictor_var,outcome_var)
# Accuracy : 80.945%, Cross-Validation Score : 80.946%

# Here the model based on categorical variables is unable to
# have an impact because Credit History is dominating over them. Let's try a few numerical variables:
# We can try different combinations of variables:
predictor_var = ['Credit_History','Loan_Amount_Term','LoanAmount_log']
classification_model(model, df,predictor_var,outcome_var)
# Accuracy : 89.088%, Cross-Validation Score : 69.535%

predictor_var = ['Credit_History', 'LoanAmount/TotalIncome', 'Loan_Amount_Term']
classification_model(model, df,predictor_var,outcome_var)
# Accuracy : 100.000%, Cross-Validation Score : 69.712%

predictor_var = ['Credit_History', 'LoanAmount/TotalIncome', 'Loan_Amount_Term', 'Property_Area']
classification_model(model, df,predictor_var,outcome_var)
# Accuracy : 100.000%, Cross-Validation Score : 71.172%

predictor_var = ['Credit_History', 'LoanAmount/TotalIncome', 'Loan_Amount_Term', 'Property_Area', 'Self_Employed']
classification_model(model, df,predictor_var,outcome_var)
# Accuracy : 100.000%, Cross-Validation Score : 71.661%

# Training accuracy rose to 100 % with the numerical variables, but the cross-validation
# score dropped from about 81 % to about 70 % -- the tree is overfitting the training data

In [None]:
# -------------------------------------Random forest----------------------------------------------

In [None]:
# Fixed seed so that the forests -- and therefore the feature ranking and the scores -- are
# reproducible between runs. The scores quoted in the comments below were recorded by the
# author from earlier runs without a fixed seed.
RF_RANDOM_STATE = 42

model = RandomForestClassifier(n_estimators=100, random_state=RF_RANDOM_STATE)
predictor_var = ['Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Loan_Amount_Term', 'Credit_History', 'Property_Area',
        'LoanAmount_log','TotalIncome_log']
classification_model(model, df,predictor_var,outcome_var)
# Accuracy : 100.000%, Cross-Validation Score : 78.015%

model = RandomForestClassifier(n_estimators=100, random_state=RF_RANDOM_STATE)
predictor_var = ['Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Loan_Amount_Term', 'Credit_History', 'Property_Area',
        'LoanAmount_log','TotalIncome_log', 'LoanAmount/TotalIncome']
classification_model(model, df,predictor_var,outcome_var)
# Accuracy : 100.000%, Cross-Validation Score : 78.339%


# Let's try to correct the overfitting problem

# Series of feature importances of the forest fitted just above, most important first.
# `model` and `predictor_var` must still be the ones from that last fit.
featimp = pd.Series(model.feature_importances_, index=predictor_var).sort_values(ascending=False)
print (featimp)

# Let's use the top 5 variables for creating a model. Also, we will constrain the
# random forest a little (fewer, shallower trees; larger minimum split size):
model = RandomForestClassifier(n_estimators=25, min_samples_split=25, max_depth=7, max_features=1,
                               random_state=RF_RANDOM_STATE)
predictor_var = ['Credit_History', 'LoanAmount/TotalIncome', 'TotalIncome_log','LoanAmount_log', 'Dependents']
classification_model(model, df,predictor_var,outcome_var)
# Accuracy : 82.573%, Cross-Validation Score : 80.292%

model = RandomForestClassifier(n_estimators=25, min_samples_split=25, max_depth=7, max_features=1,
                               random_state=RF_RANDOM_STATE)
predictor_var = ['Credit_History', 'LoanAmount/TotalIncome', 'TotalIncome_log','LoanAmount_log', 'Dependents', 'Property_Area']
classification_model(model, df,predictor_var,outcome_var)
# Accuracy : 83.876%, Cross-Validation Score : 80.620%

model = RandomForestClassifier(n_estimators=25, min_samples_split=25, max_depth=7, max_features=1,
                               random_state=RF_RANDOM_STATE)
predictor_var = ['Credit_History', 'LoanAmount/TotalIncome', 'Dependents', 'Property_Area']
classification_model(model, df,predictor_var,outcome_var)
# Accuracy : 83.876%, Cross-Validation Score : 81.109%
# Best Cross-Validation Score!!!