In [None]:
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Load the loan data set from the working directory; every later cell reads from `df`.
df = pd.read_csv('loan.csv')

In [None]:
# Peek at the first five rows to check the columns that were read in.
df.head(n=5)

In [None]:
## Are the assumptions for the model met?
## Sample size: 1000 (author's note; compare against the `count` row below)
## Outliers: yes (author's reading of the summary statistics below)
df.describe()

# This is the Sklearn Method

In [None]:
# Single-predictor setup: credit score is the only feature, the default flag is the target.
# NOTE(review): scikit-learn's LogisticRegression is believed to regularize by default, so
# these coefficients may differ from the statsmodels fit further down; confirm if that matters.
X = df.loc[:, ['Credit_score']]
y = df.loc[:, 'Default']
model = LogisticRegression()
model.fit(X, y)

In [None]:
# Fitted coefficient on Credit_score from the scikit-learn model fit above.
model.coef_

In [None]:
# Fitted intercept from the scikit-learn model fit above.
model.intercept_

In [None]:
# Predicted `Default` class label for an applicant with a credit score of 500.
# The query is a one-row DataFrame with the same column name used at fit time;
# a bare nested list makes scikit-learn warn that the input has no feature names.
model.predict(pd.DataFrame({'Credit_score': [500]}))[0]

In [None]:
# Class probabilities for an applicant with a credit score of 500.
# The query is a one-row DataFrame with the same column name used at fit time;
# a bare nested list makes scikit-learn warn that the input has no feature names.
model.predict_proba(pd.DataFrame({'Credit_score': [500]}))

# This is the statsmodels Method

In [None]:
## Simple logistic model: Credit Score is the only predictor (X), Default is the response (y).
## statsmodels does not add an intercept on its own, so a constant column is prepended here.
X = sm.add_constant(df[['Credit_score']])
y = df[['Default']]

In [None]:
# Fit the single-predictor logit and keep its in-sample predictions.
model = sm.Logit(y, X).fit()
y_pred = model.predict(X)

# Residuals: observed Default minus the model's prediction, row by row.
# The subtraction uses the `Default` column as a Series. `y` is a one-column
# DataFrame, and DataFrame - Series aligns the Series index against the column
# labels, which yields an all-NaN frame instead of per-row residuals.
resid = df['Default'] - y_pred

print(model.summary())

In [None]:
# Model predictions against credit score, with red reference lines at 0 and 1.
fig, ax = plt.subplots()
ax.scatter(df['Credit_score'], y_pred)
for bound in (0, 1):
    ax.axhline(y=bound, color='r', linestyle='-')
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(
    xlabel='Credit_score',
    ylabel='Predicted probability of Default',
    title='Simple logit: prediction vs. credit score',
)
plt.show()

In [None]:
# Score one applicant with a credit score of 500. The leading 1 fills the
# constant column that add_constant put in front of Credit_score.
model.predict(np.array([[1, 500]]))

# Let's Create a Full Logistic Regression Model

First we want to transform our categorical variables into dummy variables

In [None]:
# Indicator columns for one level of each categorical variable.
# dtype=int is explicit because recent pandas versions return boolean dummies
# by default. Boolean columns mixed with numeric ones turn the design matrix
# into an object array, which the VIF and Logit cells below cannot consume.
# The trailing space in 'Marital_status ' is part of the column label as used
# in this notebook (presumably it matches the CSV header; verify).
df['female'] = pd.get_dummies(df['Gender'], dtype=int)['Female']
df['married'] = pd.get_dummies(df['Marital_status '], dtype=int)['Married']
df['employed'] = pd.get_dummies(df['Emp_status'], dtype=int)['employed']

In [None]:
# First five rows again, now including the new indicator columns.
df.head(n=5)

In [None]:
# Full candidate predictor set, in the order the design-matrix columns will appear.
# 'Emp_duration ' carries a trailing space in its label.
predictors = [
    'Checking_amount', 'Term', 'Credit_score', 'Car_loan', 'Personal_loan', 'Home_loan',
    'Education_loan', 'Amount', 'Saving_amount', 'Emp_duration ', 'Age', 'No_of_credit_acc',
    'female', 'married', 'employed',
]
# Prepend the intercept column and keep the response as a one-column frame.
X = sm.add_constant(df[predictors])
y = df[['Default']]

In [None]:
# Variance inflation factor for every design-matrix column (the constant included).
# Cast to a float array once, up front: `.values` on a frame with mixed column
# dtypes (e.g. boolean dummies next to numeric columns) is an object array,
# which the regressions inside variance_inflation_factor cannot use. This also
# avoids rebuilding the array on every iteration.
design = X.values.astype(float)
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [variance_inflation_factor(design, i) for i in range(design.shape[1])],
})
print(vif_data)

It looks like Personal_loan is highly collinear with the other variables (its VIF is high). Let's drop it and check again.

In [None]:
# Same predictor set as the previous design matrix, minus Personal_loan.
predictors = [
    'Checking_amount', 'Term', 'Credit_score', 'Car_loan', 'Home_loan',
    'Education_loan', 'Amount', 'Saving_amount', 'Emp_duration ', 'Age', 'No_of_credit_acc',
    'female', 'married', 'employed',
]
# Prepend the intercept column and keep the response as a one-column frame.
X = sm.add_constant(df[predictors])
y = df[['Default']]

In [None]:
# Recompute the variance inflation factors for the current design matrix.
# Cast to a float array once, up front: `.values` on a frame with mixed column
# dtypes (e.g. boolean dummies next to numeric columns) is an object array,
# which the regressions inside variance_inflation_factor cannot use. This also
# avoids rebuilding the array on every iteration.
design = X.values.astype(float)
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [variance_inflation_factor(design, i) for i in range(design.shape[1])],
})
print(vif_data)

Looks good now! We will always ignore the constant term when looking at the VIFs.

In [None]:
# Fit the logit on the current design matrix and print the coefficient table.
logit_spec = sm.Logit(endog=y, exog=X)
model = logit_spec.fit()
print(model.summary())

It appears that female is not statistically significant. Let's drop it

In [None]:
# Refit with `female` removed from the predictor set.
predictors = [
    'Checking_amount', 'Term', 'Credit_score', 'Car_loan', 'Home_loan',
    'Education_loan', 'Amount', 'Saving_amount', 'Emp_duration ', 'Age', 'No_of_credit_acc',
    'married', 'employed',
]
X = sm.add_constant(df[predictors])
y = df[['Default']]
logit_spec = sm.Logit(endog=y, exog=X)
model = logit_spec.fit()
print(model.summary())

Married is insignificant. We will drop that one and refit

In [None]:
# Refit with `married` removed from the predictor set.
predictors = [
    'Checking_amount', 'Term', 'Credit_score', 'Car_loan', 'Home_loan',
    'Education_loan', 'Amount', 'Saving_amount', 'Emp_duration ', 'Age', 'No_of_credit_acc',
    'employed',
]
X = sm.add_constant(df[predictors])
y = df[['Default']]
logit_spec = sm.Logit(endog=y, exog=X)
model = logit_spec.fit()
print(model.summary())

Employment Duration is insignificant. Drop and refit

In [None]:
# Refit with 'Emp_duration ' removed from the predictor set.
predictors = [
    'Checking_amount', 'Term', 'Credit_score', 'Car_loan', 'Home_loan',
    'Education_loan', 'Amount', 'Saving_amount', 'Age', 'No_of_credit_acc',
    'employed',
]
X = sm.add_constant(df[predictors])
y = df[['Default']]
logit_spec = sm.Logit(endog=y, exog=X)
model = logit_spec.fit()
print(model.summary())

In [None]:
# Refit with No_of_credit_acc removed from the predictor set
# (presumably it was insignificant in the previous summary; confirm against that output).
predictors = [
    'Checking_amount', 'Term', 'Credit_score', 'Car_loan', 'Home_loan',
    'Education_loan', 'Amount', 'Saving_amount', 'Age', 'employed',
]
X = sm.add_constant(df[predictors])
y = df[['Default']]
logit_spec = sm.Logit(endog=y, exog=X)
model = logit_spec.fit()
print(model.summary())

In [None]:
# Refit with Amount removed from the predictor set
# (presumably it was insignificant in the previous summary; confirm against that output).
predictors = [
    'Checking_amount', 'Term', 'Credit_score', 'Car_loan', 'Home_loan',
    'Education_loan', 'Saving_amount', 'Age', 'employed',
]
X = sm.add_constant(df[predictors])
y = df[['Default']]
logit_spec = sm.Logit(endog=y, exog=X)
model = logit_spec.fit()
print(model.summary())

Let's see if Jerry would get approved for a loan. He has 700 in checking, a 25-month term, a 600 credit score, a car loan, no home loan, an education loan, 2000 in savings, is 40 years old, and is unemployed. The model returns his predicted probability of default.

In [None]:
# Jerry's profile keyed by design-matrix column, so every value is tied to a
# named predictor instead of a position in an anonymous list.
jerry_profile = {
    'const': 1,
    'Checking_amount': 700,
    'Term': 25,
    'Credit_score': 600,
    'Car_loan': 1,
    'Home_loan': 0,
    'Education_loan': 1,
    'Saving_amount': 2000,
    'Age': 40,
    'employed': 0,
}

# `model` is reassigned by many cells in this notebook; refuse to score if the
# model currently in memory was not fit on exactly these columns.
fitted_columns = model.model.exog_names
if set(fitted_columns) != set(jerry_profile):
    raise ValueError(
        f"model columns {fitted_columns} do not match the applicant profile; "
        "re-run the final model cell before scoring"
    )

# Order the values the way the fitted model expects them, then score.
jerry = [jerry_profile[name] for name in fitted_columns]
model.predict(jerry)

Jane has 1000 in checking, an 18-month loan term, an 800 credit score, a car loan, a home loan, and an education loan,
3500 in savings, is 39 years old, and is employed.

In [None]:
# Jane's profile keyed by design-matrix column, so every value is tied to a
# named predictor instead of a position in an anonymous list.
jane_profile = {
    'const': 1,
    'Checking_amount': 1000,
    'Term': 18,
    'Credit_score': 800,
    'Car_loan': 1,
    'Home_loan': 1,
    'Education_loan': 1,
    'Saving_amount': 3500,
    'Age': 39,
    'employed': 1,
}

# `model` is reassigned by many cells in this notebook; refuse to score if the
# model currently in memory was not fit on exactly these columns.
fitted_columns = model.model.exog_names
if set(fitted_columns) != set(jane_profile):
    raise ValueError(
        f"model columns {fitted_columns} do not match the applicant profile; "
        "re-run the final model cell before scoring"
    )

# Order the values the way the fitted model expects them, then score.
jane = [jane_profile[name] for name in fitted_columns]
model.predict(jane)