In [74]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
# Turn on rpy2's pandas conversion layer before any R objects are loaded below.
pandas2ri.activate()

## Importing the data set

In [8]:
# Load the serialized R data frame with R's own readRDS, then convert it to pandas.
readRDS = robjects.r['readRDS']
r_frame = readRDS('loan_data.rds')
# NOTE(review): ri2py is the rpy2 2.x converter name; newer rpy2 releases appear to
# have renamed it (rpy2py) -- confirm against the installed rpy2 version.
df = pandas2ri.ri2py(r_frame)

## Describing the data set

In [9]:
# Summarise the object-dtype (categorical) columns: count, number of unique values,
# most frequent value and its frequency.
# The builtin `object` is used because the `np.object` alias was deprecated in
# NumPy 1.20 and removed in 1.24; both select exactly the same columns.
df.describe(include=[object])

Unnamed: 0,grade,home_ownership,emp_cat,ir_cat
count,29091,29091,29091,29091
unique,7,4,5,5
top,A,RENT,0-15,0-8
freq,9649,14692,25642,7130


In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns only.
df.describe(include=[np.number])

Unnamed: 0,loan_status,loan_amnt,annual_inc,age
count,29091.0,29091.0,29091.0,29091.0
mean,0.110928,9593.663848,66964.89,27.69812
std,0.314048,6323.467488,53253.18,6.194617
min,0.0,500.0,4000.0,20.0
25%,0.0,5000.0,40000.0,23.0
50%,0.0,8000.0,56400.0,26.0
75%,0.0,12250.0,80000.0,30.0
max,1.0,35000.0,2039784.0,94.0


In [118]:
# Pick a near-square n-by-m subplot grid with at least one cell per non-object column.
l = df.select_dtypes(exclude=[object]).shape[1]
# max(1, ...) keeps the grid valid when there are no such columns; without it n would
# be 0 and the division below would raise ZeroDivisionError.
n = max(1, int(np.floor(np.sqrt(l))))
m = max(1, int(np.ceil(l / n)))

In [123]:
# Display the subplot axes as a flat 1-D array.
# NOTE(review): `ax` is only assigned in a later cell of this notebook, so this cell
# raises NameError under Restart & Run All -- consider removing it or moving it below
# the plotting cell.
ax.flatten()

array([<matplotlib.axes._subplots.AxesSubplot object at 0x7ffaa37a5b70>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7ffaa37cee10>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7ffaa377f4e0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7ffaa3725b70>], dtype=object)

In [134]:
# Draw one box plot per non-object column on the n-by-m grid sized in the earlier cell.
# squeeze=False makes plt.subplots return a 2-D array even for a 1x1 grid, so
# flatten() always works (a bare Axes object has no flatten method).
f, ax = plt.subplots(n, m, squeeze=False)
ax = ax.flatten()
numeric_df = df.select_dtypes(exclude=[object])
# DataFrame.items() yields (column name, Series) pairs; it replaces iteritems(),
# which was removed in pandas 2.0.
for axis, (name, series) in zip(ax, numeric_df.items()):
    series.plot(kind='box', ax=axis)
# Hide any grid cells left empty when n * m exceeds the number of plotted columns.
for axis in ax[numeric_df.shape[1]:]:
    axis.set_visible(False)
plt.tight_layout()

<IPython.core.display.Javascript object>

In [67]:
# Mean and standard deviation of every numeric feature within each loan_status group.
# Selecting the numeric columns first avoids the TypeError pandas >= 2.0 raises when
# asked to average string columns (older versions silently dropped them), and the
# string aggregator names replace the deprecated practice of passing np.mean / np.std.
df.select_dtypes(include=[np.number]).groupby("loan_status").agg(["mean", "std"])

Unnamed: 0_level_0,loan_amnt,loan_amnt,annual_inc,annual_inc,age,age
Unnamed: 0_level_1,mean,std,mean,std,mean,std
loan_status,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,9619.234264,6307.620056,67937.626557,54029.038933,27.733955,6.208826
1,9388.720174,6446.406294,59168.548119,45837.568727,27.410908,6.072846


## Getting dummy variables

In [142]:
# One-hot encode the categorical columns. This rebinds `df`, so the original label
# columns (e.g. `grade`) are replaced by indicator columns such as `grade_A`.
df = pd.get_dummies(df)

In [143]:
# Inspect the column names produced by the one-hot encoding.
df.columns

Index(['loan_status', 'loan_amnt', 'annual_inc', 'age', 'grade_A', 'grade_B',
       'grade_C', 'grade_D', 'grade_E', 'grade_F', 'grade_G',
       'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN',
       'home_ownership_RENT', 'emp_cat_0-15', 'emp_cat_15-30', 'emp_cat_30-45',
       'emp_cat_45+', 'emp_cat_Missing', 'ir_cat_0-8', 'ir_cat_11-13.5',
       'ir_cat_13.5+', 'ir_cat_8-11', 'ir_cat_Missing'],
      dtype='object')

In [100]:
# Joint distribution of grade and loan_status as fractions of all rows (normalize=True).
# When the notebook is run top to bottom, `grade` has already been one-hot encoded into
# grade_* indicator columns, so df["grade"] would raise KeyError; in that case recover
# the label from whichever indicator is set on each row.
if "grade" in df.columns:
    grade = df["grade"]
else:
    grade = (
        df.filter(regex=r"^grade_")
        .idxmax(axis=1)
        .str.replace("grade_", "", regex=False)
        .rename("grade")
    )
table = pd.crosstab(grade, df["loan_status"], normalize=True)
# To plot per-grade shares instead:
#table.div(table.sum(1), axis=0).plot(kind='bar', stacked=True)

In [13]:
from sklearn.linear_model import LogisticRegressionCV

In [14]:
# Cross-validated logistic regression classifier with every hyperparameter left at its default.
model = LogisticRegressionCV()


In [150]:
# Show the full column list (one name per line) before choosing the target and features.
list(df.columns)

['loan_status',
 'loan_amnt',
 'annual_inc',
 'age',
 'grade_A',
 'grade_B',
 'grade_C',
 'grade_D',
 'grade_E',
 'grade_F',
 'grade_G',
 'home_ownership_MORTGAGE',
 'home_ownership_OTHER',
 'home_ownership_OWN',
 'home_ownership_RENT',
 'emp_cat_0-15',
 'emp_cat_15-30',
 'emp_cat_30-45',
 'emp_cat_45+',
 'emp_cat_Missing',
 'ir_cat_0-8',
 'ir_cat_11-13.5',
 'ir_cat_13.5+',
 'ir_cat_8-11',
 'ir_cat_Missing']

In [154]:
# The response column, kept as a one-element list so it can be used both for the
# membership test below and for column selection.
target = ["loan_status"]
# Every remaining column, in its original order, is used as a predictor.
features = [name for name in df.columns if name not in target]

In [158]:
# Predictor matrix: all feature columns, kept as a DataFrame.
X = df.loc[:, features]
# Response as a flat 1-D array; selecting with the `target` list gives a one-column
# frame, so its values are ravelled.
y = df[target].values.ravel()

In [159]:
# Fit on every row of X / y; no separate hold-out set is created in the cells shown here.
model.fit(X, y)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [166]:
# Look at the second row of the feature matrix (positional index 1) as a Series.
X.iloc[1]

loan_amnt                   2400.0
annual_inc                 12252.0
age                           31.0
grade_A                        0.0
grade_B                        0.0
grade_C                        1.0
grade_D                        0.0
grade_E                        0.0
grade_F                        0.0
grade_G                        0.0
home_ownership_MORTGAGE        0.0
home_ownership_OTHER           0.0
home_ownership_OWN             0.0
home_ownership_RENT            1.0
emp_cat_0-15                   0.0
emp_cat_15-30                  1.0
emp_cat_30-45                  0.0
emp_cat_45+                    0.0
emp_cat_Missing                0.0
ir_cat_0-8                     0.0
ir_cat_11-13.5                 0.0
ir_cat_13.5+                   0.0
ir_cat_8-11                    0.0
ir_cat_Missing                 1.0
Name: 2, dtype: float64

In [170]:
# Class probabilities for the second row of X. Selecting with a list keeps the row
# two-dimensional (shape (1, n_features)), so no explicit reshape is needed.
model.predict_proba(X.iloc[[1]].values)

array([[ 0.86188899,  0.13811101]])

In [172]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the fitted model on the same data it was trained on.
# NOTE(review): the recorded output has an all-zero second column, i.e. no row is
# predicted as class 1 -- worth checking class imbalance / feature scaling.
y_pred = model.predict(X)
confusion_matrix(y_true=y, y_pred=y_pred)

array([[25864,     0],
       [ 3227,     0]])

[Python implementation reference](https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8)