# Logistic regression with loans

Loans data captured information on loan repayment by individuals. We have 13 independent variables and 1 dependent variable called not.fully.paid. Build a logistic regreession model to predict if an individual is a loan defaulter or not.

In [1]:
loans_data <- read.csv("datasets/loans.csv")

In [2]:
str(loans_data)

'data.frame':	9578 obs. of  14 variables:
 $ credit.policy    : int  1 1 1 1 1 1 1 1 1 1 ...
 $ purpose          : Factor w/ 7 levels "all_other","credit_card",..: 3 2 3 3 2 2 3 1 5 3 ...
 $ int.rate         : num  0.119 0.107 0.136 0.101 0.143 ...
 $ installment      : num  829 228 367 162 103 ...
 $ log.annual.inc   : num  11.4 11.1 10.4 11.4 11.3 ...
 $ dti              : num  19.5 14.3 11.6 8.1 15 ...
 $ fico             : int  737 707 682 712 667 727 667 722 682 707 ...
 $ days.with.cr.line: num  5640 2760 4710 2700 4066 ...
 $ revol.bal        : int  28854 33623 3511 33667 4740 50807 3839 24220 69909 5630 ...
 $ revol.util       : num  52.1 76.7 25.6 73.2 39.5 51 76.8 68.6 51.1 23 ...
 $ inq.last.6mths   : int  0 0 1 1 0 0 0 0 1 1 ...
 $ delinq.2yrs      : int  0 0 0 0 1 0 0 0 0 0 ...
 $ pub.rec          : int  0 0 0 0 0 0 1 0 0 0 ...
 $ not.fully.paid   : int  0 0 0 0 0 0 1 1 0 0 ...


In [3]:
# dummy encode the purpose column
purposeMatrix <- model.matrix(~ purpose -1, data = loans_data)

In [4]:
# cbind it to loans_data
loans_data <- cbind(loans_data, purposeMatrix)

In [5]:
t(head(loans_data))

Unnamed: 0,1,2,3,4,5,6
credit.policy,1,1,1,1,1,1
purpose,debt_consolidation,credit_card,debt_consolidation,debt_consolidation,credit_card,credit_card
int.rate,0.1189,0.1071,0.1357,0.1008,0.1426,0.0788
installment,829.10,228.22,366.86,162.34,102.92,125.13
log.annual.inc,11.35041,11.08214,10.37349,11.35041,11.29973,11.90497
dti,19.48,14.29,11.63,8.10,14.97,16.98
fico,737,707,682,712,667,727
days.with.cr.line,5639.958,2760.000,4710.000,2699.958,4066.000,6120.042
revol.bal,28854,33623,3511,33667,4740,50807
revol.util,52.1,76.7,25.6,73.2,39.5,51.0


In [6]:
# don't need the purpose column anymore
loans_data$purpose <- NULL

In [7]:
t(head(loans_data))

Unnamed: 0,1,2,3,4,5,6
credit.policy,1.0,1.0,1.0,1.0,1.0,1.0
int.rate,0.1189,0.1071,0.1357,0.1008,0.1426,0.0788
installment,829.1,228.22,366.86,162.34,102.92,125.13
log.annual.inc,11.35041,11.08214,10.37349,11.35041,11.29973,11.90497
dti,19.48,14.29,11.63,8.1,14.97,16.98
fico,737.0,707.0,682.0,712.0,667.0,727.0
days.with.cr.line,5639.95833,2760.0,4710.0,2699.95833,4066.0,6120.04167
revol.bal,28854.0,33623.0,3511.0,33667.0,4740.0,50807.0
revol.util,52.1,76.7,25.6,73.2,39.5,51.0
inq.last.6mths,0.0,0.0,1.0,1.0,0.0,0.0


In [8]:
library(plyr)

paidFreq <- count(loans_data, "not.fully.paid")
paidFreq

not.fully.paid,freq
0,8045
1,1533


In [9]:
notPaid <- paidFreq$freq[paidFreq$not.fully.paid == 1]
Paid <- paidFreq$freq[paidFreq$not.fully.paid == 0]

paste("Not fully paid:", notPaid)
paste("Fully paid:", Paid)
paste("Not Fully Paid : Fully Paid = ", notPaid/Paid)

In [10]:
library(caTools)

split <- sample.split(loans_data$not.fully.paid,
                         SplitRatio = 0.70)

train_data <- subset(loans_data, split == TRUE)
test_data <- subset(loans_data, split == FALSE)

In [11]:
dim(train_data)

In [12]:
dim(test_data)

In [13]:
log1 <- glm(not.fully.paid ~ .,
               data = loans_data,
               family = binomial)

summary(log1)


Call:
glm(formula = not.fully.paid ~ ., family = binomial, data = loans_data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2956  -0.6177  -0.4970  -0.3676   2.6118  

Coefficients: (1 not defined because of singularities)
                            Estimate Std. Error z value Pr(>|z|)    
(Intercept)                9.126e+00  1.312e+00   6.954 3.54e-12 ***
credit.policy             -3.289e-01  8.419e-02  -3.906 9.37e-05 ***
int.rate                   1.534e+00  1.727e+00   0.888  0.37461    
installment                1.208e-03  1.750e-04   6.900 5.18e-12 ***
log.annual.inc            -4.093e-01  5.959e-02  -6.869 6.47e-12 ***
dti                       -3.235e-04  4.579e-03  -0.071  0.94368    
fico                      -9.002e-03  1.421e-03  -6.337 2.34e-10 ***
days.with.cr.line          1.303e-05  1.330e-05   0.979  0.32735    
revol.bal                  3.026e-06  9.505e-07   3.184  0.00145 ** 
revol.util                 2.401e-03  1.277e-03   1.880  0.06

In [14]:
predict_train1 <- glm(not.fully.paid ~ credit.policy + installment + log.annual.inc + 
                    fico + revol.bal + inq.last.6mths + pub.rec,
                    data = train_data,
                    family = binomial)

summary(predict_train1)


Call:
glm(formula = not.fully.paid ~ credit.policy + installment + 
    log.annual.inc + fico + revol.bal + inq.last.6mths + pub.rec, 
    family = binomial, data = train_data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1590  -0.6104  -0.5034  -0.3781   2.5621  

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)     8.972e+00  1.011e+00   8.872  < 2e-16 ***
credit.policy  -3.908e-01  1.003e-01  -3.898 9.71e-05 ***
installment     1.247e-03  1.856e-04   6.718 1.84e-11 ***
log.annual.inc -4.026e-01  6.864e-02  -5.866 4.45e-09 ***
fico           -9.426e-03  1.093e-03  -8.625  < 2e-16 ***
revol.bal       4.256e-06  1.126e-06   3.781 0.000156 ***
inq.last.6mths  1.040e-01  1.664e-02   6.250 4.11e-10 ***
pub.rec         2.594e-01  1.147e-01   2.262 0.023689 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 5896.6  on 6704  degrees o

In [15]:
predicted.risk <- predict(predict_train1, 
                             type = "response",
                             newdata = test_data)

predicted.risk[0:10]

In [16]:
preds <- ifelse(predicted.risk > 0.5, 1, 0)
table(test_data$not.fully.paid, preds)

   preds
       0    1
  0 2398   15
  1  452    8

In [17]:
misClassify <- mean(preds != test_data$not.fully.paid)
paste("Accuracy", 1 - misClassify)

# OR....

manual <- (2397 + 9) / (2397 + 16 + 451 + 9)
paste(manual)

In [20]:
# It should predict 0 = Fully Paid all the time. The baseline accuracy is:

paste("baseline accuracy:", 8045/(8045 + 1533))

In [25]:
library(ROCR)


r <- prediction(predicted.risk, test_data$not.fully.paid)
auc <- as.numeric(performance(r, "auc")@y.values)
paste("AUC:", auc)