In [None]:
## Importing packages

# This R environment comes with all of CRAN and many other helpful packages preinstalled.
# You can see which packages are installed by checking out the kaggle/rstats docker image: 
# https://github.com/kaggle/docker-rstats

library(tidyverse) # metapackage with lots of helpful functions

## Running code

# In a notebook, you can run a single code cell by clicking in the cell and then hitting 
# the blue arrow to the left, or by clicking in the cell and pressing Shift+Enter. In a script, 
# you can run code by highlighting the code you want to run and then clicking the blue arrow
# at the bottom of this window.

## Reading in files

# Files from datasets added to this kernel are available under the "../input/" directory.
# The call below lists the entries found in that directory.

list.files(path = "../input")

## Saving data

# If you save any files or images, these will be put in the "output" directory. You 
# can see the output directory by committing and running your kernel (using the 
# Commit & Run button) and then checking out the compiled version of your kernel.

In [None]:
# Read the training CSV, treating its first row as the column names.
application_train <- read.csv(
  file = "../input/application_train_clean_SMOTEd.csv",
  header = TRUE
)
# Show the number of rows and columns that were loaded.
dim(application_train)

In [None]:
# List the column names to see which features are available.
colnames(application_train)

In [None]:
# Convert the first 49 columns to factors and all remaining columns to numeric.
#
# Columns are converted one at a time with lapply() instead of apply():
# apply() first coerces the whole data frame to a matrix. With mixed column
# types that is a character matrix in which numbers are blank-padded (giving
# levels such as " 1"), and under R >= 4.0 data.frame() no longer turns those
# strings into factors, so the "categorical" columns would stay character.
cat_cols <- seq_len(49)
data_cat <- application_train[, cat_cols, drop = FALSE]
data_num <- application_train[, -cat_cols, drop = FALSE]
data_cat[] <- lapply(data_cat, as.factor)
# For factor inputs go through character so the labels, not the internal
# integer codes, are parsed as numbers.
data_num[] <- lapply(data_num, function(x) {
  if (is.factor(x)) as.numeric(as.character(x)) else as.numeric(x)
})
# Re-assemble in the original column order (factor block first, numeric block second).
application_train <- cbind(data_cat, data_num)
str(application_train)

In [None]:
# Count how many rows fall into each TARGET class.
table(application_train[["TARGET"]])

In [None]:
# Per-column summary of the data frame; used to spot near-constant features and rare levels.
summary(application_train)

1. Notice that the following categorical features have nearly all of their observations in a single class, so they carry almost no information. Let's drop them from the model.

FLAG_DOCUMENT_12

0:322722

1:     3

FLAG_DOCUMENT_10

0:322721

1:     4

FLAG_MOBIL

0:     1

1:307510

2. AMT_REQ_CREDIT_BUREAU_HOUR and AMT_REQ_CREDIT_BUREAU_DAY are zero for every observation.
3. CODE_GENDER has only 6 observations at level 'X'.
4. NAME_FAMILY_STATUS has only 1 observation at level 'Unknown'.

In [None]:
# Drop insignificant features, addressed by column position.
# NOTE(review): presumably these positions are the near-constant features
# listed in the notes above -- verify against names(application_train).
uninformative_cols <- c(12, 38, 40, 69, 70)
application_train <- application_train[, -uninformative_cols]

# Drop minority observations: remove rows whose CODE_GENDER is "X" (rows where
# the comparison is NA are removed too), then rebuild the factor so the
# emptied level disappears.
gender_ok <- application_train$CODE_GENDER != "X"
application_train <- application_train[gender_ok & !is.na(gender_ok), , drop = FALSE]
application_train$CODE_GENDER <- factor(application_train$CODE_GENDER)

# Same treatment for the "Unknown" level of NAME_FAMILY_STATUS.
family_ok <- application_train$NAME_FAMILY_STATUS != "Unknown"
application_train <- application_train[family_ok & !is.na(family_ok), , drop = FALSE]
application_train$NAME_FAMILY_STATUS <- factor(application_train$NAME_FAMILY_STATUS)

In [None]:
# Partition the data 70:30 into training and test sets.
library(caret)
set.seed(7) # fixed seed so the same partition is drawn on every run
train_test_split <- createDataPartition(
  y = application_train$TARGET,
  p = 0.7,
  list = FALSE
)
# Rows selected by the partition form the training set; all others form the test set.
train_data <- application_train[train_test_split, ]
test_data <- application_train[-train_test_split, ]

In [None]:
# Logistic regression of TARGET on every other column of the training data.
# Column 1 is excluded from the data handed to the model.
logit_model1 <- glm(
  TARGET ~ .,
  data = train_data[, -c(1)],
  family = binomial(link = "logit")
)
# Coefficient table and fit statistics.
summary(logit_model1)

In [None]:
# Feature Selection / Significance
library(car)
#** Multi-collinearity check
# Variance inflation factors for the terms of the fitted logistic model.
vif(logit_model1)

In [None]:
# Feature Selection / Significance
#** run anova
# Analysis-of-deviance table for the fitted model, with chi-square tests.
anova(logit_model1, test = 'Chisq')

In [None]:
library(lmtest)
# Log Likelihood Test
# NOTE(review): with a single model supplied, lrtest() presumably compares it
# against the intercept-only model -- confirm in the lmtest documentation.
lrtest(logit_model1)

In [None]:
library(pscl)
# McFadden Pseudo RSquare Test
# Pseudo R-squared measures for the fitted logistic model.
pR2(logit_model1)

In [None]:
# Odds Ratio
# Exponentiate the fitted coefficients (log-odds scale) to express them as odds ratios.
odd_model<-exp(coef(logit_model1))
# Display the odds ratios.
odd_model

In [None]:
# Predict the outcome - train
# Predicted probabilities for the training rows; columns 1 and 2 are left out
# of the data passed to predict().
predict_prob <- predict(logit_model1, train_data[, -c(1, 2)], type = "response")
# Label a row 1 when its predicted probability exceeds 0.5, otherwise 0.
predicted_response <- ifelse(predict_prob > 0.5, 1, 0)
# Fix both levels explicitly: as.factor() would produce a one-level factor if
# every prediction fell on the same side of the threshold, which would not
# line up with the reference classes in the confusion matrix.
# Assumes TARGET is a factor with levels "0" and "1" -- TODO confirm.
predicted_response <- factor(predicted_response, levels = c(0, 1))
## Confusion Matrix
confusionMatrix(predicted_response, train_data$TARGET)

In [None]:
library(ROCR)
# ROC - training data
# Score the training rows and pair the scores with the true labels in a
# ROCR prediction object.
pred <- prediction(
  predict(logit_model1, train_data[, -c(1, 2)], type = "response"),
  train_data$TARGET
)
# True-positive rate against false-positive rate.
roc <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(roc)
# Area under the curve: show the performance object first, then the bare number.
auc <- performance(pred, measure = "auc")
auc
auc <- unlist(auc@y.values)
auc

In [None]:
# Predict the outcome - test
# Predicted probabilities for the held-out rows; columns 1 and 2 are left out
# of the data passed to predict().
predict_prob <- predict(logit_model1, test_data[, -c(1, 2)], type = "response")
# Label a row 1 when its predicted probability exceeds 0.5, otherwise 0.
predicted_response <- ifelse(predict_prob > 0.5, 1, 0)
# Fix both levels explicitly: as.factor() would produce a one-level factor if
# every prediction fell on the same side of the threshold, which would not
# line up with the reference classes in the confusion matrix.
# Assumes TARGET is a factor with levels "0" and "1" -- TODO confirm.
predicted_response <- factor(predicted_response, levels = c(0, 1))
## Confusion Matrix
confusionMatrix(predicted_response, test_data$TARGET)

In [None]:
# ROC - test data
# Score the held-out rows and pair the scores with the true labels in a
# ROCR prediction object.
pred <- prediction(
  predict(logit_model1, test_data[, -c(1, 2)], type = "response"),
  test_data$TARGET
)
# True-positive rate against false-positive rate.
roc <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(roc)
# Area under the curve: show the performance object first, then the bare number.
auc <- performance(pred, measure = "auc")
auc
auc <- unlist(auc@y.values)
auc