In [None]:
## Importing packages

# This R environment comes with all of CRAN and many other helpful packages preinstalled.
# You can see which packages are installed by checking out the kaggle/rstats docker image: 
# https://github.com/kaggle/docker-rstats

library(tidyverse) # metapackage with lots of helpful functions

## Running code

# In a notebook, you can run a single code cell by clicking in the cell and then hitting 
# the blue arrow to the left, or by clicking in the cell and pressing Shift+Enter. In a script, 
# you can run code by highlighting the code you want to run and then clicking the blue arrow
# at the bottom of this window.

## Reading in files

# Datasets attached to this kernel are available under the "../input/" directory.
# Listing that directory shows which files can be read.

list.files("../input")

## Saving data

# If you save any files or images, these will be put in the "output" directory. You 
# can see the output directory by committing and running your kernel (using the 
# Commit & Run button) and then checking out the compiled version of your kernel.

In [None]:
# Read the training CSV; the first line of the file supplies the column names.
application_train <- read.csv(
  file = "../input/application_train_clean_SMOTEd.csv",
  header = TRUE
)
# Number of rows and columns that were loaded.
dim(application_train)

In [None]:
# Compact overview of every column: its type and first few values.
str(object = application_train)

In [None]:
# Column names, in positional order (later cells address columns by position).
colnames(application_train)

In [None]:
# Convert attribute types: the first 49 columns are treated as categorical
# (factor), every remaining column as numeric.
#
# The conversion is done column by column with lapply(). apply() would first
# coerce the data frame to a character matrix, which pads formatted numbers
# with blanks and, with the stringsAsFactors = FALSE default of R >= 4.0,
# leaves the "categorical" columns as plain character vectors, not factors.
data_cat <- application_train[, c(1:49), drop = FALSE]
data_num <- application_train[, -c(1:49), drop = FALSE]

data_cat[] <- lapply(data_cat, as.factor)
data_num[] <- lapply(data_num, function(x) {
  # For a factor, parse its labels rather than its internal integer codes.
  if (is.factor(x)) as.numeric(as.character(x)) else as.numeric(x)
})

# Categorical columns first, numeric columns after: the original column order.
application_train <- cbind(data_cat, data_num)
str(application_train)

In [None]:
# Per-column summary of the converted data frame.
summary(object = application_train)

In [None]:
# Remove the features judged insignificant, addressed by column position.
application_train <- application_train[, -c(12, 38, 40, 69, 70)]

# Discard minority observations. Rows where the comparison is NA are dropped
# as well, and each factor is rebuilt so the removed category is no longer
# listed among its levels.
application_train <- application_train[
  which(application_train$CODE_GENDER != "X"), ,
  drop = FALSE
]
application_train$CODE_GENDER <- factor(application_train$CODE_GENDER)

application_train <- application_train[
  which(application_train$NAME_FAMILY_STATUS != "Unknown"), ,
  drop = FALSE
]
application_train$NAME_FAMILY_STATUS <- factor(
  application_train$NAME_FAMILY_STATUS
)

In [None]:
# Number of observations per TARGET class in the full data set.
table(application_train[["TARGET"]])

In [None]:
# 70:30 train/test split, drawn with caret::createDataPartition() on TARGET.
library(caret)

set.seed(7)  # fixed seed so the same partition is drawn on every run
train_test_split <- createDataPartition(
  y = application_train$TARGET,
  p = 0.7,
  list = FALSE
)

# Selected rows form the training set; all remaining rows form the test set.
train_data <- application_train[train_test_split, ]
test_data <- application_train[-train_test_split, ]

In [None]:
# Number of observations per TARGET class in the training partition.
table(train_data[["TARGET"]])

In [None]:
# Number of observations per TARGET class in the test partition.
table(test_data[["TARGET"]])

In [None]:
library(gbm)

# Training frame for the model: everything except column 1.
gbm_train <- train_data[, -1]

# distribution = "bernoulli" models a numeric 0/1 response. When TARGET is a
# factor (or character), recode it so that its first class becomes 0 and its
# second class becomes 1; a numeric TARGET is passed through untouched.
if (!is.numeric(gbm_train$TARGET)) {
  target_classes <- factor(gbm_train$TARGET)
  stopifnot(nlevels(target_classes) == 2L)
  gbm_train$TARGET <- as.integer(target_classes) - 1L
}

# Gradient-boosted trees: 500 shallow trees (depth 2), a small learning rate
# and 3-fold cross-validation.
gbm.model <- gbm(
  TARGET ~ .,
  data = gbm_train,
  distribution = "bernoulli",
  n.trees = 500,
  cv.folds = 3,
  shrinkage = 0.01,
  interaction.depth = 2
)
print(gbm.model)

In [None]:
# Estimate the best number of boosting iterations.
#
# The model was fitted with cv.folds = 3 and no held-out fraction, so the
# cross-validation estimate is the one that is available. method = "test"
# needs a model trained with train.fraction < 1.
#
# Alternative: the out-of-bag (OOB) estimate, which typically underestimates
# the optimal number of iterations.
# best.iter <- gbm.perf(gbm.model, method = "OOB")
best.iter <- gbm.perf(gbm.model, method = "cv", plot.it = FALSE)
best.iter

In [None]:
# Relative influence of each variable, laid out for two side-by-side panels.
# NOTE(review): summary.gbm() appears to spell its plotting switch `plotit`;
# `plot.it` may simply be ignored here -- confirm which behaviour is intended.
par(mfrow = c(1, 2))
# Influence computed from the first tree only.
summary(object = gbm.model, n.trees = 1, plot.it = FALSE)
# Influence computed from the estimated best number of trees.
summary(object = gbm.model, n.trees = best.iter, plot.it = FALSE)

In [None]:
# Marginal effect of predictor 1 on the response, with the other variables
# "integrated" out, using the first best.iter trees.
plot(gbm.model, i.var = 1, n.trees = best.iter)

In [None]:
# Marginal effect of predictor 2, using the first best.iter trees.
plot(gbm.model, i.var = 2, n.trees = best.iter)

In [None]:
# Marginal effect of predictor 3, using the first best.iter trees.
plot(gbm.model, i.var = 3, n.trees = best.iter)

In [None]:
# Marginal effect of predictor 4, using the first best.iter trees.
plot(gbm.model, i.var = 4, n.trees = best.iter)

In [None]:
# Marginal effect of predictor 5, using the first best.iter trees.
plot(gbm.model, i.var = 5, n.trees = best.iter)

In [None]:
#set.seed(7)
#fitControl = trainControl(method="cv", number=2)

#gbm.model = train(TARGET~., data=train_data[,-c(1)], method="gbm",distribution="bernoulli", trControl=fitControl,
#                  metric = "ROC", verbose=FALSE,
#                  tuneGrid=data.frame(.n.trees=best.iter, .shrinkage=0.01, .interaction.depth=1, .n.minobsinnode=1))
#gbm.model

In [None]:
# Evaluate the fit on the training data.
#
# predict() on a gbm object returns numbers (on the link scale by default),
# while postResample()/confusionMatrix() compare class labels. Request
# probabilities, cut them at 0.5 and express the result as a factor that
# shares its levels with the observed TARGET classes.
train_obs <- factor(train_data$TARGET)
train_lev <- levels(train_obs)
stopifnot(length(train_lev) == 2L)

train_prob <- predict(
  gbm.model,
  train_data[, -1],
  n.trees = best.iter,
  type = "response"
)

# Assumes the probability refers to the second level of TARGET (the class
# modelled as 1) -- TODO confirm against the level order.
pred <- factor(train_lev[(train_prob > 0.5) + 1L], levels = train_lev)
postResample(pred, train_obs)

In [None]:
# Cross-tabulate predicted against observed classes on the training data.
confusionMatrix(data = pred, reference = train_data$TARGET)

In [None]:
# Evaluate the fit on the held-out test data.
#
# predict() on a gbm object returns numbers (on the link scale by default),
# while postResample()/confusionMatrix() compare class labels. Request
# probabilities, cut them at 0.5 and express the result as a factor that
# shares its levels with the observed TARGET classes.
test_obs <- factor(test_data$TARGET)
test_lev <- levels(test_obs)
stopifnot(length(test_lev) == 2L)

test_prob <- predict(
  gbm.model,
  test_data[, -1],
  n.trees = best.iter,
  type = "response"
)

# Assumes the probability refers to the second level of TARGET (the class
# modelled as 1) -- TODO confirm against the level order.
pred <- factor(test_lev[(test_prob > 0.5) + 1L], levels = test_lev)
postResample(pred, test_obs)

In [None]:
# Cross-tabulate predicted against observed classes on the test data.
confusionMatrix(data = pred, reference = test_data$TARGET)

In [None]:
# Build the data frame that caret's summary functions work on: one
# probability column per class (named after the class levels) plus the
# observed class in `obs`.
results_obs <- factor(test_data$TARGET)
results_lev <- levels(results_obs)
stopifnot(length(results_lev) == 2L)

# predict() for a gbm object takes type = "link" or "response" (there is no
# "prob"); "response" gives the probability of the class modelled as 1.
# Assumes that class is the second level of TARGET -- TODO confirm.
results_prob <- predict(
  gbm.model,
  test_data[, -1],
  n.trees = best.iter,
  type = "response"
)
results <- data.frame(1 - results_prob, results_prob)
names(results) <- results_lev

# The observed classes must come from the test set, since that is what was
# scored above.
results$obs <- results_obs
head(results)

In [None]:
# Log loss of the predicted class probabilities against the observed classes.
mnLogLoss(data = results, lev = levels(results[["obs"]]))

In [None]:
# Add the predicted class and compute the full set of classification metrics.
# `pred` has to be a factor with the same levels as `obs`, so the predicted
# probability is cut at 0.5 instead of storing the raw numeric prediction.
summary_lev <- levels(results$obs)
summary_prob <- predict(
  gbm.model,
  test_data[, -1],
  n.trees = best.iter,
  type = "response"
)
# Assumes the probability refers to the second level of `obs` -- TODO confirm.
results$pred <- factor(
  summary_lev[(summary_prob > 0.5) + 1L],
  levels = summary_lev
)
multiClassSummary(results, lev = summary_lev)

In [None]:
# Score the test set and measure performance with a ROC curve and its AUC.
library(pROC)

predictions <- predict(gbm.model, test_data[, -1], n.trees = best.iter)

# Build the curve from the observed classes and the numeric scores, and draw it.
roc_curve <- roc(
  response = test_data$TARGET,
  predictor = as.numeric(predictions),
  plot = TRUE
)
auc(roc_curve)