In [None]:
## Importing packages

# This R environment comes with all of CRAN and many other helpful packages preinstalled.
# You can see which packages are installed by checking out the kaggle/rstats docker image: 
# https://github.com/kaggle/docker-rstats

library(tidyverse) # metapackage with lots of helpful functions

## Running code

# In a notebook, you can run a single code cell by clicking in the cell and then hitting 
# the blue arrow to the left, or by clicking in the cell and pressing Shift+Enter. In a script, 
# you can run code by highlighting the code you want to run and then clicking the blue arrow
# at the bottom of this window.

## Reading in files

# Datasets attached to this kernel are available under the "../input/" directory.
# Listing that directory shows which files can be read.

list.files("../input")

## Saving data

# If you save any files or images, these will be put in the "output" directory. You 
# can see the output directory by committing and running your kernel (using the 
# Commit & Run button) and then checking out the compiled version of your kernel.

The input file is an already cleaned dataset (its file name indicates that SMOTE oversampling has also been applied).

In [None]:
# Load the prepared training data; the first row of the file holds the column
# names. Then report the number of rows and columns.
application_train <- read.csv(
  file = "../input/application_train_clean_SMOTEd.csv",
  header = TRUE
)
dim(application_train)

In [None]:
# Compact overview of the loaded data frame: one line per column with its type
# and first few values.
str(application_train)

In [None]:
# List the column names; later cells select columns by position, so this is
# the place to check which position holds which attribute.
names(application_train)

In [None]:
# Convert attribute types: columns 1-49 are treated as categorical (factor),
# all remaining columns as numeric.
#
# The conversion is done column by column with lapply(). Calling apply() on a
# data frame first coerces it to a single matrix: in a mixed-type frame numeric
# codes are formatted to a common width (e.g. " 1", "10"), the factors returned
# by the callback are flattened back into a character matrix, and on R >= 4.0
# data.frame() then leaves those columns as character rather than factor.
# drop = FALSE keeps both pieces as data frames even if one side has one column.
data_cat <- application_train[, c(1:49), drop = FALSE]
data_num <- application_train[, -c(1:49), drop = FALSE]
data_cat[] <- lapply(data_cat, as.factor)
data_num[] <- lapply(data_num, function(x) {
  # Go through character so a factor column yields its labels, not level codes.
  if (is.factor(x)) as.numeric(as.character(x)) else as.numeric(x)
})
application_train <- cbind(data_cat, data_num)
str(application_train)

In [None]:
# Remove the column at position 12 (noted by the author as FLAG_MOBIL, which
# has just 1 level and therefore carries no information).
# NOTE(review): the drop is position-based -- confirm against names() that
# column 12 is still FLAG_MOBIL.
keep_cols <- setdiff(seq_along(application_train), 12)
application_train <- application_train[, keep_cols]

In [None]:
# Number of rows per TARGET class in the full dataset
table(application_train[["TARGET"]])

In [None]:
# Partition the rows into training (70%) and test (30%) sets.
# The seed is fixed so the same rows are selected on every run.
library(caret)
set.seed(7)
train_test_split <- createDataPartition(
  y = application_train$TARGET,
  p = 0.7,
  list = FALSE
)
# Rows selected by the partition form the training set; the rest are held out.
train_data <- application_train[train_test_split, ]
test_data <- application_train[-train_test_split, ]

In [None]:
# Number of rows per TARGET class in the training split
table(train_data[["TARGET"]])

In [None]:
# Number of rows per TARGET class in the test split
table(test_data[["TARGET"]])

In [None]:
# Load the random forest implementation used for the model below.
library(randomForest)
# Fix the RNG state so the forest grown in the next cell is reproducible when
# the cells are run in order.
set.seed(7)

In [None]:
# Fit a random forest for TARGET using every other column except the first one,
# which is excluded from the model.
# The seed is set here as well, so re-running this cell on its own regrows the
# same forest; when the notebook is run top to bottom the result is unchanged.
# mtry = 18 variables are tried at each split; importance = TRUE stores the
# variable importance measures used further down; keep.forest = TRUE retains the
# trees so predict() can be called on new data.
set.seed(7)
rf <- randomForest(TARGET ~ ., data=train_data[,-c(1)], keep.forest=TRUE, mtry=18, importance=TRUE)
print(rf)

In [None]:
# Plot the error rates stored on the forest against the number of trees:
# the overall out-of-bag (OOB) error plus one curve per class.
plot(rf, main = "")
# Take the legend labels from the plotted error matrix so they always match the
# curves, instead of hard-coding the class names.
err_series <- colnames(rf$err.rate)
n_series <- length(err_series)
# plot.randomForest draws the curves with matplot's default cycles
# (colours 1:6, line types 1:5); repeat those cycles to the number of curves.
series_col <- rep_len(1:6, n_series)
series_lty <- rep_len(1:5, n_series)
legend("topright",
       legend = err_series,
       text.col = series_col,
       lty = series_lty,
       col = series_col)
title(main = "Error Rates Random Forest Train data")

In [None]:
# Print the raw importance matrix stored on the fitted forest
# (available because the model was fitted with importance = TRUE).
print(rf$importance)

In [None]:
# Evaluate variable importance.
# The measures are reported as means because each one is averaged over all the
# trees in the forest.
importance(rf)
# Dot chart of the 30 highest-ranked variables, sorted by importance.
# TRUE is spelled out: T is an ordinary variable and can be reassigned.
varImpPlot(rf,
           sort = TRUE,
           main = "Variable Importance",
           n.var = 30)
# Variable importance table restricted to the type-2 measure
# (mean decrease in node impurity, i.e. the MeanDecreaseGini column).
var.imp <- data.frame(importance(rf, type = 2))

In [None]:
# Rank the features by MeanDecreaseGini, most important first.
# Keep the variable names as a regular column alongside the row names.
var.imp$Variables <- row.names(var.imp)
# Compute the ordering once and reuse it for both the printed table and the
# name vector. TRUE is spelled out because T can be reassigned.
gini_rank <- order(var.imp$MeanDecreaseGini, decreasing = TRUE)
var.imp[gini_rank, ]
imp_feature <- var.imp$Variables[gini_rank]

In [None]:
# Find the optimal mtry
#mtry <- tuneRF(train_data[,-c(1)],train_data$TARGET, ntreeTry=30,
#               stepFactor=1.5,improve=0.01, trace=TRUE, plot=TRUE)
#best.m <- mtry[mtry[, 2] == min(mtry[, 2]), 1]
#print(mtry)
#print(best.m)

In [None]:
# ROC curve and AUC on the training data.
# Score each row with the predicted probability of the second TARGET class
# rather than the hard class label: a ROC curve built from predicted labels has
# only a single operating point and does not measure how well the model ranks.
# Columns 1 and 2 (the excluded first column and TARGET itself) are dropped
# from the predictor frame.
# NOTE: these are in-sample predictions on the rows the forest was grown from,
# so this AUC is optimistic.
predictions <- predict(rf, train_data[, -c(1, 2)], type = "prob")[, 2]
library(pROC)
# levels = (control, case) in the forest's class order; direction = "<" states
# that cases score higher, so pROC cannot silently flip the comparison.
roc_curve <- roc(
  response = train_data$TARGET,
  predictor = predictions,
  levels = rf$classes,
  direction = "<",
  plot = TRUE
)
auc(roc_curve)

In [None]:
# Confusion matrix and accuracy on the training data.
# Predicted class labels; columns 1 and 2 (the excluded first column and
# TARGET) are left out of the predictor frame.
pred_model_train <- predict(rf, train_data[, -c(1, 2)], type = "response")
# Rows are the actual classes, columns the predicted classes.
result_train <- table(
  train_data$TARGET,
  pred_model_train,
  dnn = c("actual _values", "pred_model_train")
)
result_train
# Accuracy in percent: correctly classified rows (the diagonal) over all rows.
train_accuracy <- (sum(diag(result_train)) / sum(result_train)) * 100
train_accuracy

In [None]:
# scoring
#train_data$predict.class <- predict(rf, train_data[,-c(1,2)], type="class")
#train_data$predict.score <- predict(rf, train_data[,-c(1,2)], type="prob")
#head(train_data)

In [None]:
# Confusion matrix and accuracy on the held-out test data.
# Predicted class labels; columns 1 and 2 (the excluded first column and
# TARGET) are left out of the predictor frame.
pred_model_test <- predict(rf, test_data[, -c(1, 2)], type = "response")
# Rows are the actual classes, columns the predicted classes.
result_test <- table(
  test_data$TARGET,
  pred_model_test,
  dnn = c("actual _values", "pred_model_test")
)
result_test
# Accuracy in percent: correctly classified rows (the diagonal) over all rows.
test_accuracy <- (sum(diag(result_test)) / sum(result_test)) * 100
test_accuracy

In [None]:
# scoring
#test_data$predict.class <- predict(rf, test_data[,-c(1,2)], type="class")
#test_data$predict.score <- predict(rf, test_data[,-c(1,2)], type="prob")
#head(test_data)

In [None]:
# ROC curve and AUC on the held-out test data.
# Score each row with the predicted probability of the second TARGET class
# rather than the hard class label: a ROC curve built from predicted labels has
# only a single operating point and does not measure how well the model ranks.
# Columns 1 and 2 (the excluded first column and TARGET itself) are dropped
# from the predictor frame.
predictions <- predict(rf, test_data[, -c(1, 2)], type = "prob")[, 2]
# levels = (control, case) in the forest's class order; direction = "<" states
# that cases score higher, so pROC cannot silently flip the comparison.
roc_curve <- roc(
  response = test_data$TARGET,
  predictor = predictions,
  levels = rf$classes,
  direction = "<",
  plot = TRUE
)
auc(roc_curve)