In [None]:
## Importing packages

# This R environment comes with all of CRAN and many other helpful packages preinstalled.
# You can see which packages are installed by checking out the kaggle/rstats docker image: 
# https://github.com/kaggle/docker-rstats

library(tidyverse) # metapackage with lots of helpful functions

## Running code

# In a notebook, you can run a single code cell by clicking in the cell and then hitting 
# the blue arrow to the left, or by clicking in the cell and pressing Shift+Enter. In a script, 
# you can run code by highlighting the code you want to run and then clicking the blue arrow
# at the bottom of this window.

## Reading in files

# Datasets attached to this kernel are available under the "../input/" directory.
# The call below lists the files that have been added there.

list.files("../input")

## Saving data

# If you save any files or images, these will be put in the "output" directory. You 
# can see the output directory by committing and running your kernel (using the 
# Commit & Run button) and then checking out the compiled version of your kernel.

In [None]:
# Load the training CSV (its first line is the header row) and report the
# number of rows and columns.
application_train <- read.csv(
  file = "../input/application_train_clean_SMOTEd.csv",
  header = TRUE
)
dim(application_train)

In [None]:
# Compactly display the structure of the data frame: each column's type and
# its first few values.
str(application_train)

In [None]:
# List the column names (useful for mapping column positions to attributes).
names(application_train)

In [None]:
# Convert the first 49 columns (the categorical attributes) to factors and the
# remaining columns to numeric.
#
# The conversion is done column by column with lapply(). apply() would first
# coerce the whole data frame to a matrix, so the factors produced by
# as.factor() collapse back to character; under R >= 4.0 data.frame() no
# longer turns strings into factors, leaving the "categorical" columns as
# plain character vectors.
cat_cols <- seq_len(49)
data_cat <- application_train[, cat_cols, drop = FALSE]
data_num <- application_train[, -cat_cols, drop = FALSE]
data_cat[] <- lapply(data_cat, as.factor)
data_num[] <- lapply(data_num, function(x) {
  # go through character for factors so the labels, not the internal integer
  # codes, are converted
  if (is.factor(x)) as.numeric(as.character(x)) else as.numeric(x)
})
application_train <- cbind(data_cat, data_num)
str(application_train)

In [None]:
# Summary statistics for every column after the first 49, i.e. the columns
# that were converted to numeric above.
summary(application_train[,-c(1:49)])

Observation: AMT_REQ_CREDIT_BUREAU_HOUR and AMT_REQ_CREDIT_BUREAU_DAY are zero in every row.

In [None]:
library(caret)

In [None]:
# Feature Selection - 1
#** Identify Redundant Features
set.seed(7)
library(mlbench)
# calculate the correlation matrix of the numeric attributes; columns 1:49
# (the categorical block) and columns 69 and 70 are left out
correlationMatrix <- cor(application_train[, -c(1:49, 69, 70)])
# summarize the correlation matrix
print(correlationMatrix)
library(corrplot)
# snapshot of the graphical parameters taken before plotting (not restored in
# this cell)
opar2 <- par(no.readonly = TRUE)
corrplot(
  correlationMatrix,
  method = "circle",
  tl.cex = 0.5,
  tl.col = "black",
  number.cex = 0.55,
  bg = "grey14",
  addgrid.col = "gray50",
  tl.offset = 2,
  col = colorRampPalette(c("blue1", "ivory2", "firebrick2"))(100)
)
# find attributes that are highly correlated (ideally >0.75)
highlyCorrelated <- findCorrelation(correlationMatrix, cutoff = 0.75)
# print indexes of highly correlated attributes
print(highlyCorrelated)
# The indexes refer to columns of correlationMatrix, not of application_train
# (which still contains the excluded columns), so the names must be looked up
# in the matrix.
print(colnames(correlationMatrix)[highlyCorrelated])

In [None]:
# Remove uninformative columns by position. Reasons noted for the removal:
#   - FLAG_MOBIL has got just 1 level
#   - FLAG_DOCUMENT_10 & FLAG_DOCUMENT_12 are almost constants
#   - AMT_REQ_CREDIT_BUREAU_HOUR & AMT_REQ_CREDIT_BUREAU_DAY are constants
cols_to_drop <- c(12, 38, 40, 69, 70)
application_train <- application_train[, -cols_to_drop]

In [None]:
# Configure parallel processing
library(parallel)
library(doParallel)
# By convention leave 1 core for the OS, but always request at least one
# worker: detectCores() can return 1 (which would give 0 workers) or NA when
# the core count is unknown, and makeCluster() fails on either.
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
cluster <- makeCluster(n_workers)
# register the cluster as the foreach backend
registerDoParallel(cluster)

In [None]:
# Feature Selection - 2
#** Rank features by importance
set.seed(7)
# resampling scheme: 5-fold cross-validation, parallel execution permitted
control <- trainControl(
  method = "cv",
  number = 5,
  allowParallel = TRUE
)
# fit a Learning Vector Quantization (lvq) model of TARGET on all columns
# except the first, scaling the predictors beforehand
model <- train(
  TARGET ~ .,
  data = application_train[, -1],
  method = "lvq",
  preProcess = "scale",
  trControl = control
)
# unscaled variable-importance estimates from the fitted model
importance <- varImp(model, scale = FALSE)
# show the ranking as text, then as a plot
print(importance)
plot(importance)

In [None]:
# Feature Selection - 3
#** Recursive Feature Elimination
#set.seed(7)
# define the control using a random forest selection function
#control <- rfeControl(functions=rfFuncs, method="cv", number=5)
# run the RFE algorithm
#results <- rfe(application_train[,-c(1,2)], application_train[,c(2)], sizes=c(3:74), rfeControl=control)
# summarize the results
#print(results)
# list the chosen features
#predictors(results)
# plot the results
#plot(results, type=c("g", "o"))

In [None]:
# Feature Selection - 4
# PCA

In [None]:
# Shut down the worker processes created for parallel training.
stopCluster(cluster)
# Re-register the sequential backend so that any later foreach-based call
# (e.g. caret with allowParallel = TRUE) does not try to use the stopped
# cluster.
foreach::registerDoSEQ()