In [1]:
# Attach the modelling packages. library() stops with an error if a package
# is missing, whereas require() only returns FALSE and lets the script run on
# into confusing downstream failures.
library(caret)
library(kernlab)
# Load the `spam` data set into the workspace.
data(spam)

Loading required package: caret
Loading required package: lattice
Loading required package: ggplot2


In [None]:
# Split the data 75% / 25% into training and testing sets.
# Seed the RNG first so the same rows land in each set on every run.
set.seed(123)
# list = FALSE returns a matrix of row indices (usable directly for
# subsetting). Spell out FALSE: `F` is an ordinary variable that can be
# reassigned.
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]

In [None]:
# Fix the RNG state so the work done inside train() is repeatable.
set.seed(123)
# Fit a "glm" model of `type` on every other column of the training set.
modelFit <- train(
  type ~ .,
  data = training,
  method = "glm"
)
modelFit
# Show the fitted values/coefficients of the final underlying model.
modelFit$finalModel

In [None]:
# Predict on the held-out rows, then tabulate predicted vs. actual labels.
predictions <- predict(modelFit, newdata = testing)
confusionMatrix(data = predictions, reference = testing$type)

In [None]:
# Creating K-folds
# returnTrain = TRUE makes each list element hold the *training* row indices
# for that fold rather than the held-out ones.
folds <- createFolds(y = spam$type, k = 10, list = TRUE, returnTrain = TRUE)
# Size of each fold's index vector.
lengths(folds)

In [None]:
# Resampling with replacement (bootstrap samples).
# TODO: check whether a without-replacement variant is available.
folds <- createResample(y = spam$type, times = 10, list = TRUE)
# Size of each resample's index vector.
lengths(folds)

In [None]:
# Data pre-processing
# Standardize the variables: estimate the centering/scaling parameters on the
# training set only, then apply those same parameters to both sets.
# The methods must be passed as a character vector built with c(); a bare
# parenthesised pair ("center","scale") is a syntax error.
preObj <- preProcess(training, method = c("center", "scale"))
trainCaps <- predict(preObj, training)$capitalAve
testCaps <- predict(preObj, testing)$capitalAve

In [None]:
# Imputing the data
# Missing values can be filled in with k-nearest-neighbour imputation.
# The object keeps its existing name `prePbj` so any later reference to it
# still resolves.
prePbj <- preProcess(training, method = "knnImpute")
# predict() must be given the imputation object fitted on the line above;
# passing a different preProcess object would silently skip the imputation.
capAve <- predict(prePbj, training)$capitalAve

In [None]:
# Pre-processing with PCA
# Often several quantitative variables are highly correlated with each other
# (almost or exactly the same), so we look for the pairs that are.
# Use the data which captures the maximum information.
# Drop the outcome column `type` by name rather than by the magic index 58.
M <- abs(cor(training[, names(training) != "type"]))
# Every variable correlates perfectly with itself; zero the diagonal so those
# entries are not reported.
diag(M) <- 0
# Row/column positions of predictor pairs with |correlation| above 0.8.
which(M > 0.8, arr.ind = TRUE)

In [None]:
# Basic idea of PCA
#  We might not need every predictor variable.
#  A weighted combination of predictors might be useful.
#  We should pick this combination to capture the "most information" possible.
#  Benefits:
#           Reduced number of predictors
#           Reduced noise

In [None]:
# PCA by hand: learn the rotation on the training predictors only and apply
# the same rotation to both sets. The outcome `type` is excluded so it is
# neither transformed nor carried into the predictor frame (which would leak
# the label into the model).
predictorCols <- names(training) != "type"
preProc <- preProcess(training[, predictorCols], method = "pca", pcaComp = 2)
trainPC <- predict(preProc, training[, predictorCols])
testPC <- predict(preProc, testing[, predictorCols])
# x/y interface: the predictors are exactly the principal components.
modelFit <- train(x = trainPC, y = training$type, method = "glm")
# Predictions first, reference labels second.
confusionMatrix(predict(modelFit, testPC), testing$type)

# Simply put: let train() run the PCA step itself. This uses train()'s own
# PCA settings, not the pcaComp = 2 chosen above. The formula names the
# outcome as `type` (not `training$type`) so that `.` does not also pull
# `type` in as a predictor.
modelFit <- train(type ~ ., method = "glm", preProcess = "pca", data = training)
confusionMatrix(predict(modelFit, testing), testing$type)