In [None]:
library("gridExtra")
library("DataExplorer")
library(ggplot2)
library(tidyverse)

### EDA

In [None]:
# Numerical - Categorical
# Each numeric feature gets one density panel, with one translucent curve
# per Revenue class so the two conditional distributions can be compared.
density_by_revenue <- function(feature) {
  # `feature` is a column name given as a string; .data[[...]] looks it up
  # inside `data`, and xlab() keeps the column name as the axis title.
  ggplot(data, aes(x = .data[[feature]], fill = Revenue)) +
    geom_density(alpha = 0.3) +
    xlab(feature)
}

p1  <- density_by_revenue("Administrative")
p2  <- density_by_revenue("Administrative_Duration")
p3  <- density_by_revenue("Informational")
p4  <- density_by_revenue("Informational_Duration")
p5  <- density_by_revenue("ProductRelated")
p6  <- density_by_revenue("ProductRelated_Duration")
p7  <- density_by_revenue("BounceRates")
p8  <- density_by_revenue("ExitRates")
p9  <- density_by_revenue("PageValues")
p10 <- density_by_revenue("SpecialDay")

# First page: page-count / duration features (3 x 2 grid, with title).
grid.arrange(
  p1, p2, p3, p4, p5, p6,
  nrow = 3, ncol = 2,
  top = "Conditional distributions given Revenue"
)
# Second page: remaining four features (2 x 2 grid, no title).
grid.arrange(p7, p8, p9, p10, nrow = 2, ncol = 2)

In [None]:
# Categorical - Categorical
# Every panel is a bar chart normalised to proportions (position = "fill"),
# showing the share of each Revenue class within every category level.
# The layers common to all panels are collected once and added as a list.
revenue_share_layers <- list(
  geom_bar(position = "fill"),
  scale_fill_brewer(palette = "Pastel2"),
  ylab("proportion"),
  theme_minimal()
)

# Month and Weekend keep their default axis titles.
p11 <- ggplot(data, aes(x = Month, fill = Revenue)) +
  revenue_share_layers

# The next four columns are wrapped in factor() so each distinct code
# gets its own bar.
p12 <- ggplot(data, aes(x = factor(OperatingSystems), fill = Revenue)) +
  revenue_share_layers +
  xlab("Operating Systems")

p13 <- ggplot(data, aes(x = factor(Browser), fill = Revenue)) +
  revenue_share_layers +
  xlab("Browser")

p14 <- ggplot(data, aes(x = factor(Region), fill = Revenue)) +
  revenue_share_layers +
  xlab("Region")

p15 <- ggplot(data, aes(x = factor(TrafficType), fill = Revenue)) +
  revenue_share_layers +
  xlab("Traffic Type")

p16 <- ggplot(data, aes(x = VisitorType, fill = Revenue)) +
  revenue_share_layers +
  xlab("Visitor Type")

p17 <- ggplot(data, aes(x = Weekend, fill = Revenue)) +
  revenue_share_layers

# Seven panels on a 4 x 2 grid; the last cell stays empty.
grid.arrange(
  p11, p12, p13, p14, p15, p16, p17,
  nrow = 4, ncol = 2,
  top = "Conditional distributions given Revenue"
)

### Logistic Regression

In [None]:
## Variable selection via stepwise logistic regression
# Start from the logistic model that uses every other column of `train`
# as a predictor of Revenue, then let step() add or drop terms in both
# directions. trace = 0 suppresses the per-step log.
logR <- step(
  glm(Revenue ~ ., data = train, family = binomial),
  direction = "both",
  trace = 0
)

## Summary of the selected model
summary(logR)

In [None]:
# car provides vif()
library("car")

# Variance inflation factors for the terms kept in the stepwise model,
# used here as a multicollinearity check
vif(logR)

In [None]:
# Training-set evaluation of the stepwise model at a 0.5 probability cutoff.
# predict() without newdata returns the fitted probabilities for the rows
# that were used to fit logR.
glm.probs <- predict(logR, type = "response")
glm.pred <- rep("FALSE", dim(train)[1])
glm.pred[glm.probs > 0.5] <- "TRUE"

# Pin the predicted-class levels so the table always has a FALSE row and a
# TRUE row (plus the Sum margin), even if the model predicts only one class.
# The positional indexing below relies on that fixed 3 x 3 layout.
# NOTE(review): the column order follows train$Revenue; the indices assume
# two classes ordered negative, positive -- confirm.
conf_matrix_train <- addmargins(table(
  factor(glm.pred, levels = c("FALSE", "TRUE")),
  train$Revenue,
  dnn = c("Predicted Class", "True Class")
))
conf_matrix_train

# Off-diagonal cells over the grand total.
misc_logR_train <- round(
  (conf_matrix_train[1, 2] + conf_matrix_train[2, 1]) /
    conf_matrix_train[3, 3] * 100,
  2
)
misc_logR_train # Misclassification error on training set (%)

# Predicted-positive share of the true-positive column.
tpr <- round(conf_matrix_train[2, 2] / conf_matrix_train[3, 2] * 100, 2) # TPR (%)
tpr

# Predicted-positive share of the true-negative column.
fpr <- round(conf_matrix_train[2, 1] / conf_matrix_train[3, 1] * 100, 2) # FPR (%)
fpr

round((100 - tpr), 2) # false negative error (%)

In [None]:
library(gridExtra)      # arrange multiple grid-based plots on a page
library(ROCR)           # ROC curves and AUC via prediction() / performance()

In [None]:
# ROC curve and AUC on the training set
# Build the ROCR prediction object once and reuse it for both the curve
# and the AUC instead of recomputing it from the same inputs.
logR_prediction <- prediction(glm.probs, train$Revenue)

# TPR against FPR, coloured by cutoff, with selected cutoffs annotated.
logR_performance <- performance(logR_prediction,
                                measure = "tpr", x.measure = "fpr")
plot(logR_performance, colorize = TRUE, lwd = 2,
     print.cutoffs.at = c(0.2, 0.3, 0.4, 0.5, 0.8))
abline(a = 0, b = 1, lty = 2) # dashed diagonal as the chance reference

# AUC is a single cutoff-independent value, so no x.measure is supplied;
# it is stored as the first element of the y.values slot.
logR_auc <- performance(logR_prediction, measure = "auc")@y.values[[1]]
logR_auc

In [None]:
# Test data
# Count complete and incomplete rows first. NOTE(review): a row with missing
# predictors would get an NA probability and silently keep the "FALSE" label
# below -- confirm the second count is 0.
sum(complete.cases(test))
sum(!complete.cases(test))

# Score the held-out rows and classify at the same 0.5 cutoff used on the
# training set.
glm.probs.val <- predict(logR, newdata = test, type = "response")
glm.pred.val <- rep("FALSE", dim(test)[1])
glm.pred.val[glm.probs.val > 0.5] <- "TRUE"

# Pin the predicted-class levels so the table always has a FALSE row and a
# TRUE row (plus the Sum margin), even if only one class is predicted.
# The positional indexing below relies on that fixed layout.
conf_matrix_val <- addmargins(table(
  factor(glm.pred.val, levels = c("FALSE", "TRUE")),
  test$Revenue,
  dnn = c("Predicted Class", "True Class")
))
conf_matrix_val

# Misclassification error on the test set (%): off-diagonal cells over the
# grand total.
misc_logR_val <- round(
  (conf_matrix_val[1, 2] + conf_matrix_val[2, 1]) /
    conf_matrix_val[3, 3] * 100,
  2
)
misc_logR_val