In [None]:
# For this project, we are analyzing a dataset using R. The dataset I analyze relates to the direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls to clients. I obtained this dataset from the UCI Machine Learning Repository because it meets the requirements of the project. The goal is to predict whether a client will subscribe to a term deposit (variable y). A term deposit is a cash investment held at a financial institution for a fixed period of time. This is relevant because it shows how effective different contact methods are at increasing the likelihood that a client subscribes to a term deposit. 
  
 # I used a portion of this dataset with 4119 observations. There are a total of 21 variables
#including age, job, marital status, education, whether the client has credit in default, has a housing loan, has a personal loan, type of contact, last contact month of the year, day of the week, duration of the phone call, campaign, number of days passed since the previous campaign, number of contacts from the previous campaign, outcome of the previous marketing campaign, employment variation rate, consumer price index, consumer confidence index, number of employees, euribor 3 month rate, and the target variable (y), whether the client subscribed to a term deposit. The target variable is binary. The dataset consists of 9 numeric variables, and the rest are categorical variables.


library("readxl")
# Read the marketing sheet, discard the columns left out of this analysis,
# and store the target plus every categorical predictor as a factor.
bank <- read_excel("C:/Users/student/Downloads/R/bank_rev.xlsx")

dropped_columns <- c(
  "pdays", "emp.var.rate", "euribor3m", "cons.conf.idx", "cons.price.idx"
)
for (column_name in dropped_columns) {
  bank[[column_name]] <- NULL
}

factor_columns <- c(
  "y", "job", "marital", "education", "default", "housing",
  "loan", "contact", "month", "day_of_week", "poutcome"
)
for (column_name in factor_columns) {
  bank[[column_name]] <- factor(bank[[column_name]])
}

# Show the resulting column types
str(bank)


# Count how many clients fall in each class of the target to check for imbalance
library(caret)
summary(bank[["y"]])


#I noticed that my target variable was imbalanced: far more clients declined a term deposit than subscribed to one. I addressed this by undersampling the majority class to balance the training data. After undersampling, I got a balanced accuracy of roughly 87%.

#undersampling
library(ranger)
library(caret)

# Fix the seed so the split, the undersample and the forest are reproducible
set.seed(200)

# Stratified 70/30 split; only the training part is rebalanced, the test part
# keeps the original class proportions.
splitIndex <- createDataPartition(bank$y, p = 0.70, list = FALSE, times = 1)
train_bank <- bank[splitIndex, ]
test_bank <- bank[-splitIndex, ]

# Minority class: clients who subscribed
train1 <- train_bank[train_bank$y == "yes", ]
n1 <- nrow(train1)
table(train1$y)

# Majority class: clients who did not subscribe
train0 <- train_bank[train_bank$y == "no", ]
n0 <- nrow(train0)
table(train0$y)

# Undersample the majority class down to the minority size. Rows are drawn
# WITHOUT replacement so the reduced set holds distinct clients; min() keeps
# the draw valid even if "no" were the smaller class.
train00 <- train0[sample(seq_len(n0), min(n0, n1), replace = FALSE), ]

train_under <- rbind(train00, train1)

# Random forest on the balanced training data, scored on the untouched test set
model_under <- ranger(y ~ ., data = train_under)
pred_under <- predict(model_under, data = test_bank)$predictions

cm_under <- confusionMatrix(pred_under, test_bank$y, positive = "yes")
cm_under$byClass["Balanced Accuracy"]

#graphs
library(ggplot2)
# Bar charts count clients, so their y axis is labelled as a count. Density
# curves use the default (identity) position: dodging a density has no defined
# width and only produces a "Width not defined" warning. Filled densities are
# made semi-transparent so the two classes stay visible where they overlap.
ggplot(bank) +
  geom_bar(mapping = aes(x = y, fill = y), position = "dodge") +
  labs(title = "Term deposit subscriptions", x = "Subscribed to term deposit", y = "Number of clients")
#This shows how many clients did and did not subscribe to a term deposit
ggplot(bank) +
  geom_density(mapping = aes(x = age, color = y)) +
  labs(title = "Age vs term deposit", x = "age", y = "Density")
#This shows that more people between the ages of 30-50 did not subscribe to a term deposit
ggplot(bank) +
  geom_bar(mapping = aes(x = marital, fill = y), position = "dodge") +
  labs(title = "Marital status vs term deposit", x = "marital status", y = "Number of clients")
#This shows that more married people did not subscribe to a term deposit
ggplot(bank) +
  geom_bar(mapping = aes(x = day_of_week, fill = y), position = "dodge") +
  labs(title = "Day of week vs term deposit", x = "day of week", y = "Number of clients")
#This shows that the split between subscribing and not subscribing is fairly even across the days of the week
ggplot(bank) +
  geom_bar(mapping = aes(x = month, fill = y), position = "dodge") +
  labs(title = "Month of last contact vs term deposit", x = "month", y = "Number of clients")
#There are more people who do not subscribe to a term deposit in May
ggplot(bank) +
  geom_bar(mapping = aes(x = loan, fill = y), position = "dodge") +
  labs(title = "Personal loan vs term deposit", x = "has personal loan", y = "Number of clients")
#This shows that most people without a personal loan did not subscribe to a term deposit. There are also around 550 people with a personal loan who did not subscribe.
#NOTE(review): the original comment read this variable as "defaulted on a loan"; the plotted column is `loan`, described in the header as "has a personal loan" - confirm.
ggplot(bank) +
  geom_bar(mapping = aes(x = housing, fill = y), position = "dodge") +
  labs(title = "Housing loan vs term deposit", x = "has housing loan", y = "Number of clients")
#This shows how many people who took out a housing loan did not subscribe to a term deposit
ggplot(bank) +
  geom_density(mapping = aes(x = duration, color = y)) +
  labs(title = "Call duration vs term deposit", x = "duration (seconds)", y = "Density")
#The duration in seconds is how long the employee spoke with the customer over the phone; more people said yes when the phone call was longer.
ggplot(bank) +
  geom_density(mapping = aes(x = campaign, fill = y), alpha = 0.5) +
  labs(title = "Campaign for term deposit", x = "number of times contacted", y = "Density")
#This shows that the more times people were contacted, the more they said no to subscribing to a term deposit.
ggplot(bank) +
  geom_density(mapping = aes(x = nr.employed, fill = y), alpha = 0.5) +
  labs(title = "Number of employees vs term deposit", x = "number of employees", y = "Density")
#This shows that when there were fewer employees working, more people said yes to a term deposit
ggplot(bank) +
  geom_bar(mapping = aes(x = education, fill = y), position = "dodge") +
  labs(title = "Education vs term deposit", x = "education", y = "Number of clients")
#Among the people who said no to a term deposit, the largest group had a university degree.
ggplot(bank) +
  geom_density(mapping = aes(x = previous, color = y)) +
  labs(title = "Previous contacts vs term deposit", x = "number of previous contacts", y = "Density")
#People were more likely to say yes to a term deposit if they had been contacted before this campaign
ggplot(bank) +
  geom_bar(mapping = aes(x = poutcome, fill = y), position = "dodge") +
  labs(title = "Previous campaign outcome vs term deposit", x = "previous outcome", y = "Number of clients")
#In absolute numbers, more of the people who said yes to a term deposit had not been contacted in a previous campaign

#To impute missing values, I imputed numeric by mean and categorical by mode. I then ran four models: decision tree, random forest, glm, and glmnet to see which one provided a better model.

#impute by mean
# Fill the missing values of every column of `data`.
#   - numeric columns: NA is replaced by the column mean of the observed values
#   - all other columns (factor, character, ...): NA is replaced by the most
#     frequent observed value (the mode); ties go to the value seen first
# Columns without NA, and columns that are entirely NA (no mean or mode
# exists), are returned unchanged.
#
# data: a data.frame or tibble
# returns: `data` with the same columns, missing values filled as above
misshandled <- function(data) {
  # seq_len() keeps the loop empty for a zero-column input
  for (i in seq_len(ncol(data))) {
    # [[ ]] extracts the column as a vector for data.frames AND tibbles;
    # data[, i] stays a one-column tibble, for which is.numeric() is FALSE
    column <- data[[i]]
    missing <- is.na(column)
    if (!any(missing) || all(missing)) {
      next
    }
    if (is.numeric(column)) {
      column[missing] <- mean(column, na.rm = TRUE)
    } else {
      # Count only observed values so that NA itself can never be the mode
      observed <- column[!missing]
      candidates <- unique(observed)
      counts <- tabulate(match(observed, candidates))
      column[missing] <- candidates[which.max(counts)]
    }
    data[[i]] <- column
  }
  return(data)
}
# The categorical columns record a missing answer as the level "unknown", not
# as NA, so recode it first; otherwise the mode imputation has nothing to fill.
# as.data.frame() makes column extraction inside misshandled() return plain
# vectors, and droplevels() removes the now-empty "unknown" level.
bank_missing <- as.data.frame(bank)
bank_missing[] <- lapply(bank_missing, function(column) {
  if (is.factor(column)) {
    column[column %in% "unknown"] <- NA
    column <- droplevels(column)
  }
  column
})
bank1 <- misshandled(bank_missing)
# Should print 0 once every missing value has been imputed
sum(is.na(bank1))

#model
# Stratified 70/30 split of the imputed data; the seed fixes the split and
# caret's resampling.
set.seed(200)
splitIndex <- createDataPartition(bank1$y, p = 0.7, list = FALSE, times = 1)
train <- bank1[splitIndex, ]
test <- bank1[-splitIndex, ]

# All four models are fitted on `train` only and scored on `test` with the same
# positive class ("yes" = subscribed), so their confusion matrices are
# directly comparable.
cv_control <- trainControl(method = "cv", number = 7, verboseIter = TRUE)

#decision tree model

model <- train(y ~ ., data = train, method = "rpart")
pred <- predict(model, test)
cm <- confusionMatrix(pred, test$y, positive = "yes")
cm


#random forest

modelrf <- ranger(y ~ ., data = train)
pred1 <- predict(modelrf, data = test)$predictions
cm1 <- confusionMatrix(pred1, test$y, positive = "yes")
cm1


#glm
library(glmnet)
# Fit on the training rows of the imputed data; fitting on the full data set
# would let the model see the held-out rows it is evaluated on.
modelglm <- train(
  y ~ .,
  data = train, method = "glm",
  trControl = cv_control)
pred2 <- predict(modelglm, test)
cm2 <- confusionMatrix(pred2, test$y, positive = "yes")
cm2

#glmnet

modelglmnet <- train(
  y ~ .,
  data = train, method = "glmnet",
  trControl = cv_control)
pred3 <- predict(modelglmnet, test)
cm3 <- confusionMatrix(pred3, test$y, positive = "yes")
cm3

# Treat every "unknown" entry as a missing value, then count the missing cells
unknown_mask <- bank == "unknown"
bank[unknown_mask] <- NA
sum(is.na(bank))

#Another way I tried to handle missing values was to exclude the rows that contain them. The models gave similar results, but imputing by mean and mode generally gave better models.

#exclude missing data
# Keep only the rows without any missing value
bank3 <- na.exclude(bank)
sum(is.na(bank3))

#model
# Stratified 70/30 split; the seed fixes the split and caret's resampling
set.seed(200)
splitIndex <- createDataPartition(bank3$y, p = 0.7, list = FALSE, times = 1)
train <- bank3[splitIndex, ]
test <- bank3[-splitIndex, ]

# Every model is scored with "yes" (subscribed) as the positive class. The
# factor labels are left untouched, so no model is evaluated against swapped
# labels and the source data `bank` is not modified.
cv_control <- trainControl(method = "cv", number = 7, verboseIter = TRUE)

#decision tree model

model <- train(y ~ ., data = train, method = "rpart")
pred <- predict(model, test)
cm <- confusionMatrix(pred, test$y, positive = "yes")
cm


#random forest

modelrf <- ranger(y ~ ., data = train)
pred1 <- predict(modelrf, data = test)$predictions
cm1 <- confusionMatrix(pred1, test$y, positive = "yes")
cm1


#glm
library(glmnet)
modelglm <- train(
  y ~ .,
  data = train, method = "glm",
  trControl = cv_control)
pred2 <- predict(modelglm, test)
cm2 <- confusionMatrix(pred2, test$y, positive = "yes")
cm2

#glmnet

modelglmnet <- train(
  y ~ .,
  data = train, method = "glmnet",
  trControl = cv_control)
pred3 <- predict(modelglmnet, test)
cm3 <- confusionMatrix(pred3, test$y, positive = "yes")
cm3

#I also recoded the categorical variables that had 10 or more categories. I started by grouping months into seasons. I then grouped job into low-paying jobs, high-paying jobs, and unemployed. I also grouped education into before high school, high school degree, university degree, and unknown. After recoding these variables, I ran the models again and found that it improved the accuracy of the glm and glmnet models by 1%.

#recoding categorical variables
bank4 <- bank3
library(car)
# Months are grouped into seasons; car::recode looks the *.set vectors up by name
Spring.set <- c("mar", "apr", "may")
Summer.set <- c("jun", "jul", "aug")
Autumn.set <- c("sep", "oct", "nov")
Winter.set <- c("dec")
bank4$month <- car::recode(bank4$month, "Spring.set='Spring'; Summer.set='Summer'; Autumn.set='Autumn'; Winter.set='Winter'")

# The two assignments below relabel the existing levels BY POSITION, so they
# are only valid while the factors still have the expected number of levels.
stopifnot(nlevels(bank4$job) == 12, nlevels(bank4$education) == 8)
levels(bank4$job) <- c("low class", "low class", "high class", "low class", "high class", "unemployed", "low class", "low class", "low class", "low class", "unemployed", "unknown")
# NOTE(review): the 6th education level is merged into "unknown" together with
# the 8th - confirm that this grouping is intended.
levels(bank4$education) <- c("before high school", "before high school", "before high school", "high school", "before high school", "unknown", "college", "unknown")

#model
# Stratified 70/30 split; the seed fixes the split and caret's resampling
set.seed(200)
splitIndex <- createDataPartition(bank4$y, p = 0.7, list = FALSE, times = 1)
train <- bank4[splitIndex, ]
test <- bank4[-splitIndex, ]

# Every model is scored with "yes" (subscribed) as the positive class. The
# factor labels are left untouched, so no model is evaluated against swapped
# labels and the source data `bank` is not modified.
cv_control <- trainControl(method = "cv", number = 7, verboseIter = TRUE)

#decision tree model

model <- train(y ~ ., data = train, method = "rpart")
pred <- predict(model, test)
cm <- confusionMatrix(pred, test$y, positive = "yes")
cm


#random forest

modelrf <- ranger(y ~ ., data = train)
pred1 <- predict(modelrf, data = test)$predictions
cm1 <- confusionMatrix(pred1, test$y, positive = "yes")
cm1


#glm
library(glmnet)
modelglm <- train(
  y ~ .,
  data = train, method = "glm",
  trControl = cv_control)
pred2 <- predict(modelglm, test)
cm2 <- confusionMatrix(pred2, test$y, positive = "yes")
cm2

#glmnet

modelglmnet <- train(
  y ~ .,
  data = train, method = "glmnet",
  trControl = cv_control)
pred3 <- predict(modelglmnet, test)
cm3 <- confusionMatrix(pred3, test$y, positive = "yes")
cm3

#I then tried encoding categorical variables by dummy coding. After using this method, my models did not improve.

#encoding
# Start from the NA-free data. It has no missing values left, so no median
# imputation is needed. droplevels() removes levels that no longer occur
# (e.g. "unknown") so they do not become all-zero dummy columns.
bank5 <- droplevels(as.data.frame(bank3))

#model
# Stratified 70/30 split; the seed fixes the split and caret's resampling
set.seed(200)
splitIndex <- createDataPartition(bank5$y, p = 0.7, list = FALSE, times = 1)
train_raw <- bank5[splitIndex, ]
test_raw <- bank5[-splitIndex, ]

# Learn the dummy coding from the training rows only and apply the same coding
# to both parts. fullRank = TRUE drops one level per factor so the dummy
# columns are not perfectly collinear. predict() returns the predictors only,
# so the target is added back.
dummies_model <- dummyVars(y ~ ., data = train_raw, fullRank = TRUE)
train <- data.frame(predict(dummies_model, newdata = train_raw), y = train_raw$y)
test <- data.frame(predict(dummies_model, newdata = test_raw), y = test_raw$y)

# Every model is fitted on the encoded training data and scored with "yes"
# (subscribed) as the positive class.
cv_control <- trainControl(method = "cv", number = 7, verboseIter = TRUE)

#decision tree model

model <- train(y ~ ., data = train, method = "rpart")
pred <- predict(model, test)
cm <- confusionMatrix(pred, test$y, positive = "yes")
cm


#random forest

modelrf <- ranger(y ~ ., data = train)
pred1 <- predict(modelrf, data = test)$predictions
cm1 <- confusionMatrix(pred1, test$y, positive = "yes")
cm1


#glm
library(glmnet)
modelglm <- train(
  y ~ .,
  data = train, method = "glm",
  trControl = cv_control)
pred2 <- predict(modelglm, test)
cm2 <- confusionMatrix(pred2, test$y, positive = "yes")
cm2

#glmnet

modelglmnet <- train(
  y ~ .,
  data = train, method = "glmnet",
  trControl = cv_control)
pred3 <- predict(modelglmnet, test)
cm3 <- confusionMatrix(pred3, test$y, positive = "yes")
cm3

#After running all the models, I found that the best model was glmnet after imputing numeric variables by mean and categorical variables by mode. I wanted to see if I could improve my best model by tuning, so I tuned the glmnet to see if it would give me a better result, however it did not improve the model. In conclusion, the best predicting model is the glmnet after imputing numeric variables by mean and categorical variables by mode. 

#glmnet tuned
# Tune on the mean/mode-imputed data (bank1), the data behind the model
# reported as best, with the same seed and 70/30 split as the untuned run so
# the two results are comparable.
set.seed(200)
splitIndex <- createDataPartition(bank1$y, p = 0.7, list = FALSE, times = 1)
train <- bank1[splitIndex, ]
test <- bank1[-splitIndex, ]

# Candidate settings: ridge (alpha = 0) and lasso (alpha = 1), each with ten
# penalty strengths between 0.0001 and 1.
myGrid <- expand.grid(alpha = 0:1, lambda = seq(0.0001, 1, length.out = 10))

# tuneGrid alone defines the search, so no tuneLength is passed
modelglmnettun <- train(
  y ~ .,
  data = train, method = "glmnet",
  trControl = trainControl(method = "cv", number = 4, verboseIter = TRUE),
  tuneGrid = myGrid)
pred <- predict(modelglmnettun, test)
# "yes" (subscribed) is the positive class, as for the other models
cm <- confusionMatrix(pred, test$y, positive = "yes")
cm




Classes 'tbl_df', 'tbl' and 'data.frame':	4119 obs. of  16 variables:
 $ age        : num  30 39 25 38 47 32 32 41 31 35 ...
 $ job        : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
 $ marital    : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
 $ education  : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
 $ default    : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 1 1 2 1 2 ...
 $ housing    : Factor w/ 3 levels "no","unknown",..: 3 1 3 2 3 1 3 3 1 1 ...
 $ loan       : Factor w/ 3 levels "no","unknown",..: 1 1 1 2 1 1 1 1 1 1 ...
 $ contact    : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
 $ month      : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
 $ day_of_week: Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
 $ duration   : num  487 346 227 17 58 128 290 44 68 170 ...
 $ campaign   : num  2 4 1 3 1 3 4 2 1 1 ...
 $ previous   : num  0 0 0 0 0 

Loading required package: lattice
Loading required package: ggplot2



 no yes 
  0 316 


  no  yes 
2568    0 

"Width not defined. Set with `position_dodge(width = ?)`"