########################################################################
###### Data Pre-processing
########################################################################
library(RCurl)
# Read CSV
data <- read.csv(text = getURL("https://raw.githubusercontent.com/joshkyh/amss/master/ObservedData.csv"))
data <- data[, c('revenue', 'tv.spend', 'search.spend')]
# Simplify column names
names(data) <- c('revenue', 'tv', 'search')
# Create lagged revenue variables (1 week, 1 quarter and 1 year back)
library(Hmisc)
data$revenue.lag1y <- Lag(data$revenue, 52)
data$revenue.lag3m <- Lag(data$revenue, 13)
data$revenue.lag1w <- Lag(data$revenue, 1)
data$tv_lag1w <- Lag(data$tv, 1)
data$search_lag1w <- Lag(data$search, 1)
# Create moving averages over the previous quarter and previous year
library(zoo)
data$tv_p3m <- rollmean(data$tv, k=13, fill=NA, align = "right")
data$search_p3m <- rollmean(data$search, k=13, fill= NA, align = "right")
data$tv_p1y <- rollmean(data$tv, k=52, fill=NA, align = "right")
data$search_p1y <- rollmean(data$search, k=52, fill= NA, align = "right")
# Drop the first year of data (rows left NA by the 52-week lag and rolling means)
data <- data[53:nrow(data),]
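# Optional sanity check (not part of the original script): after dropping the
# first 52 weeks, the lag and rolling-mean features should be fully populated,
# assuming the source CSV itself has no missing values.
stopifnot(!anyNA(data))
summary(data)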
########################################################################
###### Check for collinearity and multicollinearity
########################################################################
library(corrplot)
corrplot(cor(data), method = "number") # Display the correlation coefficient
# From correlation matrix analysis, we remove revenue.lag1w
data <- data[ , !names(data) %in% c("revenue.lag1w")]
library(car)
lm.fit <- lm(revenue ~ ., data = data)
vif(lm.fit)
# From vif analysis, we remove search_p3m
data <- data[ , !names(data) %in% c("search_p3m")]
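# Optional follow-up (an assumption, not in the original analysis): re-check VIF
# after dropping search_p3m to confirm the remaining predictors are acceptable.
vif(lm(revenue ~ ., data = data))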
########################################################################
###### Cross Validation
########################################################################
library(BAS)
library(Metrics)
library(glmnet)
# This data frame holds the per-row RMSE from leave-one-out cross-validation
rmse_df <- data.frame(bma   = rep(0, NROW(data)),
                      hpm   = rep(0, NROW(data)),
                      mpm   = rep(0, NROW(data)),
                      lasso = rep(0, NROW(data)))
# Prepare the lambda grid, model matrix and response vector for lasso
grid <- 10^seq(10, -2, length = 100)
x <- model.matrix(revenue ~ ., data)[, -1]
y <- data$revenue
# Leave-one-out cross-validation: each row takes a turn as the one-row test set,
# the remaining rows fit the Bayesian and lasso models, and the test-set RMSE
# is recorded in rmse_df.
for (i in seq_len(NROW(data))) {
  print(i)
  train <- data[-i, ]
  test  <- data[i, ]
  # Bayesian model averaging fit on the training rows
  bas.fit <- bas.lm(revenue ~ .,
                    data = train,
                    prior = "ZS-null",
                    modelprior = uniform(),
                    method = "deterministic")
  rmse_df$bma[i] <- rmse(predict(bas.fit, newdata = test, estimator = 'BMA')$fit, test$revenue)
  rmse_df$hpm[i] <- rmse(predict(bas.fit, newdata = test, estimator = 'HPM')$fit, test$revenue)
  rmse_df$mpm[i] <- rmse(predict(bas.fit, newdata = test, estimator = 'MPM')$fit, test$revenue)
  # Lasso: pick lambda by inner cross-validation on the training rows only
  cv.out  <- cv.glmnet(x[-i, ], y[-i], alpha = 1, lambda = grid)
  bestlam <- cv.out$lambda.min
  lasso.mod  <- glmnet(x[-i, ], y[-i], alpha = 1, lambda = grid)
  lasso.pred <- predict(lasso.mod, newx = matrix(x[i, ], nrow = 1), s = bestlam, type = "response")
  rmse_df$lasso[i] <- rmse(lasso.pred, y[i])
}
# Compare mean RMSE across methods, plot bar charts with error bars,
# and t-test the difference in performance
mean(rmse_df$bma)
mean(rmse_df$hpm)
mean(rmse_df$mpm)
mean(rmse_df$lasso)
mean(data$revenue)
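# A hedged convenience (not in the original analysis): express each method's mean
# RMSE relative to mean revenue, which makes the errors easier to compare at a glance.
colMeans(rmse_df) / mean(data$revenue)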
library(reshape2)
library(ggplot2)
rmse_melt <- melt(rmse_df, measure.vars = 1:4)
rmse_melt$value <- rmse_melt$value / 10^6  # rescale to millions for readability
# Note: summarySE() is defined at the bottom of this script; run that definition first
tgc <- summarySE(rmse_melt, measurevar = "value", groupvars = c("variable"))
tgc2 <- tgc
tgc2$variable <- factor(tgc2$variable)
ggplot(tgc2, aes(x = variable, y = value)) +
  geom_bar(position = position_dodge(), stat = "identity",
           colour = "black",   # use black outlines
           size = .3) +        # thinner lines
  geom_errorbar(aes(ymin = value - se, ymax = value + se),
                size = .3,     # thinner lines
                width = .2,
                position = position_dodge(.9)) +
  xlab("Method") +
  ylab("RMSE (millions)") +
  ggtitle("Mean RMSE by method (error bars show standard error)") +
  theme_bw()
t.test(x = rmse_df$mpm, y = rmse_df$lasso)
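# Because every method is scored on the same held-out row in each LOOCV fold, the
# errors are paired by row; a paired t-test is arguably the more appropriate
# comparison. Shown here as a sketch, not part of the original analysis.
t.test(rmse_df$mpm, rmse_df$lasso, paired = TRUE)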
# Refit the Bayesian model on the full data set for coefficient comparison
bas.fit <- bas.lm(revenue ~ .,
                  data = data,
                  prior = "ZS-null",
                  modelprior = uniform(),
                  method = "deterministic")
# Side-by-side posterior means under BMA, HPM and MPM
View(cbind.data.frame(
  variables = c("intercept", names(data)[2:10]),
  BMA = coef(bas.fit, estimator = "BMA")$postmean,
  HPM = coef(bas.fit, estimator = "HPM")$postmean,
  MPM = coef(bas.fit, estimator = "MPM")$postmean
))
# Refit the lasso on the full data set at the cross-validated lambda
cv.out <- cv.glmnet(x, y, alpha = 1, lambda = grid)
bestlam <- cv.out$lambda.min
lasso.mod <- glmnet(x, y, alpha = 1, lambda = bestlam)
predict(lasso.mod, s = bestlam, type = "coefficients")
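# Optional (an assumption, not in the original script): list only the predictors
# the lasso retains. coef() on a single-lambda glmnet fit returns a sparse matrix,
# so convert it before filtering out the zeroed coefficients.
lasso.coef <- as.matrix(coef(lasso.mod))
lasso.coef[lasso.coef[, 1] != 0, , drop = FALSE]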
summary(bas.fit)
plot(bas.fit, which = 1, add.smooth = FALSE)  # residuals vs fitted values
plot(bas.fit, which = 2, add.smooth = FALSE)  # cumulative model probabilities
abline(h = 1)
plot(bas.fit, which = 3)                      # model complexity vs log marginal likelihood
plot(bas.fit, which = 4)                      # marginal posterior inclusion probabilities
image(bas.fit, rotate = FALSE)                # visualisation of the model space
coef.ZS <- coef(bas.fit)
plot(coef.ZS)                                 # posterior distributions of the coefficients
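# Optional sketch (not in the original script): 95% credible intervals for the
# coefficients. confint() has a method for BAS coefficient objects in recent
# versions of the package; treat this as an assumption if your version differs.
plot(confint(coef.ZS))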
## SummarySE from http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/#Helper%20functions
## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
## data: a data frame.
## measurevar: the name of a column that contains the variable to be summarized
## groupvars: a vector containing names of columns that contain grouping variables
## na.rm: a boolean that indicates whether to ignore NA's
## conf.interval: the percent range of the confidence interval (default is 95%)
summarySE <- function(data = NULL, measurevar, groupvars = NULL, na.rm = FALSE,
                      conf.interval = .95, .drop = TRUE) {
  library(plyr)

  # New version of length which can handle NA's: if na.rm == TRUE, don't count them
  length2 <- function(x, na.rm = FALSE) {
    if (na.rm) sum(!is.na(x))
    else       length(x)
  }

  # This does the summary. For each group's data frame, return a vector with
  # N, mean, and sd
  datac <- ddply(data, groupvars, .drop = .drop,
                 .fun = function(xx, col) {
                   c(N    = length2(xx[[col]], na.rm = na.rm),
                     mean = mean(xx[[col]], na.rm = na.rm),
                     sd   = sd(xx[[col]], na.rm = na.rm))
                 },
                 measurevar)

  # Rename the "mean" column
  datac <- rename(datac, c("mean" = measurevar))

  datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean

  # Confidence interval multiplier for standard error
  # Calculate t-statistic for confidence interval:
  # e.g., if conf.interval is .95, use .975 (above/below), and use df = N-1
  ciMult <- qt(conf.interval / 2 + .5, datac$N - 1)
  datac$ci <- datac$se * ciMult

  return(datac)
}