Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
branch: master
521 lines (439 sloc) 21.585 kb
### Kaggle Competition: Allstate Purchase Prediction Challenge
### Code by Kevin Markham
### https://github.com/justmarkham/kaggle-allstate
# Notes about this file:
# 1. Most of the code is explained in my paper:
# https://github.com/justmarkham/kaggle-allstate/blob/master/allstate-paper.md
# 2. For the sake of brevity, a lot of my exploratory code is not included.
# 3. Many of the variables are poorly named, my apologies.
# 4. The code assumes that "train.csv" and "test_v2.csv" are in your working
# directory.
# 5. To generate my best set of predictions, simply load the required libraries
# and then run the "Loading Data" section and "Model Building Part 10". All
# other code is irrelevant.
# Load required libraries
library(dplyr)
library(caret)
library(ROCR)
library(randomForest)
library(nnet)
library(ggplot2)
## LOADING DATA AND ADDING FEATURES ##
# Define column classes for reading in the data sets
colClasses <- c(rep("integer", 4), "character", rep("factor", 2),
"integer", "factor", "integer", "factor", rep("integer", 3),
rep("factor", 2), "integer", rep("factor", 7), "integer")
# Function for pre-processing and feature engineering
preprocess <- function(data) {
# add features
data$plan <- paste0(data$A, data$B, data$C, data$D, data$E, data$F, data$G)
data$hour <- as.integer(substr(data$time, 1, 2))
data$timeofday <- as.factor(ifelse(data$hour %in% 6:15, "day",
ifelse(data$hour %in% 16:18, "evening",
"night")))
data$weekend <- as.factor(ifelse(data$day %in% 0:4, "No", "Yes"))
data$family <- as.factor(ifelse(data$group_size > 2 & data$age_youngest <
25 & data$married_couple==1, "Yes", "No"))
data$agediff <- data$age_oldest-data$age_youngest
data$individual <- as.factor(ifelse(data$agediff==0 & data$group_size==1,
"Yes", "No"))
data$stategroup <- as.factor(ifelse(data$state %in% c("SD","ND"), "g1",
ifelse(data$state %in% c("AL","WY"), "g2",
ifelse(data$state %in% c("OK","ME","AR",
"WV"), "g3",
ifelse(data$state %in% c("DC","NE","GA"),
"g5", "g4")))))
# fix NA's for duration_previous and C_previous
data$duration_previous[is.na(data$duration_previous)] <- 0
levels(data$C_previous) <- c("1", "2", "3", "4", "none")
data$C_previous[is.na(data$C_previous)] <- "none"
# replace risk_factor NA's by predicting a value
datanorisk <- data[is.na(data$risk_factor), ]
datarisk <- data[!is.na(data$risk_factor), ]
lm.fit <- lm(risk_factor ~ age_youngest*group_size+married_couple+
homeowner, data=datarisk)
lm.pred <- predict(lm.fit, newdata=datanorisk)
data$risk_factor[is.na(data$risk_factor)] <- round(lm.pred, 0)
# for car_age greater than 30, "round down" to 30
data$car_age[data$car_age > 30] <- 30
return(data)
}
# read in training set
train <- read.csv("train.csv", colClasses=colClasses)
train <- preprocess(train)
# trainsub is subset of train that only includes purchases
trainsub <- train[!duplicated(train$customer_ID, fromLast=TRUE), ]
# trainex is subset of train that excludes purchases
trainex <- train[duplicated(train$customer_ID, fromLast=TRUE), ]
# trainex2 only includes last quote before purchase
trainex2 <- trainex[!duplicated(trainex$customer_ID, fromLast=TRUE), ]
# changed is anyone who changed from their last quote
changed <- ifelse(trainsub$plan == trainex2$plan, "No", "Yes")
changelog <- ifelse(trainsub$plan == trainex2$plan, FALSE, TRUE)
trainsub$changed <- as.factor(changed)
trainex2$changed <- as.factor(changed)
trainsub$changelog <- changelog
trainex2$changelog <- changelog
# compute "stability" feature from trainex and add to trainex2
customerstability <- trainex %.% group_by(customer_ID) %.%
summarise(quotes=n(), uniqueplans=n_distinct(plan),
stability=(quotes-uniqueplans+1)/quotes)
trainex2$stability <- customerstability$stability
# compute "planfreq" feature on trainex2
nrowtrainex2 <- nrow(trainex2)
planfreqs <- trainex2 %.% group_by(plan) %.%
summarise(planfreq=n()/nrowtrainex2)
trainex2 <- left_join(trainex2, planfreqs)
# trainex3 is identical to trainex2 but also includes purchases
trainex3 <- cbind(trainex2, Apurch=trainsub$A, Bpurch=trainsub$B,
Cpurch=trainsub$C, Dpurch=trainsub$D, Epurch=trainsub$E, Fpurch=trainsub$F,
Gpurch=trainsub$G, planpurch=trainsub$plan, stringsAsFactors=FALSE)
# read in test set
test <- read.csv("test_v2.csv", colClasses=colClasses)
test <- preprocess(test)
# fix locations that are NA
s <- split(test$location, test$state)
s2 <- sapply(s, function(x) x[1])
NAstates <- test[is.na(test$location), "state"]
NAlocations <- s2[NAstates]
test$location[is.na(test$location)] <- NAlocations
# add "changed" variable and default to No
test$changed <- factor(rep("No", nrow(test)), levels=c("No", "Yes"))
# testsub only shows last (known) quote before purchase
testsub <- test[!duplicated(test$customer_ID, fromLast=TRUE), ]
# compute "stability" feature from test and add to testsub
customerstability <- test %.% group_by(customer_ID) %.% summarise(quotes=n(),
uniqueplans=n_distinct(plan), stability=(quotes-uniqueplans+1)/quotes)
testsub$stability <- customerstability$stability
# compute "planfreq" feature on testsub
nrowtestsub <- nrow(testsub)
planfreqs <- testsub %.% group_by(plan) %.% summarise(planfreq=n()/nrowtestsub)
testsub <- left_join(testsub, planfreqs)
## DATA EXPLORATION ##
# check for NA values
sapply(train, function(x) mean(is.na(x)))
# risk_factor, C_previous, duration_previous
sapply(test, function(x) mean(is.na(x)))
# risk_factor, C_previous, duration_previous, location
uniquetrainplan <- unique(train$plan)
uniquetestplan <- unique(test$plan)
# plans in train: 1809
# plans in test: 1596
# union: 1878 (69 plans in test that are not in train)
# intersection: 1527
## VISUALIZATIONS ##
# Viz 1: Number of shopping points
shoptrain <- data.frame(maxpoint=trainex2$shopping_pt, dataset=rep("train",
nrow(trainex2)))
shoptest <- data.frame(maxpoint=testsub$shopping_pt, dataset=rep("test",
nrow(testsub)))
shoppingpoints <- rbind(shoptrain, shoptest)
shoppingpoints$dataset <- as.factor(shoppingpoints$dataset)
ggplot(shoppingpoints) + aes(factor(maxpoint)) + geom_histogram() +
facet_grid(dataset ~ .) + labs(x="Number of Shopping Points",
y="Frequency", title="Comparing Number of Shopping Points in
Training vs Test Sets")
ggsave("allstate-viz-1.png")
# Viz 2: Predictive power of final quote before purchase
s <- split(trainex2, trainex2$shopping_pt)
s2 <- sapply(s, function(x) sum(x$changed=="No")/nrow(x))
s2b <- sapply(s, nrow)
acclastentry <- data.frame(numshoppoints=as.integer(names(s2)), accuracy=s2,
Observations=s2b)
ggplot(acclastentry) + aes(numshoppoints, accuracy, size=Observations) +
geom_point() + geom_line(size=0.5) + scale_x_continuous(breaks=1:12) +
theme(panel.grid.minor=element_blank()) + labs(x="Number of Shopping
Points", y="Prediction Accuracy", title="Effect of Number of Shopping
Points on Predictive Power of Last Quote")
ggsave("allstate-viz-2.png")
# Viz 3: Effect of purchase hour on the predictive power of the final quote
s3 <- split(trainsub, trainsub$hour)
s4 <- sapply(s3, function(x) sum(x$changed=="Yes")/nrow(x))
s5 <- as.data.frame(table(trainsub$hour))$Freq
changebyhour <- data.frame(hour=as.integer(names(s4)),
percentchanged=s4, count=s5)
ggplot(changebyhour) + aes(hour, percentchanged, color=count) +
geom_point(size=4) + labs(x="Hour of Purchase", y="Percent Changed",
title="Effect of Purchase Hour on Likelihood of Changing from Last Quote")
ggsave("allstate-viz-3.png")
# Viz 4: Dependencies between options
C_names <- list("1"="C=1", "2"="C=2", "3"="C=3", "4"="C=4")
C_labeller <- function(variable, value){ return(C_names[value]) }
ggplot(trainsub, aes(D)) + geom_bar() + facet_grid(. ~ C,
labeller=C_labeller) + labs(x="Customer Selection of Option D (1, 2, 3)",
y="Frequency", title="Customer selection of Option D based on their
selection for Option C")
ggsave("allstate-viz-4.png")
# Viz 5: Clustering of states
# based on: http://is-r.tumblr.com/post/37708137014/us-state-maps-using-map-data
states <- map_data("state")
states$grp <- as.factor(ifelse(states$region %in% c("south dakota",
"north dakota"), "1 (least likely)",
ifelse(states$region %in% c("alabama","wyoming"), "2",
ifelse(states$region %in% c("oklahoma","maine","arkansas","west virginia"),
"3",
ifelse(states$region %in% c("colorado","connecticut","delaware","florida",
"iowa","idaho","indiana","kansas","kentucky","maryland","missouri",
"mississippi","montana","new hampshire","new mexico","nevada",
"new york","ohio","oregon","pennsylvania","rhode island","tennessee",
"utah","washington","wisconsin"), "4",
ifelse(states$region %in% c("district of columbia","nebraska","georgia"),
"5 (most likely)", "unassigned"))))))
ggplot(states) + aes(x=long, y=lat, group=group, fill=grp) +
geom_polygon(color="black") + theme_bw() +
theme(panel.border=element_blank()) + scale_y_continuous(breaks=c()) +
scale_x_continuous(breaks=c()) + labs(title="Clustering of States
Based on Customer Likelihood of Changing from Last Quote", fill="Cluster",
x="", y="") + scale_fill_brewer(palette="Pastel1")
ggsave("allstate-viz-5.png")
## MODEL BUILDING ##
## Note: Each "PART" is explained in the paper
# PART 1:
# Submit the baseline
pred <- data.frame(customer_ID = testsub$customer_ID, plan = testsub$plan)
write.csv(pred, file="submit1.csv", row.names=FALSE, quote=FALSE) # 0.53793
# PART 2:
# Rule-based predictions
# Example: if C=4, then change D to 3
testsub$D[testsub$C==4] <- 3
filter(testsub, C==4, D!=3)
pred <- data.frame(customer_ID = testsub$customer_ID,
plan = paste0(testsub$A, testsub$B, testsub$C, testsub$D,
testsub$E, testsub$F, testsub$G))
write.csv(pred, file="submit2.csv", row.names=FALSE, quote=FALSE) # 0.53769
# PART 4:
# logistic regression for predicting "changed"
glm.fit <- glm(changed ~ state+cost+A+C+D+E+F+G+age_oldest+age_youngest+
car_value+car_age+shopping_pt+timeofday+weekend+risk_factor+C_previous+
duration_previous+stability+planfreq, data=trainex2, family=binomial)
summary(glm.fit)
glm.probs <- predict(glm.fit, type="response")
glm.pred <- ifelse(glm.probs>0.5, "Yes", "No")
confusionMatrix(glm.pred, trainex2$changed, "Yes")
predob <- prediction(glm.probs, trainex2$changed)
acc <- performance(predob, "acc"); plot(acc)
prec <- performance(predob, "prec"); plot(prec)
roc <- performance(predob, "tpr", "fpr"); plot(roc)
# 5-fold CV for logistic regression
set.seed(5)
folds <- sample(rep(1:5, length = nrow(trainex2)))
for(k in 1:5) {
fit <- glm(changed ~ state+cost+A+C+D+E+F+G+age_oldest+age_youngest+
car_value+car_age+shopping_pt+timeofday+weekend+risk_factor+C_previous+
duration_previous+stability, data=trainex2[folds!=k, ],
family=binomial)
probs <- predict(fit, newdata=trainex2[folds==k, ], type="response")
pred <- ifelse(probs>0.5, "Yes", "No")
print(mean(pred==trainex2$changed[folds==k]))
}
# random forests for predicting "changed"
rf.fit <- randomForest(changed ~ stategroup+cost+A+C+D+E+F+G+age_oldest+
age_youngest+car_value+car_age+shopping_pt+timeofday+weekend+risk_factor+
C_previous+duration_previous+stability+planfreq, data=trainex2, mtry=5)
rf.pred <- ifelse(rf.fit$votes[, 2]>0.5, "Yes", "No")
confusionMatrix(rf.pred, trainex2$changed, "Yes")
# PART 5:
# 5-fold CV for logistic regression - for precision at 0.85 cutoff
set.seed(5)
folds <- sample(rep(1:5, length = nrow(trainex2)))
predyestotal <- 0
tptotal <- 0
for(k in 1:5) {
fit <- glm(changed ~ state+cost+A+C+D+E+F+G+age_oldest+age_youngest+
car_value+car_age+shopping_pt+timeofday+weekend+risk_factor+C_previous+
duration_previous+stability+planfreq, data=trainex2[folds!=k, ],
family=binomial)
probs <- predict(fit, newdata=trainex2[folds==k, ], type="response")
pred <- as.factor(ifelse(probs>0.85, "Yes", "No"))
predyes <- sum(pred=="Yes")
predyestotal <- predyestotal + predyes
actual <- trainex2$changed[folds==k]
actualwhenpredyes <- actual[pred=="Yes"]
tp <- sum(actualwhenpredyes=="Yes")
tptotal <- tptotal + tp
}
print(tptotal)
print(predyestotal)
print(tptotal/predyestotal)
# train model on trainex2 and predict changed on testsub
glm.fit <- glm(changed ~ state+cost+A+C+D+E+F+G+age_oldest+age_youngest+
car_value+car_age+shopping_pt+timeofday+weekend+risk_factor+C_previous+
duration_previous+stability, data=trainex2, family=binomial)
glm.probs <- predict(glm.fit, newdata=testsub, type="response")
glm.pred <- ifelse(glm.probs>0.85, "Yes", "No")
# update "changed" variable on testsub to reflect prediction
testsub$changed <- as.factor(glm.pred)
# for records predicting changed, planpred = 9999999
testsub$planpred <- ifelse(testsub$changed=="Yes", "9999999", testsub$plan)
pred <- data.frame(customer_ID = testsub$customer_ID, plan = testsub$planpred)
write.csv(pred, file="submit6.csv", row.names=FALSE, quote=FALSE)
# 52 changes, 16 affect leaderboard
# 0.53769 means 4 below baseline
# PART 6:
# 5-fold CV for random forests - for precision at 0.85 cutoff
set.seed(5)
folds <- sample(rep(1:5, length = nrow(trainex2)))
predyestotal <- 0
tptotal <- 0
for(k in 1:5) {
fit <- randomForest(changed ~ stategroup+cost+A+C+D+E+F+G+age_oldest+
age_youngest+car_value+car_age+shopping_pt+timeofday+weekend+
risk_factor+C_previous+duration_previous+stability+planfreq,
data=trainex2[folds!=k, ], mtry=5)
votes <- predict(fit, newdata=trainex2[folds==k, ], type="vote")
pred <- as.factor(ifelse(votes[, 2]>0.85, "Yes", "No"))
predyes <- sum(pred=="Yes")
predyestotal <- predyestotal + predyes
actual <- trainex2$changed[folds==k]
actualwhenpredyes <- actual[pred=="Yes"]
tp <- sum(actualwhenpredyes=="Yes")
tptotal <- tptotal + tp
}
print(tptotal)
print(predyestotal)
print(tptotal/predyestotal)
# PART 7 and PART 8:
# multinom to predict A (repeat for each option)
a.mn.fit <- multinom(A ~ .-customer_ID-record_type-day-time-location-plan-
hour-agediff-stategroup, data=trainex2)
a.mn.pred <- predict(a.mn.fit, newdata=trainex2, type="class")
confusionMatrix(a.mn.pred, trainsub$A)
# multinom to predict Apurch (repeat for each option)
a.mn.fit <- multinom(Apurch ~ .-customer_ID-record_type-day-time-location-
plan-hour-agediff-stategroup-Bpurch-Cpurch-Dpurch-Epurch-Fpurch-Gpurch-
planpurch, data=trainex3)
a.mn.pred <- predict(a.mn.fit, newdata=trainex3, type="class")
mean(a.mn.pred==trainex3$Apurch)
mean(a.mn.pred==trainex3$A)
# random forests to predict A (repeat for each option)
a.rf.fit <- randomForest(A ~ .-customer_ID-record_type-day-time-location-
plan-hour-agediff-state, data=trainex2, importance=FALSE)
mean(a.rf.fit$predicted==trainex2$A)
mean(a.rf.fit$predicted==trainsub$A)
# random forests to predict Apurch (repeat for each option)
a.rf.fit <- randomForest(Apurch ~ .-customer_ID-record_type-day-time-location-
plan-hour-agediff-state-Bpurch-Cpurch-Dpurch-Epurch-Fpurch-Gpurch-
planpurch, data=trainex3, importance=FALSE)
mean(a.rf.fit$predicted==trainex3$Apurch)
mean(a.rf.fit$predicted==trainex3$A)
# random forests to predict Apurch, trained only on subset for which change is
# predicted (repeat for each option)
trainex4 <- trainex3[trainex3$changed=="Yes", ]
a.rf.fit <- randomForest(Apurch ~ .-customer_ID-record_type-day-time-location-
plan-hour-agediff-state-Bpurch-Cpurch-Dpurch-Epurch-Fpurch-Gpurch-
planpurch, data=trainex4, importance=FALSE)
mean(a.rf.fit$predicted==trainex4$Apurch)
mean(a.rf.fit$predicted==trainex4$A)
# combine option predictions for A through G
pred.plan <- paste0(a.rf.fit$predicted, b.rf.fit$predicted, c.rf.fit$predicted,
d.rf.fit$predicted, e.rf.fit$predicted, f.rf.fit$predicted,
g.rf.fit$predicted)
# for people I predict changed in train, did I get it right?
predchange.ix <- which(glm.pred=="Yes")
predchange.cid <- trainex2[predchange.ix, "customer_ID"]
for (i in 1:length(predchange.cid)) {
# customer_ID
print(predchange.cid[i])
# final quote
print(trainex2[trainex2$customer_ID==predchange.cid[i], "plan"])
# predicted purchase
print(pred.plan[predchange.ix[i]])
# actual purchase
print(trainsub[trainsub$customer_ID==predchange.cid[i], "plan"])
}
# PART 9:
# train model on trainex2 and predict changed on testsub
glm.fit <- glm(changed ~ state+cost+A+C+D+E+F+G+age_oldest+age_youngest+
car_value+car_age+shopping_pt+timeofday+weekend+risk_factor+C_previous+
duration_previous+stability+planfreq, data=trainex2, family=binomial)
glm.probs <- predict(glm.fit, newdata=testsub, type="response")
glm.pred <- ifelse(glm.probs>0.9, "Yes", "No")
# update changed variable on testsub to reflect prediction
testsub$changed <- as.factor(glm.pred)
# for records predicting changed, start by predicting plan, then modify each one
testsub$planpred <- testsub$plan
testsub$planpred[testsub$customer_ID=="10006040"] <- "1033021"
testsub$planpred[testsub$customer_ID=="10011049"] <- "1133112"
testsub$planpred[testsub$customer_ID=="10026297"] <- "0011002"
testsub$planpred[testsub$customer_ID=="10027789"] <- "0012002"
testsub$planpred[testsub$customer_ID=="10054016"] <- "0011002"
testsub$planpred[testsub$customer_ID=="10054825"] <- "0011001"
testsub$planpred[testsub$customer_ID=="10068571"] <- "0011002"
testsub$planpred[testsub$customer_ID=="10113734"] <- "2022032"
testsub$planpred[testsub$customer_ID=="10116125"] <- "0012002"
# submit
pred <- data.frame(customer_ID = testsub$customer_ID, plan = testsub$planpred)
write.csv(pred, file="submit11.csv", row.names=FALSE, quote=FALSE)
# 9 changes, 3 affect leaderboard
# 0.53793 means 0 below baseline
# PART 10:
# calculate change likelihood of plans
rec <- train %.% group_by(plan) %.% summarise(planpur=mean(record_type),
plancnt=n()) %.% arrange(planpur, desc(plancnt))
# function to "fix" plans based upon thresholds
fixplans <- function(planpurmax, plancntmin, commonmin) {
# make list of fixes
rectop <- rec[rec$planpur<=planpurmax & rec$plancnt>=plancntmin, "plan"]
rectopbest <- vector(mode="character", length=length(rectop))
rectopcommon <- vector(mode="numeric", length=length(rectop))
for (i in 1:length(rectop)) {
# vector of unique customers that looked at that plan
cust <- unique(train[train$plan==rectop[i], "customer_ID"])
# what are all the plans that those customers purchased?
purplan <- train[train$customer_ID %in% cust & train$record_type==1,
"plan"]
# what was the most common purchased?
rectopbest[i] <- names(sort(table(purplan), decreasing=TRUE))[1]
# how common was it?
rectopcommon[i] <- sort(table(purplan),
decreasing=TRUE)[1]/length(purplan)
}
fixes <- data.frame(old=rectop[rectopcommon>=commonmin],
new=rectopbest[rectopcommon>=commonmin], stringsAsFactors=FALSE)
fixes <- fixes[fixes$old!=fixes$new, ]
print(nrow(fixes))
print(fixes)
# reset testsub, and check how many fixes will be made
testsub <- test[!duplicated(test$customer_ID, fromLast=TRUE), ]
testsub$planpred <- testsub$plan
print(sum(testsub$planpred %in% fixes$old))
# make fixes
nrowtestsub <- nrow(testsub)
for (i in 1:nrowtestsub) {
if (testsub$planpred[i] %in% fixes$old) {
testsub$planpred[i] <-
fixes$new[which(fixes$old==testsub$planpred[i])]
}
}
return(testsub)
}
# submission that had my best PUBLIC leaderboard score
# 5 plans, 322 fixes
# public score: 0.53853, above baseline by 10 picks
# private score: 0.53266, below baseline by 1 pick
ts <- fixplans(0.05, 70, 0.05)
pred <- data.frame(customer_ID = ts$customer_ID, plan = ts$planpred)
write.csv(pred, file="submit34.csv", row.names=FALSE, quote=FALSE)
# submission that had my best PRIVATE leaderboard score
# 2 plans, 305 fixes
# public score: 0.53847, above baseline by 9 picks
# private score: 0.53277, above baseline by 3 picks
ts <- fixplans(0.05, 500, 0.05)
pred <- data.frame(customer_ID = ts$customer_ID, plan = ts$planpred)
write.csv(pred, file="submit25.csv", row.names=FALSE, quote=FALSE)
# PART 11:
# predict change (for 12403 people, 100+ plans, 98 fixes)
glm.fit <- glm(changed ~ state+cost+A+C+D+E+F+G+age_oldest+age_youngest+
car_value+car_age+shopping_pt+timeofday+weekend+risk_factor+C_previous+
duration_previous+stability+planfreq, data=trainex2, family=binomial)
glm.probs <- predict(glm.fit, newdata=testsub, type="response")
glm.pred <- ifelse(glm.probs>0.5, "Yes", "No")
table(glm.pred)
ts <- fixplans(0.2, 10, 0.05)
ts$changed <- as.factor(glm.pred)
nrow(ts[ts$planpred!=ts$plan & ts$changed=="Yes", ])
ts$planpred <- ifelse(ts$changed=="No", ts$plan, ts$planpred)
nrow(ts[ts$planpred!=ts$plan,])
# submit: 0.53787, 1 below baseline
pred <- data.frame(customer_ID = ts$customer_ID, plan = ts$planpred)
write.csv(pred, file="submit24.csv", row.names=FALSE, quote=FALSE)
Jump to Line
Something went wrong with that request. Please try again.