In [1]:
suppressPackageStartupMessages(library(readxl))    # free data from excel hades
suppressPackageStartupMessages(library(dplyr))     # sane data manipulation
suppressPackageStartupMessages(library(tidyr))     # sane data munging
suppressPackageStartupMessages(library(viridis))   # sane colors
suppressPackageStartupMessages(library(ggplot2))   # needs no introduction
suppressPackageStartupMessages(library(ggfortify)) # super-helpful for plotting non-"standard" stats objects


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: grid
Loading required package: scales
Loading required package: proto


In [None]:
# Fetch the wine-offers workbook once; later runs reuse the local copy that
# sits in the working directory under the URL's file name.
url <- "http://blog.yhathq.com/static/misc/data/WineKMC.xlsx"
fil <- basename(url)
if (!file.exists(fil)) {
  # An .xlsx workbook is a binary (zip) file. Without mode = "wb",
  # download.file() writes it in text mode on Windows, which can corrupt it.
  download.file(url, fil, mode = "wb")
}

In [None]:
# Sheet 1 of the workbook holds the offers; swap its headers for
# snake_case names that are easy to reference in later cells.
offer_cols <- c(
  "offer_id", "campaign", "varietal", "min_qty",
  "discount", "origin", "past_peak"
)
offers <- setNames(read_excel(fil, sheet = 1), offer_cols)
head(offers)

In [None]:
# Sheet 2 of the workbook holds one row per (customer, offer) response.
transactions <- setNames(
  read_excel(fil, sheet = 2),
  c("customer_name", "offer_id")
)
# Constant weight column: each row counts as a single response when the
# transactions are tallied with count(..., wt = n) later on.
transactions[["n"]] <- 1
head(transactions)

In [None]:
# Build the customer-by-offer matrix used for clustering: one row per
# customer, one column per offer_id, each cell holding the number of times
# that customer responded to that offer (0 when they never did).
dat <- offers %>%
  # start from the offers table so every offer gets a column, even one that
  # nobody responded to
  left_join(transactions, by = "offer_id") %>%
  # get the number of times each customer responded to a given offer
  count(customer_name, offer_id, wt = n) %>%
  # change it from long to wide; fill = 0 zeroes the customer/offer pairs that
  # never occurred, so no separate NA-replacement pass over the columns is
  # needed afterwards
  spread(offer_id, n, fill = 0) %>%
  # an offer without any transaction joins to an NA customer_name; that
  # placeholder row is not a real customer and must not be clustered
  filter(!is.na(customer_name))

In [None]:
# Partition the customers into 5 clusters on their offer-response counts
# (the first column of `dat` is the customer name, so it is dropped).
#
# kmeans() starts from randomly chosen centres, so cluster numbering differs
# from run to run. Later cells single out cluster 4 by number, so the seed is
# fixed to keep those labels reproducible, and several random starts are used
# so the reported partition is the best of many rather than one lucky draw.
set.seed(1492)
fit <- kmeans(dat[, -1], centers = 5, iter.max = 1000, nstart = 25)

# Cluster sizes, as a table and as a bar chart.
cluster_sizes <- table(fit$cluster)
cluster_sizes
barplot(cluster_sizes, col = "maroon")

In [None]:
# Project the customer/offer matrix onto its principal components and plot
# the first two, filling each point by its k-means cluster.
pca <- prcomp(dat[, -1])
pca_dat <- fortify(pca) %>%
  mutate(col = fit$cluster)

ggplot(pca_dat, aes(x = PC1, y = PC2)) +
  geom_point(
    aes(fill = factor(col)),
    shape = 21,          # filled circle: `fill` is the interior, `colour` the rim
    colour = "#7f7f7f",
    size = 3
  ) +
  scale_fill_viridis(name = "Cluster", discrete = TRUE) +
  theme_bw(base_family = "Helvetica")

In [None]:
# Same clustering drawn through ggfortify's autoplot, with a frame around
# each cluster (frame.type "norm").
autoplot(
  fit,
  data = dat[, -1],
  frame = TRUE,
  frame.type = "norm"
)

In [None]:
# Label every transaction with the cluster of the customer who made it and
# with the details of the offer it responded to.
# The lookup is a plain data.frame (instead of the deprecated data_frame()),
# and both joins name their key explicitly so they cannot silently pick up
# some other shared column.
cluster_lookup <- data.frame(
  customer_name = dat$customer_name,
  cluster = fit$cluster,
  row.names = NULL,
  stringsAsFactors = FALSE
)

customer_clusters <- transactions %>%
  left_join(cluster_lookup, by = "customer_name") %>%
  left_join(offers, by = "offer_id")

# Count transactions per varietal, split by whether the customer belongs to
# cluster 4 or to any other cluster.
varietal_4 <- customer_clusters %>%
  mutate(is_4 = (cluster == 4)) %>%
  count(is_4, varietal)

varietal_4

In [None]:
# Display `varietal_4` again: transaction counts per varietal, split by the
# `is_4` flag (cluster 4 vs. every other cluster).
varietal_4

In [None]:
# Compare cluster 4 with everyone else on the mean minimum quantity and the
# mean discount of the offers they responded to.
# Plain summarise() replaces the deprecated summarise_each(funs(mean), ...);
# the output columns keep the names min_qty and discount.
mean_4 <- customer_clusters %>%
  mutate(is_4 = (cluster == 4)) %>%
  group_by(is_4) %>%
  summarise(
    min_qty = mean(min_qty),
    discount = mean(discount)
  )

mean_4