In [None]:
# Change to new features format

library(data.table)
library(arules)
library(arulesViz)

# ---- Input / output locations ----
DATA_DIR_NAME <- "/Users/karenblakemore/merck/data/"
DATA_SET_NAME <- "tmdb_5000_movies_horror"

# ---- Mining thresholds ----
confidence <- 0.5

# For association rules
name_suffix <- "_association_rules.csv"
maxlength <- 4
support <- 0.0019

# A negative `warn` level makes R ignore every warning from here on.
options(warn = -1)

# Read "<dir><data set>_features.csv"; empty strings become NA and the column
# names are kept exactly as they appear in the file.
df <- read.csv(
  paste0(DATA_DIR_NAME, DATA_SET_NAME, "_features.csv"),
  na.strings = "",
  check.names = FALSE
)

head(df)

# All columns must be of type factor
df[] <- lapply(df, factor)

# Save number of rows for the results data set
N <- nrow(df)

head(df)

sapply(df, class)

In [None]:
# Coerce the all-factor data frame to an arules "transactions" object, then
# show its first entries and an overall summary.
trans <- as(object = df, Class = "transactions")
inspect(head(trans))
summary(trans)

In [None]:
# Mine frequent itemsets with eclat, using the configured minimum support and
# maximum itemset length.
itemsets <- eclat(
  trans,
  parameter = list(support = support, maxlen = maxlength)
)

# Show the leading itemsets when sorted by support, then a summary.
inspect(head(sort(itemsets, by = "support")))
summary(itemsets)

In [None]:
# Keep only the itemsets that contain at least one item whose label partially
# matches "budget="; these are the itemsets later passed to rule induction.
#
# A previous `items %pin% "revenue="` subset was assigned to this same variable
# and immediately overwritten, so it never influenced the result; that dead
# assignment has been dropped. Rules are restricted to a "revenue=" RHS further
# down, when `rules.revenue` is built.
# NOTE(review): if the intent was to require both items up front, use
# `items %pin% "revenue=" & items %pin% "budget="` instead - confirm.
items.subsets <- subset(itemsets, subset = items %pin% "budget=")
inspect(head(sort(items.subsets, by = "support")))
summary(items.subsets)

In [None]:
# Induce rules from the selected itemsets against the transactions, keeping
# those that reach the configured minimum confidence (verbose progress output).
rules <- ruleInduction(
  items.subsets,
  trans,
  confidence = confidence,
  control = list(verbose = TRUE)
)

In [None]:
# Drop every rule whose lift is not above 1.1, then show the leading rules
# when sorted by lift together with a summary of what remains.
rules <- subset(rules, subset = lift > 1.1)
inspect(head(sort(rules, by = "lift")))
summary(rules)

In [None]:
# Re-display the leading rules by lift and the summary of `rules`.
inspect(head(sort(rules, by = "lift")))
summary(rules)

In [None]:
# Select rules whose LHS contains the exact item "budget=[0, 20000000)" and
# whose RHS contains an item partially matching "revenue=".
rules.revenue <- subset(
  rules,
  subset = lhs %in% "budget=[0, 20000000)" & rhs %pin% "revenue="
)
inspect(head(sort(rules.revenue, by = "lift")))
summary(rules.revenue)

In [None]:
# Interactive data table of the revenue rules: build the widget, save it as
# "rules.html" (not self-contained) and open that file in the browser.
p <- inspectDT(rules.revenue)
htmlwidgets::saveWidget(
  p,
  "rules.html",
  selfcontained = FALSE
)
browseURL("rules.html")

In [None]:
# Item frequency plot of the transactions, using a support cut-off of 0.25
# and slightly reduced label size.
itemFrequencyPlot(trans, support = 0.25, cex.names = 0.8)

In [None]:
# Default plot of the revenue rules, shaded by confidence.
plot(rules.revenue, shading = "confidence")

In [None]:
# Same plot rendered with the htmlwidget engine, shaded by confidence.
plot(rules.revenue, engine = "htmlwidget", shading = "confidence")

In [None]:
# Matrix-method plot of the revenue rules (htmlwidget engine), shaded by
# confidence.
plot(
  rules.revenue,
  method = "matrix",
  engine = "htmlwidget",
  shading = "confidence"
)

In [None]:
# Graph-method plot of the revenue rules (htmlwidget engine), with `max = 100`.
plot(
  rules.revenue,
  method = "graph",
  engine = "htmlwidget",
  max = 100
)

In [None]:
# Flatten the revenue rules into a plain data frame and write it to CSV:
# separate each rule into LHS & RHS strings, add the number of LHS items, the
# rule count and the data set size, then sort in descending order by
# (lift, confidence, support).
rules_df <- as(rules.revenue, "data.frame")
head(rules_df)

# One character vector of item labels per rule.
lhs <- as(lhs(rules.revenue), "list")

# lengths()/vapply() always return an integer/character vector, even when
# there are no rules at all; sapply() would return an empty list in that case
# and silently create list columns.
rules_df$length_LHS <- lengths(lhs)
rules_df$LHS <- vapply(lhs, paste0, character(1), collapse = ";")

rules_df$RHS <- vapply(
  as(rhs(rules.revenue), "list"),
  paste0,
  character(1),
  collapse = ";"
)

# The combined rule string is redundant once LHS and RHS are separate columns.
rules_df$rules <- NULL

# The measure name is passed positionally: the second argument was called
# `method` in older arules releases and is called `measure` in newer ones.
rules_df$count <- interestMeasure(rules.revenue, "count", transactions = trans)
rules_df$N <- rep(N, nrow(rules_df))

#rules_df$culster_gupta <- assign

rules_df <- rules_df[with(rules_df, order(-lift, -confidence, -support)), ]

head(rules_df)

# Output file: "<dir><data set><suffix>".
CSV_FILE <- paste0(DATA_DIR_NAME, DATA_SET_NAME, name_suffix)
print(CSV_FILE)
write.table(rules_df, file = CSV_FILE, row.names = FALSE, sep = ",")
