In [None]:
# TBD - update to new features format
#
# Setup cell: load the mining libraries, define the run parameters, and read
# the prepared feature table into a data frame whose columns are all factors.

library(data.table)
library(arules)
library(arulesViz)

# Input location: <DATA_DIR_NAME><DATA_SET_NAME>_features.csv
DATA_DIR_NAME <- '/Users/karenblakemore/merck/data/'
DATA_SET_NAME <- 'tmdb_5000_movies_prepped'

# Minimum confidence used when inducing rules from the frequent itemsets.
confidence <- 0.01

# For association rules
name_suffix <- '_association_rules.csv'
maxlength <- 10    # maximum itemset length passed to eclat()
support <- 0.0005  # minimum support passed to eclat()

# NOTE(review): this silences every warning for the rest of the session,
# including any raised while reading or coercing the data. Consider removing
# it, or restoring the previous value once the noisy step has run.
options(warn = -1)

# Empty cells are read as NA; check.names = FALSE keeps the column names
# exactly as they appear in the file instead of making them syntactic.
df <- read.csv(
  paste0(DATA_DIR_NAME, DATA_SET_NAME, "_features.csv"),
  na.strings = "",
  check.names = FALSE
)

head(df)

df[] <- lapply(df, factor) # All columns must be of type factor

N <- nrow(df)             # save number of rows for results data set

head(df)

# Confirm the conversion: every column should now report class "factor".
# vapply() keeps the result a character vector even for a zero-column frame.
vapply(df, class, character(1))

In [None]:
# Coerce the all-factor data frame into an arules "transactions" object,
# then preview the first few transactions and print an overall summary.
trans <- as(df, "transactions")
inspect(
  head(trans)
)
summary(trans)

In [None]:
# Mine frequent itemsets with Eclat, bounded by the minimum support and the
# maximum itemset length configured in the setup cell.
itemsets <- eclat(
  trans,
  parameter = list(supp = support, maxlen = maxlength)
)

# Show the leading itemsets after sorting by support, then the summary.
inspect(
  head(
    sort(itemsets, by = "support")
  )
)
summary(itemsets)

In [None]:
# Keep only the itemsets that satisfy all three conditions:
#   * contain some item whose label matches "revenue=" (partial match),
#   * contain the exact item "budget=[0, 38000000)",
#   * do NOT contain the exact item "revenue=[0, 278796508)".
items.subsets <- subset(
  itemsets,
  subset = items %pin% "revenue=" &
    items %in% "budget=[0, 38000000)" &
    !(items %in% "revenue=[0, 278796508)")
)

# Show the leading surviving itemsets by support, then the summary.
inspect(
  head(
    sort(items.subsets, by = "support")
  )
)
summary(items.subsets)

In [None]:
# Induce association rules from the filtered itemsets against the full
# transaction set, keeping rules that reach the configured minimum
# confidence; verbose = TRUE prints progress while the rules are generated.
rules <- ruleInduction(
  items.subsets,
  trans,
  confidence = confidence,
  control = list(verbose = TRUE)
)

In [None]:
# Discard weak rules: only rules whose lift exceeds 1.1 are retained.
rules <- subset(rules, subset = lift > 1.1)

# Show the leading rules after sorting by lift, then the summary.
inspect(
  head(
    sort(rules, by = "lift")
  )
)
summary(rules)

In [None]:
# Narrow to rules whose right-hand side (consequent) contains an item whose
# label matches "revenue=" (partial match).
rules.revenue <- subset(
  rules,
  subset = rhs %pin% "revenue="
)

# Show the leading revenue rules after sorting by lift, then the summary.
inspect(
  head(
    sort(rules.revenue, by = "lift")
  )
)
summary(rules.revenue)

In [None]:
# Interactive Data Table
# Build an interactive table of the revenue rules, write it to "rules.html"
# in the working directory (supporting files are kept alongside rather than
# embedded, because selfcontained = FALSE), and open it in the browser.
p <- inspectDT(rules.revenue)
htmlwidgets::saveWidget(
  widget = p,
  file = "rules.html",
  selfcontained = FALSE
)
browseURL("rules.html")