# HW 01
#### Name: Joanie Gannon
#### Name: Jake Schaeffer
#### Class: CSCI 349 - Intro to Data Mining
#### Semester: Spring 2020
#### Instructor: Brian King

In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules

# Phase I

The ratings file is a log of movies their customers have watched. For the first phase of the project, you can keep the
problem simple. Ignore the actual numeric rating and timestamp variables, and convert the ratings file into a set of
transactions, where universe of all possible items are movies. Then, each row is a customer, the items are actual
movies they watched. Your objective is to output a set of the strongest, most interesting association rules you
can. Try to generate at least 10-20 rules. A strong association rule can be interpreted as a potential
recommendation. Your rules must contain actual movie names, and not movie ids!

----
### Process:

- 1) Start by reading in cvs for ratings and movies
- 2) Looking at the data frame for ratings, we want to strip our ratings and timeestamps
- 3) Replace movieId column with corresponding movie title
- 4) Binarize data and generate rules

In [3]:
df_ratings = pd.read_csv("../data/ml-latest-small/ratings.csv")
df_movies = pd.read_csv("../data/ml-latest-small/movies.csv")
df_ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [4]:
#Drop unneeded data
df_ratings = df_ratings.drop(columns = ['rating','timestamp'], errors = "ignore")
df_movies.index = df_movies.movieId
df_movies = df_movies.drop(columns = ['genres', 'movieId'], errors = "ignore")

In [5]:
#Map movieId with title, drop unneed data
df_titles = df_ratings.merge(right = df_movies, right_on = 'movieId', left_on = 'movieId')
df_titles = df_titles.sort_values(['userId','movieId'])
df_titles = df_titles.reset_index()
df_titles = df_titles.drop(columns = ['movieId','index'], errors = "ignore")
df_titles

Unnamed: 0,userId,title
0,1,Toy Story (1995)
1,1,Grumpier Old Men (1995)
2,1,Heat (1995)
3,1,Seven (a.k.a. Se7en) (1995)
4,1,"Usual Suspects, The (1995)"
...,...,...
100831,610,Split (2017)
100832,610,John Wick: Chapter Two (2017)
100833,610,Get Out (2017)
100834,610,Logan (2017)


In [6]:
#Let's make the title a categorical
title_cat = pd.Categorical(df_titles.title)
df_titles.title = title_cat
df_titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 2 columns):
userId    100836 non-null int64
title     100836 non-null category
dtypes: category(1), int64(1)
memory usage: 1.3 MB


In [7]:
#Now we binarize the data
df_movies_binarized = pd.get_dummies(data = df_titles.title)
df_movies_binarized = df_movies_binarized.set_index(df_titles.userId)
df_movies_binarized = df_movies_binarized.groupby("userId").max()
df_movies_binarized

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
607,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
608,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
609,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
#Generate supports using fpgrowth for computational speed
fp_support = fpgrowth(df_movies_binarized, min_support=0.3, use_colnames=True)
fp_support

Unnamed: 0,support,itemsets
0,0.539344,(Forrest Gump (1994))
1,0.503279,(Pulp Fiction (1994))
2,0.457377,"(Silence of the Lambs, The (1991))"
3,0.455738,"(Matrix, The (1999))"
4,0.411475,(Star Wars: Episode IV - A New Hope (1977))
5,0.390164,(Jurassic Park (1993))
6,0.388525,(Braveheart (1995))
7,0.360656,(Schindler's List (1993))
8,0.357377,(Fight Club (1999))
9,0.352459,(Toy Story (1995))


In [9]:
#Generated rule
ars = association_rules(fp_support, metric = "lift", min_threshold=1.3)
ars = ars.sort_values(by = "confidence", ascending = False)
ars

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
16,(Star Wars: Episode V - The Empire Strikes Bac...,(Star Wars: Episode IV - A New Hope (1977)),0.345902,0.411475,0.311475,0.900474,2.188403,0.169145,5.913271
13,(Jurassic Park (1993)),(Forrest Gump (1994)),0.390164,0.539344,0.32459,0.831933,1.542489,0.114157,2.740902
14,(Braveheart (1995)),(Forrest Gump (1994)),0.388525,0.539344,0.3,0.772152,1.431649,0.090451,2.021767
17,(Star Wars: Episode IV - A New Hope (1977)),(Star Wars: Episode V - The Empire Strikes Bac...,0.411475,0.345902,0.311475,0.756972,2.188403,0.169145,2.691454
0,(Pulp Fiction (1994)),(Forrest Gump (1994)),0.503279,0.539344,0.377049,0.749186,1.389068,0.105609,1.83664
4,"(Silence of the Lambs, The (1991))",(Pulp Fiction (1994)),0.457377,0.503279,0.339344,0.741935,1.474204,0.109156,1.924795
10,(Star Wars: Episode IV - A New Hope (1977)),"(Matrix, The (1999))",0.411475,0.455738,0.3,0.729084,1.599788,0.112475,2.008968
18,"(Shawshank Redemption, The (1994))",(Forrest Gump (1994)),0.519672,0.539344,0.378689,0.728707,1.351097,0.098406,1.697998
3,(Pulp Fiction (1994)),"(Shawshank Redemption, The (1994))",0.503279,0.519672,0.363934,0.723127,1.391506,0.102395,1.734831
8,"(Silence of the Lambs, The (1991))","(Shawshank Redemption, The (1994))",0.457377,0.519672,0.32623,0.713262,1.372522,0.088543,1.675143


In [10]:
#Print rules
for i in range(len(ars)): 
    antecedents = list(ars.iloc[i].antecedents)
    consequents = list(ars.iloc[i].consequents)
    #print(ars.iloc[i].consequents)
    print("{}   ->   {}".format(antecedents, consequents))

['Star Wars: Episode V - The Empire Strikes Back (1980)']   ->   ['Star Wars: Episode IV - A New Hope (1977)']
['Jurassic Park (1993)']   ->   ['Forrest Gump (1994)']
['Braveheart (1995)']   ->   ['Forrest Gump (1994)']
['Star Wars: Episode IV - A New Hope (1977)']   ->   ['Star Wars: Episode V - The Empire Strikes Back (1980)']
['Pulp Fiction (1994)']   ->   ['Forrest Gump (1994)']
['Silence of the Lambs, The (1991)']   ->   ['Pulp Fiction (1994)']
['Star Wars: Episode IV - A New Hope (1977)']   ->   ['Matrix, The (1999)']
['Shawshank Redemption, The (1994)']   ->   ['Forrest Gump (1994)']
['Pulp Fiction (1994)']   ->   ['Shawshank Redemption, The (1994)']
['Silence of the Lambs, The (1991)']   ->   ['Shawshank Redemption, The (1994)']
['Silence of the Lambs, The (1991)']   ->   ['Forrest Gump (1994)']
['Forrest Gump (1994)']   ->   ['Shawshank Redemption, The (1994)']
['Shawshank Redemption, The (1994)']   ->   ['Pulp Fiction (1994)']
['Forrest Gump (1994)']   ->   ['Pulp Fiction (19

---
### Discuss Finding Phase I

We find that several popular movies such as Jurrasic Park, Forrest Gump, Bravehart, ect. imply that other popular movies in the same category are viewed as well. The most interesting finding is that Star Wars V and IV are often watched together, and StarWars and The Matrix are also watched together frequently


# Phase II - Genre

The client is interested in a restricted set of rules for specific genres. For this task, demonstrate your skill by
selecting a genre of your own choosing. Select the subset of movies that match that genre, and rerun your rule
generation algorithm. For example, if the genre is "Comedy", then all ratings of movies that have Comedy in the
genre list should be selected. Run your algorithm on that subset, and generate a small set of strong rules. REPEAT
THIS FOR THREE DIFFERENT GENRES OF YOUR OWN CHOOSING.
Discuss – is this a better method than considering all movies? Or worse?


-----

### Process:

- 1) Choose a genre
- 2) Filter movies by chosen genre
- 3) Rerun phase 1
- 4) Repeat previous steps for 2 other genres

In [11]:
#Let's choose comdey, action, and horror
df_ratings = pd.read_csv("../data/ml-latest-small/ratings.csv")
df_movies = pd.read_csv("../data/ml-latest-small/movies.csv")
df_ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [12]:
#Strip out only comedies
comedy = df_movies[df_movies.genres.str.contains("Comedy")]
comedy

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
6,7,Sabrina (1995),Comedy|Romance
...,...,...,...
9732,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi
9734,193571,Silver Spoon (2014),Comedy|Drama
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy


In [13]:
#Strip out only action
action = df_movies[df_movies.genres.str.contains("Action")]
action

Unnamed: 0,movieId,title,genres
5,6,Heat (1995),Action|Crime|Thriller
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller
14,15,Cutthroat Island (1995),Action|Adventure|Romance
19,20,Money Train (1995),Action|Comedy|Crime|Drama|Thriller
...,...,...,...
9722,189547,Iron Soldier (2010),Action|Sci-Fi
9731,191005,Gintama (2017),Action|Adventure|Comedy|Sci-Fi
9732,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy


In [14]:
#Strip out only Horror
horror = df_movies[df_movies.genres.str.contains("Horror")]
horror

Unnamed: 0,movieId,title,genres
11,12,Dracula: Dead and Loving It (1995),Comedy|Horror
21,22,Copycat (1995),Crime|Drama|Horror|Mystery|Thriller
62,70,From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller
81,92,Mary Reilly (1996),Drama|Horror|Thriller
82,93,Vampire in Brooklyn (1995),Comedy|Horror|Romance
...,...,...,...
9651,180263,The Shining (1997),Drama|Horror|Thriller
9678,183295,Insidious: The Last Key (2018),Horror|Mystery|Thriller
9681,183611,Game Night (2018),Action|Comedy|Crime|Horror
9689,184253,The Cloverfield Paradox (2018),Horror|Mystery|Sci-Fi|Thriller


In [15]:
#Make a function that runs phase 1

def phaseI (df_movies, df_ratings, ms):
    #clean the data
    df_ratings = df_ratings.drop(columns = ['rating','timestamp'], errors = "ignore")
    df_movies.index = df_movies.movieId
    df_movies = df_movies.drop(columns = ['movieId'], errors = "ignore")
    df_titles = df_ratings.merge(right = df_movies, right_on = 'movieId', left_on = 'movieId')
    df_titles = df_titles.sort_values(['userId','movieId'])
    df_titles = df_titles.reset_index()
    df_titles = df_titles.drop(columns = ['movieId','index'], errors = "ignore")
    #Categorical
    title_cat = pd.Categorical(df_titles.title)
    df_titles.title = title_cat
    #Now we binarize the data
    df_movies_binarized = pd.get_dummies(data = df_titles.title)
    df_movies_binarized = df_movies_binarized.set_index(df_titles.userId)
    df_movies_binarized = df_movies_binarized.groupby("userId").max()
    #Generate supports
    fp_support = fpgrowth(df_movies_binarized, min_support=ms, use_colnames=True)
    #Generated rules
    ars = association_rules(fp_support, metric = "lift", min_threshold=1.3)
    ars = ars.sort_values(by = "confidence", ascending = False)
    #Print rules
    for i in range(len(ars)): 
        antecedents = list(ars.iloc[i].antecedents)
        consequents = list(ars.iloc[i].consequents)
        print("{}   ->   {}".format(antecedents, consequents))

In [16]:
print("COMEDY:")
phaseI(comedy, df_ratings, 0.22)

COMEDY:
['Mrs. Doubtfire (1993)']   ->   ['Forrest Gump (1994)']
['Men in Black (a.k.a. MIB) (1997)']   ->   ['Forrest Gump (1994)']
['Fargo (1996)']   ->   ['Pulp Fiction (1994)']
['Back to the Future (1985)']   ->   ['Forrest Gump (1994)']
['True Lies (1994)']   ->   ['Forrest Gump (1994)']
['True Lies (1994)']   ->   ['Pulp Fiction (1994)']
['Aladdin (1992)']   ->   ['Forrest Gump (1994)']
['Pulp Fiction (1994)']   ->   ['Forrest Gump (1994)']
['Toy Story (1995)']   ->   ['Forrest Gump (1994)']
['Forrest Gump (1994)']   ->   ['Pulp Fiction (1994)']
['Toy Story (1995)']   ->   ['Pulp Fiction (1994)']
['Pulp Fiction (1994)']   ->   ['Fargo (1996)']
['Forrest Gump (1994)']   ->   ['Toy Story (1995)']
['Pulp Fiction (1994)']   ->   ['Toy Story (1995)']
['Pulp Fiction (1994)']   ->   ['True Lies (1994)']
['Forrest Gump (1994)']   ->   ['Aladdin (1992)']
['Forrest Gump (1994)']   ->   ['True Lies (1994)']
['Forrest Gump (1994)']   ->   ['Back to the Future (1985)']
['Forrest Gump (1994)']

In [17]:
print("ACTION:")
phaseI(action, df_ratings, 0.265)

ACTION:
['Star Wars: Episode V - The Empire Strikes Back (1980)', 'Matrix, The (1999)']   ->   ['Star Wars: Episode IV - A New Hope (1977)']
['Star Wars: Episode V - The Empire Strikes Back (1980)']   ->   ['Star Wars: Episode IV - A New Hope (1977)']
['Star Wars: Episode VI - Return of the Jedi (1983)']   ->   ['Star Wars: Episode IV - A New Hope (1977)']
['Star Wars: Episode IV - A New Hope (1977)', 'Matrix, The (1999)']   ->   ['Star Wars: Episode V - The Empire Strikes Back (1980)']
['Star Wars: Episode V - The Empire Strikes Back (1980)', 'Star Wars: Episode IV - A New Hope (1977)']   ->   ['Matrix, The (1999)']
['Star Wars: Episode VI - Return of the Jedi (1983)']   ->   ['Star Wars: Episode V - The Empire Strikes Back (1980)']
['Fight Club (1999)']   ->   ['Matrix, The (1999)']
['Star Wars: Episode V - The Empire Strikes Back (1980)']   ->   ['Matrix, The (1999)']
['Star Wars: Episode V - The Empire Strikes Back (1980)']   ->   ['Star Wars: Episode VI - Return of the Jedi (1983)

In [18]:
print("HORROR:")
phaseI(horror, df_ratings, 0.14)

HORROR:
['Sixth Sense, The (1999)', 'Alien (1979)']   ->   ['Silence of the Lambs, The (1991)']
['Aliens (1986)']   ->   ['Alien (1979)']
['Shining, The (1980)']   ->   ['Silence of the Lambs, The (1991)']
['Silence of the Lambs, The (1991)', 'Alien (1979)']   ->   ['Sixth Sense, The (1999)']
['Alien (1979)']   ->   ['Aliens (1986)']
['Interview with the Vampire: The Vampire Chronicles (1994)']   ->   ['Silence of the Lambs, The (1991)']
['Shining, The (1980)']   ->   ['Sixth Sense, The (1999)']
['Alien (1979)']   ->   ['Silence of the Lambs, The (1991)']
['Sixth Sense, The (1999)']   ->   ['Silence of the Lambs, The (1991)']
['Silence of the Lambs, The (1991)', 'Sixth Sense, The (1999)']   ->   ['Alien (1979)']
['Aliens (1986)']   ->   ['Sixth Sense, The (1999)']
['Alien (1979)']   ->   ['Sixth Sense, The (1999)']
['Alien (1979)']   ->   ['Silence of the Lambs, The (1991)', 'Sixth Sense, The (1999)']
['Sixth Sense, The (1999)']   ->   ['Alien (1979)']
['Silence of the Lambs, The (1991

### Discuss Finding Phase II

This splitting of the data seems to be better since the results are less generic. In phase I, all of the top movies were clumped together. Whereas here, it's more likely that you will find movies specific to one's taste. In addition, we found that using lift for our min threshold helped to minimize the number of rules we had and only give us rules that had high correlation.

---

# Phase III – Genre Rules
The client has a bright idea. (Being a good agile developer, you eagerly respond positively, to ensure the client
knows they are valued and part of your team. J ) The client wants to take a more general view of genre. How?
Create a new transaction dataset, where the item universe is now all possible genres, not movies. A transaction for
each customer is then a list of genres collected over all movies they watched. The customer wants to understand
both the general frequent patterns among these data and their support levels. Again, be sure to output a good set
of strong rules. This can help the customer determine what types of movies they should invest in the most based
on current genres most watched. (NOTE: This is going to amount to a very dense dataset, compared to the movies,
and thus will require very different hyperparameters.)

____

### Process:

- 1) Start by reading in cvs for ratings and movies
- 2) Looking at the data frame for ratings, we want to strip our ratings and timeestamps
- 3) Replace movieId column with corresponding movie title
- 4) Create genre dataframe that only contains userIds and their associated genres 
- 5) Binarize data and generate rules

In [19]:
df_ratings = pd.read_csv("../data/ml-latest-small/ratings.csv")
df_movies = pd.read_csv("../data/ml-latest-small/movies.csv")
df_ratings = df_ratings.drop(columns = ['rating','timestamp'], errors = "ignore")

In [20]:
df_titles = df_ratings.merge(right = df_movies, right_on = 'movieId', left_on = 'movieId')
df_titles = df_titles.sort_values(['userId','movieId'])
df_titles = df_titles.drop(["movieId","title"], axis = 1)

In [21]:
# to get genres individual, we must split the string
df_titles.genres = df_titles.genres.str.split("|")

In [22]:
#explode the lists that resulted from the split into seperate rows
#userGenres has userId as index and has genres with all genres they like
userGenres = df_titles.explode("genres").drop_duplicates().set_index("userId")

In [23]:
genre_binarized = pd.get_dummies(data = userGenres.genres).groupby("userId").max()
#we now have a binarized list of genres watched by UserId
#take out IMAX, which is not a genre.
genre_binarized = genre_binarized.drop("IMAX",axis = 1)

In [24]:
#Generate supports
genre_support = fpgrowth(genre_binarized, min_support=0.8, use_colnames=True)
#even with min support of .8, since the dataset is dense, we get 6400 results
genre_rules = association_rules(genre_support, metric = "lift", min_threshold=1.1)
genre_rules = genre_rules.sort_values(by = "confidence", ascending = False)
genre_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
21,"(Action, Animation, Mystery)","(Children, Comedy, Drama, Fantasy, Sci-Fi)",0.827869,0.881967,0.803279,0.970297,1.100151,0.073126,3.97377
36,"(Action, Animation, Adventure, Mystery)","(Children, Comedy, Drama, Fantasy, Sci-Fi)",0.827869,0.881967,0.803279,0.970297,1.100151,0.073126,3.97377
20,"(Action, Animation, Mystery, Drama)","(Children, Sci-Fi, Fantasy, Comedy)",0.827869,0.881967,0.803279,0.970297,1.100151,0.073126,3.97377
32,"(Adventure, Drama, Action, Animation, Mystery)","(Children, Sci-Fi, Fantasy, Comedy)",0.827869,0.881967,0.803279,0.970297,1.100151,0.073126,3.97377
33,"(Comedy, Drama, Action, Animation, Mystery)","(Children, Sci-Fi, Adventure, Fantasy)",0.827869,0.881967,0.803279,0.970297,1.100151,0.073126,3.97377
...,...,...,...,...,...,...,...,...,...
115,"(Children, Sci-Fi, Adventure, Fantasy)","(Action, Animation, Thriller, Mystery, Romance)",0.881967,0.824590,0.800000,0.907063,1.100017,0.072739,1.88741
137,"(Children, Sci-Fi, Fantasy, Comedy)","(Drama, Action, Animation, Thriller, Mystery, ...",0.881967,0.824590,0.800000,0.907063,1.100017,0.072739,1.88741
161,"(Children, Sci-Fi, Fantasy, Comedy)","(Adventure, Action, Animation, Thriller, Myste...",0.881967,0.824590,0.800000,0.907063,1.100017,0.072739,1.88741
135,"(Children, Comedy, Drama, Fantasy, Sci-Fi)","(Action, Animation, Thriller, Mystery, Romance)",0.881967,0.824590,0.800000,0.907063,1.100017,0.072739,1.88741


In [25]:
ars = genre_rules.head(15)
for i in range(len(ars)): 
        antecedents = list(ars.iloc[i].antecedents)
        consequents = list(ars.iloc[i].consequents)
        print("{}   ->   {}".format(antecedents, consequents))

['Action', 'Animation', 'Mystery']   ->   ['Children', 'Comedy', 'Drama', 'Fantasy', 'Sci-Fi']
['Action', 'Animation', 'Adventure', 'Mystery']   ->   ['Children', 'Comedy', 'Drama', 'Fantasy', 'Sci-Fi']
['Action', 'Animation', 'Mystery', 'Drama']   ->   ['Children', 'Sci-Fi', 'Fantasy', 'Comedy']
['Adventure', 'Drama', 'Action', 'Animation', 'Mystery']   ->   ['Children', 'Sci-Fi', 'Fantasy', 'Comedy']
['Comedy', 'Drama', 'Action', 'Animation', 'Mystery']   ->   ['Children', 'Sci-Fi', 'Adventure', 'Fantasy']
['Action', 'Animation', 'Mystery', 'Drama']   ->   ['Children', 'Sci-Fi', 'Adventure', 'Fantasy']
['Action', 'Mystery', 'Animation', 'Comedy']   ->   ['Children', 'Sci-Fi', 'Adventure', 'Fantasy']
['Action', 'Animation', 'Adventure', 'Mystery']   ->   ['Children', 'Sci-Fi', 'Fantasy', 'Comedy']
['Action', 'Animation', 'Mystery']   ->   ['Children', 'Adventure', 'Comedy', 'Fantasy', 'Sci-Fi']
['Action', 'Mystery', 'Animation', 'Comedy']   ->   ['Children', 'Adventure', 'Drama', 'Fan

# Phase IV – Incorporating Additional Variables
Consider how you can use other variables? You have access to the numeric ratings, a unique timestamp for the
rating, the year of the movie, and user-defined tags. Or, consider that, for the previous exercise, you ignored
multiplicity of genres. What else can you do with all of these data? For instance, are there patterns with movie
years? Could you create new items such as "70s", "80s", and so on for the decade of the movie and re-run your
frequent pattern search? Could you combine the decade and the genre? Imagine if you could figure out how to
generate rules that tell the client that people who like 80s movies are likely to watch "Comedy" or "Romance" with
a given confidence level. And of course, what about the ratings!!! Why would you output a rule that contains a
movie only given a rating of a 1 or a 2? You might be able filter these patterns and rules more intelligently!

For this last phase, come up with three different ideas that involve including additional variables in some way, and
implement it. In all three cases, generate a new set of association rules. Depending on what you choose to do here,
it will likely require that you filter rules out that do not meet certain criteria? Or, perhaps you could modify or
rewrite your own variant of the apriori algorithm. You could rewrite apriori just to generate relevant frequent
patterns, and still use mlxtend's association rules package, as long as the format of the data frame that is used as
input into the association rule generation are consistent.

I have no specific requirements here. I want you and your partner to think. Be creative. Put yourself in the client's
shoes. You have a lot of data. How can you leverage it to provide the best possible recommendations for their
customers?

------

## 1) If a user likes a movie, what else do they like?
### Process:

- 1) We'll start by defining a review lower than 3 as a negative review.
- 2) Drop reviews with less than 3 in the review
- 3) Binarize data and generate rules

In [26]:
#Read the data
df_ratings = pd.read_csv("../data/ml-latest-small/ratings.csv")
df_movies = pd.read_csv("../data/ml-latest-small/movies.csv")
df_ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [27]:
#Drop uneeded data
df_ratings = df_ratings.drop(columns = ['timestamp'], errors = "ignore")
df_movies.index = df_movies.movieId
df_movies = df_movies.drop(columns = ['movieId'], errors = "ignore")

In [28]:
#Map movieId with title
df_titles = df_ratings.merge(right = df_movies, right_on = 'movieId', left_on = 'movieId')
df_titles = df_titles.sort_values(['userId','movieId'])
df_titles = df_titles.reset_index()
df_titles = df_titles.drop(columns = ['movieId','index'], errors = "ignore")
df_titles

Unnamed: 0,userId,rating,title,genres
0,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,4.0,Grumpier Old Men (1995),Comedy|Romance
2,1,4.0,Heat (1995),Action|Crime|Thriller
3,1,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...
100831,610,4.0,Split (2017),Drama|Horror|Thriller
100832,610,5.0,John Wick: Chapter Two (2017),Action|Crime|Thriller
100833,610,5.0,Get Out (2017),Horror
100834,610,5.0,Logan (2017),Action|Sci-Fi


In [29]:
#Drop ratings under 3
df_titles = df_titles[df_titles.rating > 3]
df_titles

Unnamed: 0,userId,rating,title,genres
0,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,4.0,Grumpier Old Men (1995),Comedy|Romance
2,1,4.0,Heat (1995),Action|Crime|Thriller
3,1,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...
100830,610,4.0,Rogue One: A Star Wars Story (2016),Action|Adventure|Fantasy|Sci-Fi
100831,610,4.0,Split (2017),Drama|Horror|Thriller
100832,610,5.0,John Wick: Chapter Two (2017),Action|Crime|Thriller
100833,610,5.0,Get Out (2017),Horror


In [30]:
#Let's make the title a categorical
title_cat = pd.Categorical(df_titles.title)
df_titles.title = title_cat
df_titles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61716 entries, 0 to 100834
Data columns (total 4 columns):
userId    61716 non-null int64
rating    61716 non-null float64
title     61716 non-null category
genres    61716 non-null object
dtypes: category(1), float64(1), int64(1), object(1)
memory usage: 2.4+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [31]:
#Now we binarize the data
df_movies_binarized = pd.get_dummies(data = df_titles.title)
df_movies_binarized = df_movies_binarized.set_index(df_titles.userId)
df_movies_binarized = df_movies_binarized.groupby("userId").max()
df_movies_binarized

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),00 Schneider - Jagd auf Nihil Baxter (1994),...,"Zone, The (La Zona) (2007)",Zookeeper (2011),Zoolander (2001),Zootopia (2016),Zulu (1964),[REC] (2007),[REC]² (2009),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
607,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
608,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
609,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
#Generate supports
fp_support = fpgrowth(df_movies_binarized, min_support=0.1, use_colnames=True)
fp_support

Unnamed: 0,support,itemsets
0,0.453202,(Forrest Gump (1994))
1,0.394089,"(Matrix, The (1999))"
2,0.392447,"(Silence of the Lambs, The (1991))"
3,0.361248,(Star Wars: Episode IV - A New Hope (1977))
4,0.321839,(Fight Club (1999))
...,...,...
3243,0.103448,"(Silence of the Lambs, The (1991), Truman Show..."
3244,0.101806,"(Truman Show, The (1998), Saving Private Ryan ..."
3245,0.113300,"(Fight Club (1999), Truman Show, The (1998))"
3246,0.101806,"(Truman Show, The (1998), Lord of the Rings: T..."


In [33]:
#Generated rules
ars = association_rules(fp_support, metric = "confidence", min_threshold=.99)
ars = ars.sort_values(by = "confidence", ascending = False)
ars

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(Indiana Jones and the Last Crusade (1989), Lo...","(Lord of the Rings: The Return of the King, Th...",0.101806,0.262726,0.101806,1.0,3.80625,0.075059,inf
1,"(Lord of the Rings: The Return of the King, Th...",(Lord of the Rings: The Fellowship of the Ring...,0.106732,0.275862,0.106732,1.0,3.625,0.077289,inf
20,"(Godfather: Part II, The (1974), Goodfellas (1...","(Godfather, The (1972))",0.1133,0.270936,0.1133,1.0,3.690909,0.082603,inf
19,"(Godfather: Part II, The (1974), Pulp Fiction ...","(Godfather, The (1972))",0.100164,0.270936,0.100164,1.0,3.690909,0.073026,inf
18,"(Godfather: Part II, The (1974), Silence of th...","(Godfather, The (1972))",0.118227,0.270936,0.118227,1.0,3.690909,0.086195,inf
17,"(Godfather: Part II, The (1974), Pulp Fiction ...","(Godfather, The (1972))",0.10509,0.270936,0.10509,1.0,3.690909,0.076618,inf
16,"(Godfather: Part II, The (1974), Fight Club (1...","(Godfather, The (1972))",0.119869,0.270936,0.119869,1.0,3.690909,0.087392,inf
15,"(Godfather: Part II, The (1974), Pulp Fiction ...","(Godfather, The (1972))",0.101806,0.270936,0.101806,1.0,3.690909,0.074223,inf
14,"(Godfather: Part II, The (1974), Pulp Fiction ...","(Godfather, The (1972))",0.108374,0.270936,0.108374,1.0,3.690909,0.079012,inf
13,"(Godfather: Part II, The (1974), Usual Suspect...","(Godfather, The (1972))",0.124795,0.270936,0.124795,1.0,3.690909,0.090983,inf


In [34]:
#Print rules
for i in range(len(ars)): 
    antecedents = list(ars.iloc[i].antecedents)
    consequents = list(ars.iloc[i].consequents)
    #print(ars.iloc[i].consequents)
    print("{}   ->   {}".format(antecedents, consequents))

['Indiana Jones and the Last Crusade (1989)', 'Lord of the Rings: The Two Towers, The (2002)', 'Lord of the Rings: The Fellowship of the Ring, The (2001)']   ->   ['Lord of the Rings: The Return of the King, The (2003)']
['Lord of the Rings: The Return of the King, The (2003)', 'Lord of the Rings: The Two Towers, The (2002)', 'Inception (2010)']   ->   ['Lord of the Rings: The Fellowship of the Ring, The (2001)']
['Godfather: Part II, The (1974)', 'Goodfellas (1990)']   ->   ['Godfather, The (1972)']
['Godfather: Part II, The (1974)', 'Pulp Fiction (1994)', 'American Beauty (1999)']   ->   ['Godfather, The (1972)']
['Godfather: Part II, The (1974)', 'Silence of the Lambs, The (1991)']   ->   ['Godfather, The (1972)']
['Godfather: Part II, The (1974)', 'Pulp Fiction (1994)', 'Fight Club (1999)']   ->   ['Godfather, The (1972)']
['Godfather: Part II, The (1974)', 'Fight Club (1999)']   ->   ['Godfather, The (1972)']
['Godfather: Part II, The (1974)', 'Pulp Fiction (1994)', 'Matrix, The (

## 2) What genres are most closely related? I.e if you like one genre, what other genres are you likely to view
### Process:

- 1) Start by mapping genre onto each review transaction
- 2) Replace movieId column with corresponding movie title
- 3) Drop low rated movies
- 4) Binarize data and generate rules

In [35]:
df_ratings = pd.read_csv("../data/ml-latest-small/ratings.csv")
df_movies = pd.read_csv("../data/ml-latest-small/movies.csv")
df_ratings = df_ratings.drop(columns = ['timestamp'], errors = "ignore")

In [36]:
df_titles = df_ratings.merge(right = df_movies, right_on = 'movieId', left_on = 'movieId')
df_titles = df_titles.sort_values(['userId','movieId'])
df_titles = df_titles.drop(["movieId","title"], axis = 1)
df_titles

Unnamed: 0,userId,rating,genres
0,1,4.0,Adventure|Animation|Children|Comedy|Fantasy
215,1,4.0,Comedy|Romance
267,1,4.0,Action|Crime|Thriller
369,1,5.0,Mystery|Thriller
572,1,5.0,Crime|Mystery|Thriller
...,...,...,...
81464,610,4.0,Drama|Horror|Thriller
68922,610,5.0,Action|Crime|Thriller
81479,610,5.0,Horror
57554,610,5.0,Action|Sci-Fi


In [37]:
#Drop ratings under 3
df_titles = df_titles[df_titles.rating > 3]
df_titles

Unnamed: 0,userId,rating,genres
0,1,4.0,Adventure|Animation|Children|Comedy|Fantasy
215,1,4.0,Comedy|Romance
267,1,4.0,Action|Crime|Thriller
369,1,5.0,Mystery|Thriller
572,1,5.0,Crime|Mystery|Thriller
...,...,...,...
48424,610,4.0,Action|Adventure|Fantasy|Sci-Fi
81464,610,4.0,Drama|Horror|Thriller
68922,610,5.0,Action|Crime|Thriller
81479,610,5.0,Horror


In [38]:
#get rid of same genre list and userid to minimze size
df_titles = df_titles.drop_duplicates()
df_titles.reset_index(drop = True, inplace = True)

In [39]:
df_titles.genres = df_titles.genres.str.split("|")

In [40]:
#userGenres has userId as index and has genres with all genres they like
userGenres = df_titles.explode("genres").drop_duplicates().set_index("userId")

In [41]:
genre_binarized = pd.get_dummies(data = userGenres.genres).groupby("userId").max()
#we now have a binarized list of genres watched by UserId
genre_binarized = genre_binarized.drop("IMAX",axis = 1)
genre_binarized

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1
2,0,1,1,0,0,1,1,1,1,0,0,0,0,1,1,1,1,1,1
3,0,1,1,0,0,1,0,0,1,1,0,1,0,1,0,1,1,0,0
4,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
5,0,1,1,1,1,1,1,0,1,1,0,0,1,1,1,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
607,0,1,1,1,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1
608,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
609,0,1,1,0,0,1,1,0,1,0,0,1,0,0,1,0,1,1,1


In [41]:
#Generate supports
genre_support = fpgrowth(genre_binarized, min_support=0.8, use_colnames=True)
genre_support

Unnamed: 0,support,itemsets
0,0.995074,(Drama)
1,0.990148,(Comedy)
2,0.983580,(Thriller)
3,0.978654,(Action)
4,0.965517,(Romance)
...,...,...
761,0.806240,"(War, Thriller, Romance, Comedy, Action)"
762,0.806240,"(Thriller, War, Romance, Comedy, Drama, Action)"
763,0.801314,"(Drama, Crime, War, Romance)"
764,0.801314,"(Comedy, Crime, War, Romance)"


In [42]:
#even with min support of .8, since the dataset is dense, we get 6400 results
genre_rules = association_rules(genre_support, metric = "lift", min_threshold=1.062)
genre_rules = genre_rules.sort_values(by = "confidence", ascending = False)
genre_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
4,"(Fantasy, Thriller, Comedy, Drama, Action)","(Sci-Fi, Adventure, Romance)",0.844007,0.896552,0.80624,0.955253,1.065474,0.049544,2.311844
0,"(Fantasy, Drama, Action, Comedy)","(Sci-Fi, Adventure, Romance)",0.848933,0.896552,0.809524,0.953578,1.063607,0.048412,2.228448
2,"(Fantasy, Action, Thriller, Comedy)","(Sci-Fi, Adventure, Romance)",0.845649,0.896552,0.80624,0.953398,1.063406,0.048072,2.219828
5,"(Fantasy, Action, Thriller, Comedy)","(Sci-Fi, Drama, Adventure, Romance)",0.845649,0.896552,0.80624,0.953398,1.063406,0.048072,2.219828
6,"(Fantasy, Drama, Action, Comedy)","(Sci-Fi, Adventure, Thriller, Romance)",0.848933,0.893268,0.80624,0.94971,1.063186,0.047916,2.122332
1,"(Sci-Fi, Adventure, Romance)","(Fantasy, Drama, Action, Comedy)",0.896552,0.848933,0.809524,0.90293,1.063607,0.048412,1.556278
7,"(Sci-Fi, Adventure, Thriller, Romance)","(Fantasy, Drama, Action, Comedy)",0.893268,0.848933,0.80624,0.902574,1.063186,0.047916,1.550578
3,"(Sci-Fi, Adventure, Romance)","(Fantasy, Action, Thriller, Comedy)",0.896552,0.845649,0.80624,0.899267,1.063406,0.048072,1.532288
8,"(Sci-Fi, Drama, Adventure, Romance)","(Fantasy, Action, Thriller, Comedy)",0.896552,0.845649,0.80624,0.899267,1.063406,0.048072,1.532288
9,"(Sci-Fi, Adventure, Romance)","(Fantasy, Thriller, Comedy, Drama, Action)",0.896552,0.844007,0.80624,0.899267,1.065474,0.049544,1.548589


In [43]:
#Print rules
for i in range(len(genre_rules)): 
    antecedents = list(genre_rules.iloc[i].antecedents)
    consequents = list(genre_rules.iloc[i].consequents)
    #print(ars.iloc[i].consequents)
    print("{}   ->   {}".format(antecedents, consequents))

['Fantasy', 'Thriller', 'Comedy', 'Drama', 'Action']   ->   ['Sci-Fi', 'Adventure', 'Romance']
['Fantasy', 'Drama', 'Action', 'Comedy']   ->   ['Sci-Fi', 'Adventure', 'Romance']
['Fantasy', 'Action', 'Thriller', 'Comedy']   ->   ['Sci-Fi', 'Adventure', 'Romance']
['Fantasy', 'Action', 'Thriller', 'Comedy']   ->   ['Sci-Fi', 'Drama', 'Adventure', 'Romance']
['Fantasy', 'Drama', 'Action', 'Comedy']   ->   ['Sci-Fi', 'Adventure', 'Thriller', 'Romance']
['Sci-Fi', 'Adventure', 'Romance']   ->   ['Fantasy', 'Drama', 'Action', 'Comedy']
['Sci-Fi', 'Adventure', 'Thriller', 'Romance']   ->   ['Fantasy', 'Drama', 'Action', 'Comedy']
['Sci-Fi', 'Adventure', 'Romance']   ->   ['Fantasy', 'Action', 'Thriller', 'Comedy']
['Sci-Fi', 'Drama', 'Adventure', 'Romance']   ->   ['Fantasy', 'Action', 'Thriller', 'Comedy']
['Sci-Fi', 'Adventure', 'Romance']   ->   ['Fantasy', 'Thriller', 'Comedy', 'Drama', 'Action']


## 3) What tags frequently occur together?
### Process:

- 1) Start by mapping genre onto each review transaction
- 2) Replace movieId column with corresponding movie title
- 3) Drop low rated movies
- 4) Binarize data and generate rules

In [42]:
#Read the data
df_movies = pd.read_csv("../data/ml-latest-small/movies.csv")
df_tags = pd.read_csv("../data/ml-latest-small/tags.csv")
df_tags.describe()

Unnamed: 0,userId,movieId,timestamp
count,3683.0,3683.0,3683.0
mean,431.149335,27252.013576,1320032000.0
std,158.472553,43490.558803,172102500.0
min,2.0,1.0,1137179000.0
25%,424.0,1262.5,1137521000.0
50%,474.0,4454.0,1269833000.0
75%,477.0,39263.0,1498457000.0
max,610.0,193565.0,1537099000.0


In [43]:
#Drop uneeded data
df_tags = df_tags.drop(columns = ['timestamp','userId'], errors = "ignore")
df_movies = df_movies.drop(columns = ['genres'], errors = "ignore")

In [44]:
#Merge into one dataframe and drop duplicate tags
df_titles = df_tags.merge(right = df_movies, right_on = 'movieId', left_on = 'movieId')
df_titles = df_titles.sort_values(['movieId'])
df_titles = df_titles.drop(["movieId"], axis = 1)
df_titles = df_titles.drop_duplicates(keep = "first")
df_titles

Unnamed: 0,tag,title
1012,fun,Toy Story (1995)
1011,pixar,Toy Story (1995)
65,magic board game,Jumanji (1995)
67,game,Jumanji (1995)
66,Robin Williams,Jumanji (1995)
...,...,...
455,star wars,Solo: A Star Wars Story (2018)
828,remaster,Gintama: The Movie (2010)
826,comedy,Gintama: The Movie (2010)
825,anime,Gintama: The Movie (2010)


In [45]:
#Now we binarize the data
df_movies_binarized = pd.get_dummies(data = df_titles.tag)
df_movies_binarized = df_movies_binarized.set_index(df_titles.title)
df_movies_binarized = df_movies_binarized.groupby("title").max()

In [46]:
#Generate supports
fp_support = fpgrowth(df_movies_binarized, min_support=0.002, use_colnames=True)
fp_support

Unnamed: 0,support,itemsets
0,0.013359,(quirky)
1,0.004453,(romance)
2,0.004453,(intelligent)
3,0.003181,(humorous)
4,0.003181,(lawyers)
...,...,...
283,0.002545,"(philosophy, surreal)"
284,0.002545,"(philosophy, thought-provoking, surreal)"
285,0.002545,"(bittersweet, emotional)"
286,0.003181,"(stylized, dark comedy)"


In [47]:
#Generated rules
ars = association_rules(fp_support, metric = "confidence", min_threshold=.7)
ars = ars.sort_values(by = "confidence", ascending = False)
ars

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
8,(hallucinatory),(surreal),0.003817,0.013359,0.003817,1.0,74.857143,0.003766,inf
7,(existentialism),(atmospheric),0.003181,0.020356,0.003181,1.0,49.125,0.003116,inf
13,"(thought-provoking, surreal)",(philosophy),0.002545,0.003181,0.002545,1.0,314.4,0.002536,inf
12,"(philosophy, surreal)",(thought-provoking),0.002545,0.012087,0.002545,1.0,82.736842,0.002514,inf
11,"(philosophy, thought-provoking)",(surreal),0.002545,0.013359,0.002545,1.0,74.857143,0.002511,inf
1,"(atmospheric, quirky)",(thought-provoking),0.002545,0.012087,0.002545,1.0,82.736842,0.002514,inf
16,(Wizards),(Magic),0.002545,0.003181,0.002545,1.0,314.4,0.002536,inf
6,(artificial intelligence),(robots),0.003181,0.005725,0.003181,1.0,174.666667,0.003162,inf
4,"(imdb top 250, crime)",(quirky),0.002545,0.013359,0.002545,1.0,74.857143,0.002511,inf
3,"(imdb top 250, quirky)",(crime),0.002545,0.012087,0.002545,1.0,82.736842,0.002514,inf


In [48]:
#Print rules
for i in range(len(ars)): 
    antecedents = list(ars.iloc[i].antecedents)
    consequents = list(ars.iloc[i].consequents)
    #print(ars.iloc[i].consequents)
    print("{}   ->   {}".format(antecedents, consequents))

['hallucinatory']   ->   ['surreal']
['existentialism']   ->   ['atmospheric']
['thought-provoking', 'surreal']   ->   ['philosophy']
['philosophy', 'surreal']   ->   ['thought-provoking']
['philosophy', 'thought-provoking']   ->   ['surreal']
['atmospheric', 'quirky']   ->   ['thought-provoking']
['Wizards']   ->   ['Magic']
['artificial intelligence']   ->   ['robots']
['imdb top 250', 'crime']   ->   ['quirky']
['imdb top 250', 'quirky']   ->   ['crime']
['thought-provoking', 'quirky']   ->   ['atmospheric']
['quirky', 'crime']   ->   ['imdb top 250']
['philosophy']   ->   ['thought-provoking']
['philosophy']   ->   ['surreal']
['philosophy']   ->   ['thought-provoking', 'surreal']
['Magic']   ->   ['Wizards']
['atmospheric', 'thought-provoking']   ->   ['quirky']
