In [536]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

## 1. Data Processing
Data Reading and cleaning and merging

### 1.1 Data Read

In [537]:
data_movies = pd.read_csv('movies.csv')
data_ratings = pd.read_csv('ratings.csv')

In [538]:
data_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [539]:
data_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [540]:
data_movies.describe()

Unnamed: 0,movieId
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [541]:
data_ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [542]:
# data_movies.movieId.value_counts(),print('Number of duplicated unique ids are: ',data_movies.movieId.duplicated().sum())

In [543]:
# data_ratings.movieId.value_counts()

### 1.2 Merge

In [544]:
merge = data_movies.merge(data_ratings,on = 'movieId',how = 'inner')

In [545]:
merge

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1537109082
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1537109545
100833,193585,Flint (2017),Drama,184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1537110021


In [546]:
merge.drop(columns=['title','genres', 'timestamp'],inplace=True)

In [547]:
merge

Unnamed: 0,movieId,userId,rating
0,1,1,4.0
1,1,5,4.0
2,1,7,4.5
3,1,15,2.5
4,1,17,4.5
...,...,...,...
100831,193581,184,4.0
100832,193583,184,3.5
100833,193585,184,3.5
100834,193587,184,3.5


In [548]:
cols = ['userId', 'movieId', 'rating']

In [549]:
merge = merge[cols]

In [550]:
merge.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,5,1,4.0
2,7,1,4.5
3,15,1,2.5
4,17,1,4.5


In [551]:
merge.describe()

Unnamed: 0,userId,movieId,rating
count,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557
std,182.618491,35530.987199,1.042529
min,1.0,1.0,0.5
25%,177.0,1199.0,3.0
50%,325.0,2991.0,3.5
75%,477.0,8122.0,4.0
max,610.0,193609.0,5.0


In [552]:
# len(merge.userId.unique()) 

In [553]:
# merge_list = merge.groupby(by = ["userId"])["title"].apply(list).reset_index()

In [554]:
# merge_list.head()

### 1.3 reaname dataframe cols

In [555]:
merge = merge.rename(columns={'userId': 'UserID', 'movieId':'MovieID', 'rating':'Rating'})

In [556]:
merge.head()

Unnamed: 0,UserID,MovieID,Rating
0,1,1,4.0
1,5,1,4.0
2,7,1,4.5
3,15,1,2.5
4,17,1,4.5


### 1.4 Data Visualization

In [620]:
import sweetviz as sviz
import datetime

In [625]:
analyze_report = sviz.analyze(merge)
now = datetime.datetime.now()
f_name = f'datasetviz/dataset-1_{now.year}_{now.month}_{now.day}_{now.hour}_{now.minute}_{now.second}.html'
analyze_report.show_html(f_name, open_browser=False)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, layout=Layout(flex='2'), max=4.0), HTML(value='')), la…


Report datasetviz/dataset1_2021_8_2_3_6_16.html was generated.


### 1.4 Data Selection - Define hyperparameters
such as min_movie_rating, no_of_data_row

In [557]:

## Min Movie Rating range [0, 5]
min_movie_rating = 3

## No_of_data_row == userID
no_of_data_row = 30

In [558]:
all_ratings = merge.copy(deep=True)

In [559]:
all_ratings.head()

Unnamed: 0,UserID,MovieID,Rating
0,1,1,4.0
1,5,1,4.0
2,7,1,4.5
3,15,1,2.5
4,17,1,4.5


In [560]:
## Choosing >3 ratings only, as 'Favorable Ratings'
all_ratings["Favorable"] = all_ratings["Rating"] > min_movie_rating

In [561]:
## for how many users? This is a point to modify dataset size

ratings = all_ratings[all_ratings['UserID'].isin(range(no_of_data_row))]

In [562]:
favorable_ratings = ratings[ratings["Favorable"]]

In [563]:
print("all_ratings len: ", len(all_ratings))
print("ratings len: ", len(ratings))
print("favorable_ratings len: ", len(favorable_ratings))

all_ratings len:  100836
ratings len:  4845
favorable_ratings len:  2695


### 1.5 All Movies Group by User -- Itemset
movie ids grouped by userid. this is the itemset

In [564]:
favorable_reviews_by_users = dict((k, frozenset(v.values)) for k, v in favorable_ratings.groupby("UserID")["MovieID"])

In [565]:
len(favorable_reviews_by_users)

29

In [566]:
favorable_reviews_by_users

{1: frozenset({1,
            3,
            6,
            47,
            50,
            101,
            110,
            151,
            157,
            163,
            216,
            231,
            235,
            260,
            333,
            349,
            356,
            362,
            367,
            441,
            457,
            480,
            527,
            543,
            552,
            553,
            590,
            592,
            593,
            596,
            608,
            661,
            733,
            804,
            919,
            923,
            940,
            943,
            954,
            1023,
            1024,
            1025,
            1029,
            1031,
            1032,
            1042,
            1049,
            1060,
            1073,
            1080,
            1089,
            1090,
            1092,
            1097,
            1127,
            1136,
            1196,
            1197,


In [567]:
data_list = [list(v.values) for k, v in favorable_ratings.groupby("UserID")["MovieID"] ]

In [568]:
data_tuple = [tuple(v.values) for k, v in favorable_ratings.groupby("UserID")["MovieID"] ]

In [569]:
print(data_list)
print(data_tuple)

[[1, 3, 6, 47, 50, 101, 110, 151, 157, 163, 216, 231, 235, 260, 333, 349, 356, 362, 367, 441, 457, 480, 527, 543, 552, 553, 590, 592, 593, 596, 608, 661, 733, 804, 919, 923, 940, 943, 954, 1023, 1024, 1025, 1029, 1031, 1032, 1042, 1049, 1060, 1073, 1080, 1089, 1090, 1092, 1097, 1127, 1136, 1196, 1197, 1198, 1206, 1208, 1210, 1213, 1214, 1220, 1222, 1224, 1226, 1240, 1256, 1265, 1270, 1275, 1278, 1282, 1291, 1298, 1348, 1473, 1500, 1517, 1552, 1573, 1587, 1617, 1620, 1625, 1732, 1777, 1793, 1804, 1805, 1920, 1927, 1954, 1967, 2000, 2005, 2012, 2018, 2028, 2033, 2046, 2048, 2054, 2058, 2078, 2090, 2094, 2096, 2099, 2105, 2115, 2116, 2137, 2139, 2141, 2143, 2161, 2174, 2193, 2268, 2273, 2291, 2329, 2353, 2366, 2387, 2395, 2406, 2427, 2450, 2459, 2470, 2478, 2492, 2502, 2529, 2542, 2571, 2580, 2596, 2616, 2628, 2640, 2641, 2644, 2648, 2654, 2692, 2700, 2716, 2761, 2797, 2826, 2858, 2872, 2899, 2916, 2944, 2947, 2948, 2949, 2959, 2985, 2987, 2991, 2993, 2997, 3033, 3034, 3052, 3053, 3062, 3

### 1.6 Getting Movie Names from MovieID

In [591]:
movie_name_filename = 'movies.csv'
movie_name_data = pd.read_csv(movie_name_filename, encoding = "mac-roman")
movie_name_data.drop(columns=['genres'], inplace=True)
movie_name_data.rename(columns={'movieId':'MovieID', 'title': 'Title'}, inplace=True)


In [592]:
movie_name_data.head()

Unnamed: 0,MovieID,Title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [593]:
def get_movie_name(movie_id):
    title_object = movie_name_data[movie_name_data["MovieID"] == movie_id]["Title"]
    title = title_object.values[0]
    return title

In [594]:
get_movie_name(4)

'Waiting to Exhale (1995)'

---
---

## 2. Association Implementation

### 2.1 Define Hyperparameter for Support and Confidence

In [570]:
## Min Support range [0.0, 1.0]
min_support_ratio = 0.2

## Confidence range [0.0, 1.0]
min_confidence_ratio = 0.5

---
---

### 2.2 MLXTEND -- for time & checking purpose

In [571]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_ary = te.fit(data_list).transform(data_list)
mlxtnd_df = pd.DataFrame(te_ary, columns=te.columns_)

In [572]:
te_ary

array([[ True, False,  True, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False,  True, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [573]:
from mlxtend.frequent_patterns import apriori

%timeit -n 10 -r 10 apriori(mlxtnd_df, min_support=min_support_ratio)

13.3 ms ± 675 µs per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [574]:
ap = apriori(mlxtnd_df, min_support=min_support_ratio)
ap

Unnamed: 0,support,itemsets
0,0.241379,(0)
1,0.241379,(4)
2,0.241379,(18)
3,0.206897,(19)
4,0.344828,(26)
...,...,...
457,0.206897,"(128, 325, 313, 368, 89, 155)"
458,0.206897,"(128, 325, 313, 368, 89, 315)"
459,0.206897,"(128, 325, 313, 368, 89, 347)"
460,0.206897,"(128, 578, 325, 313, 89, 347)"


In [575]:
from mlxtend.frequent_patterns import fpgrowth

%timeit -n 10 -r 10 fpgrowth(mlxtnd_df, min_support=min_support_ratio)

15.4 ms ± 3.15 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [576]:
fp = fpgrowth(mlxtnd_df, min_support=min_support_ratio)
fp

Unnamed: 0,support,itemsets
0,0.551724,(128)
1,0.448276,(315)
2,0.413793,(728)
3,0.379310,(578)
4,0.379310,(185)
...,...,...
457,0.206897,"(105, 230)"
458,0.206897,"(578, 171)"
459,0.206897,"(128, 171)"
460,0.206897,"(104, 171)"


---
---

## Apriori 

In [598]:
# data_tuple = [tuple(v) for k, v in favorable_ratings.groupby("UserID")["MovieID"] ]

In [602]:
data_tuple=[]
for k, vs in favorable_ratings.groupby("UserID")["MovieID"]:
    l = []
    for v in vs:
        l.append(get_movie_name(v))
    data_tuple.append(tuple(l))

In [603]:
data_tuple

[('Toy Story (1995)',
  'Grumpier Old Men (1995)',
  'Heat (1995)',
  'Seven (a.k.a. Se7en) (1995)',
  'Usual Suspects, The (1995)',
  'Bottle Rocket (1996)',
  'Braveheart (1995)',
  'Rob Roy (1995)',
  'Canadian Bacon (1995)',
  'Desperado (1995)',
  'Billy Madison (1995)',
  'Dumb & Dumber (Dumb and Dumber) (1994)',
  'Ed Wood (1994)',
  'Star Wars: Episode IV - A New Hope (1977)',
  'Tommy Boy (1995)',
  'Clear and Present Danger (1994)',
  'Forrest Gump (1994)',
  'Jungle Book, The (1994)',
  'Mask, The (1994)',
  'Dazed and Confused (1993)',
  'Fugitive, The (1993)',
  'Jurassic Park (1993)',
  "Schindler's List (1993)",
  'So I Married an Axe Murderer (1993)',
  'Three Musketeers, The (1993)',
  'Tombstone (1993)',
  'Dances with Wolves (1990)',
  'Batman (1989)',
  'Silence of the Lambs, The (1991)',
  'Pinocchio (1940)',
  'Fargo (1996)',
  'James and the Giant Peach (1996)',
  'Rock, The (1996)',
  "She's the One (1996)",
  'Wizard of Oz, The (1939)',
  'Citizen Kane (1941)',

In [604]:
from apriori.apriori import *
from apriori.itemsets import *
from apriori.rules import *

In [605]:
item, rule = apriori(data_tuple, min_support=min_support_ratio, min_confidence=min_confidence_ratio)

In [606]:
item

{1: {('Dances with Wolves (1990)',): 9,
  ('Full Metal Jacket (1987)',): 7,
  ('Groundhog Day (1993)',): 7,
  ('Fargo (1996)',): 6,
  ('Star Wars: Episode V - The Empire Strikes Back (1980)',): 10,
  ('Star Wars: Episode IV - A New Hope (1977)',): 11,
  ('Apocalypse Now (1979)',): 6,
  ('Terminator, The (1984)',): 8,
  ('Back to the Future (1985)',): 10,
  ("Schindler's List (1993)",): 8,
  ('Goodfellas (1990)',): 7,
  ('Usual Suspects, The (1995)',): 11,
  ('Indiana Jones and the Last Crusade (1989)',): 6,
  ('Princess Bride, The (1987)',): 7,
  ('Jurassic Park (1993)',): 9,
  ('Gladiator (2000)',): 12,
  ('Toy Story (1995)',): 7,
  ('Fight Club (1999)',): 7,
  ('Fugitive, The (1993)',): 9,
  ('Alien (1979)',): 7,
  ('Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',): 13,
  ('X-Men (2000)',): 8,
  ('Matrix, The (1999)',): 11,
  ('Forrest Gump (1994)',): 16,
  ('Braveheart (1995)',): 8,
  ('Silence of the Lambs, The (1991)',): 11,
  ('Seven (a.k.a. Se7en

In [607]:
rule

[{Matrix, The (1999)} -> {Alien (1979)},
 {Alien (1979)} -> {Matrix, The (1999)},
 {American Beauty (1999)} -> {Forrest Gump (1994)},
 {Matrix, The (1999)} -> {American Beauty (1999)},
 {American Beauty (1999)} -> {Matrix, The (1999)},
 {Star Wars: Episode IV - A New Hope (1977)} -> {American Beauty (1999)},
 {American Beauty (1999)} -> {Star Wars: Episode IV - A New Hope (1977)},
 {Star Wars: Episode V - The Empire Strikes Back (1980)} -> {American Beauty (1999)},
 {American Beauty (1999)} -> {Star Wars: Episode V - The Empire Strikes Back (1980)},
 {Terminator, The (1984)} -> {American Beauty (1999)},
 {American Beauty (1999)} -> {Terminator, The (1984)},
 {Usual Suspects, The (1995)} -> {Apocalypse Now (1979)},
 {Apocalypse Now (1979)} -> {Usual Suspects, The (1995)},
 {Forrest Gump (1994)} -> {Back to the Future (1985)},
 {Back to the Future (1985)} -> {Forrest Gump (1994)},
 {Jurassic Park (1993)} -> {Back to the Future (1985)},
 {Back to the Future (1985)} -> {Jurassic Park (1993

In [583]:
ap_pat_len = 0
for i in range(1,len(item)+1):
    ap_pat_len += len(item[i])
    
print("Apriori number of association patterns:", ap_pat_len)
ap_rul_len = len(rule)
print("Apriori number of rules:", ap_rul_len)

Apriori number of association patterns: 462
Apriori number of rules: 3091


---
---

## FP Growth

In [608]:
from fpgrowth.fpgrowth import *
from fpgrowth.utils import *

In [586]:
# data_list = [list(v.values) for k, v in favorable_ratings.groupby("UserID")["MovieID"] ]

In [609]:
data_list=[]
for k, vs in favorable_ratings.groupby("UserID")["MovieID"]:
    l = []
    for v in vs:
        l.append(get_movie_name(v))
    data_list.append(list(l))

In [610]:
freqItemSet, rules = fpgrowth(data_list, minSupRatio=min_support_ratio, minConf=min_confidence_ratio)

In [611]:
print(freqItemSet)

[{'Fargo (1996)'}, {'Monty Python and the Holy Grail (1975)'}, {'Goodfellas (1990)', 'Monty Python and the Holy Grail (1975)'}, {'Apocalypse Now (1979)'}, {'Apocalypse Now (1979)', 'Usual Suspects, The (1995)'}, {'Indiana Jones and the Last Crusade (1989)'}, {'Indiana Jones and the Last Crusade (1989)', 'Star Wars: Episode V - The Empire Strikes Back (1980)'}, {'Indiana Jones and the Last Crusade (1989)', 'Star Wars: Episode IV - A New Hope (1977)'}, {'Indiana Jones and the Last Crusade (1989)', 'Star Wars: Episode V - The Empire Strikes Back (1980)', 'Star Wars: Episode IV - A New Hope (1977)'}, {'L.A. Confidential (1997)'}, {'Good Will Hunting (1997)'}, {'Collateral (2004)'}, {'Sixth Sense, The (1999)'}, {'Babe (1995)'}, {'Die Hard: With a Vengeance (1995)'}, {'Monsters, Inc. (2001)'}, {'Catch Me If You Can (2002)'}, {'Catch Me If You Can (2002)', 'Minority Report (2002)'}, {'Casino Royale (2006)'}, {'Casino Royale (2006)', 'Forrest Gump (1994)'}, {'Up (2009)'}, {'Titanic (1997)'}, {

In [612]:
print(rules) 

[[{'Goodfellas (1990)'}, {'Monty Python and the Holy Grail (1975)'}, 0.8571428571428571], [{'Monty Python and the Holy Grail (1975)'}, {'Goodfellas (1990)'}, 1.0], [{'Apocalypse Now (1979)'}, {'Usual Suspects, The (1995)'}, 1.0], [{'Usual Suspects, The (1995)'}, {'Apocalypse Now (1979)'}, 0.5454545454545454], [{'Indiana Jones and the Last Crusade (1989)'}, {'Star Wars: Episode V - The Empire Strikes Back (1980)'}, 1.0], [{'Star Wars: Episode V - The Empire Strikes Back (1980)'}, {'Indiana Jones and the Last Crusade (1989)'}, 0.6], [{'Indiana Jones and the Last Crusade (1989)'}, {'Star Wars: Episode IV - A New Hope (1977)'}, 1.0], [{'Star Wars: Episode IV - A New Hope (1977)'}, {'Indiana Jones and the Last Crusade (1989)'}, 0.5454545454545454], [{'Indiana Jones and the Last Crusade (1989)'}, {'Star Wars: Episode V - The Empire Strikes Back (1980)', 'Star Wars: Episode IV - A New Hope (1977)'}, 1.0], [{'Star Wars: Episode V - The Empire Strikes Back (1980)'}, {'Indiana Jones and the Last

In [614]:
if(freqItemSet):
    print("FP Growth number of Freq Itemset:" , len(freqItemSet))
if(rules):
    print("FP Growth number of rules:" , len(rules))

FP Growth number of Freq Itemset: 462
FP Growth number of rules: 3069


---
---