In [34]:
import pandas as pd
from pathlib import Path

In [35]:
file_path = './datasets/chapter_4/ratings.dat'

In [36]:
all_ratings = pd.read_csv(file_path, delimiter="::", header=None, names=['UserID', 'MovieID', 'Rating', 'Datetime'])

  all_ratings = pd.read_csv(file_path, delimiter="::", header=None, names=['UserID', 'MovieID', 'Rating', 'Datetime'])


In [37]:
all_ratings["Datetime"] = pd.to_datetime(all_ratings['Datetime'])

In [38]:
all_ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Datetime
0,1,1193,5,1970-01-01 00:00:00.978300760
1,1,661,3,1970-01-01 00:00:00.978302109
2,1,914,3,1970-01-01 00:00:00.978301968
3,1,3408,4,1970-01-01 00:00:00.978300275
4,1,2355,5,1970-01-01 00:00:00.978824291


In [39]:
all_ratings['UserID'].nunique()

6040

In [40]:
all_ratings['Favorable'] = all_ratings['Rating'] > 3

In [41]:
ratings = all_ratings[all_ratings['UserID'].isin(range(200))]

In [42]:
favorable_ratings = ratings[ratings["Favorable"]]

In [43]:
favorable_reviews_by_users = dict((k, frozenset(v. values)) for k, v in favorable_ratings.groupby('UserID')["MovieID"])

In [44]:
num_favorable_by_movie = ratings[['MovieID', 'Favorable']].groupby('MovieID').sum()

In [45]:
num_favorable_by_movie.sort_values("Favorable", ascending=False)[:5]

Unnamed: 0_level_0,Favorable
MovieID,Unnamed: 1_level_1
2858,106
2028,85
1196,83
260,80
3578,77


In [46]:
frequent_itemsets = dict()
min_support = 50
frequent_itemsets[1] = dict((frozenset((movie_id,)), row['Favorable']) for movie_id, row in num_favorable_by_movie.iterrows() if row['Favorable'] > min_support)

In [47]:
from collections import defaultdict

def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])



In [48]:
import sys
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1], min_support)
    frequent_itemsets[k] = cur_frequent_itemsets
    if len(cur_frequent_itemsets) == 0:
        print(f"Did not find any frequent itemsets of length {k}.")
        sys.stdout.flush()
        break
    else:
        print(f"I found {len(cur_frequent_itemsets)} frequent itemsets of length {k}.")
        sys.stdout.flush()
        frequent_itemsets[k] = cur_frequent_itemsets


I found 268 frequent itemsets of length 2.
I found 1447 frequent itemsets of length 3.
I found 4938 frequent itemsets of length 4.
I found 11626 frequent itemsets of length 5.
