In [1]:
import pandas as pd
from pathlib import Path

In [2]:
file_path = './datasets/chapter_4/ratings.dat'

In [3]:
all_ratings = pd.read_csv(file_path, delimiter="::", header=None, names=['UserID', 'MovieID', 'Rating', 'Datetime'])

  all_ratings = pd.read_csv(file_path, delimiter="::", header=None, names=['UserID', 'MovieID', 'Rating', 'Datetime'])


In [4]:
all_ratings["Datetime"] = pd.to_datetime(all_ratings['Datetime'])

In [5]:
all_ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Datetime
0,1,1193,5,1970-01-01 00:00:00.978300760
1,1,661,3,1970-01-01 00:00:00.978302109
2,1,914,3,1970-01-01 00:00:00.978301968
3,1,3408,4,1970-01-01 00:00:00.978300275
4,1,2355,5,1970-01-01 00:00:00.978824291


In [6]:
all_ratings['UserID'].nunique()

6040

In [7]:
all_ratings['Favorable'] = all_ratings['Rating'] > 3

In [8]:
ratings = all_ratings[all_ratings['UserID'].isin(range(200))]

In [9]:
favorable_ratings = ratings[ratings["Favorable"]]

In [10]:
favorable_reviews_by_users = dict((k, frozenset(v.values)) for k, v in favorable_ratings.groupby('UserID')["MovieID"])

In [11]:
num_favorable_by_movie = ratings[['MovieID', 'Favorable']].groupby('MovieID').sum()

In [12]:
num_favorable_by_movie.sort_values("Favorable", ascending=False)[:5]

Unnamed: 0_level_0,Favorable
MovieID,Unnamed: 1_level_1
2858,106
2028,85
1196,83
260,80
3578,77


In [13]:
frequent_itemsets = dict()
min_support = 50
frequent_itemsets[1] = dict((frozenset((movie_id,)), row['Favorable']) for movie_id, row in num_favorable_by_movie.iterrows() if row['Favorable'] > min_support)

In [14]:
from collections import defaultdict

def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])



In [15]:
import sys
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1], min_support)
    frequent_itemsets[k] = cur_frequent_itemsets
    if len(cur_frequent_itemsets) == 0:
        print(f"Did not find any frequent itemsets of length {k}.")
        sys.stdout.flush()
        break
    else:
        print(f"I found {len(cur_frequent_itemsets)} frequent itemsets of length {k}.")
        sys.stdout.flush()
        frequent_itemsets[k] = cur_frequent_itemsets


I found 268 frequent itemsets of length 2.
I found 1447 frequent itemsets of length 3.
I found 4938 frequent itemsets of length 4.
I found 11626 frequent itemsets of length 5.
I found 19469 frequent itemsets of length 6.


In [None]:
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))

In [None]:
print(candidate_rules[:25])

[(frozenset(), 1), (frozenset(), 110), (frozenset(), 260), (frozenset(), 296), (frozenset(), 318), (frozenset(), 356), (frozenset(), 480), (frozenset(), 527), (frozenset(), 589), (frozenset(), 593), (frozenset(), 608), (frozenset(), 858), (frozenset(), 1097), (frozenset(), 1196), (frozenset(), 1197), (frozenset(), 1198), (frozenset(), 1210), (frozenset(), 1265), (frozenset(), 1270), (frozenset(), 1580), (frozenset(), 2028), (frozenset(), 2396), (frozenset(), 2571), (frozenset(), 2762), (frozenset(), 2858)]


In [None]:
correct_counts = defaultdict()
incorrect_counts =  defaultdict()

correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

In [None]:
rule_confidence = {candidate_rule: correct_counts[candidate_rule]/float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule]) for candidate_rule in candidate_rules} 

In [None]:
from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)

for index in range(5):
    print(f"Rule #{index+1}")
    (premise, conclusion) = sorted_confidence[index][0]
    print(f"Rule: If a person recommends {premise} they will also recommend {conclusion}.")
    print(f"- Confidence: {rule_confidence[(premise, conclusion)]}")
    print("")

Rule #1
Rule: If a person recommends frozenset({589, 1198}) they will also recommend 1196.
- Confidence: 1.0

Rule #2
Rule: If a person recommends frozenset({2571, 1198}) they will also recommend 1196.
- Confidence: 1.0

Rule #3
Rule: If a person recommends frozenset({1097, 1210}) they will also recommend 1196.
- Confidence: 1.0

Rule #4
Rule: If a person recommends frozenset({593, 3578, 110}) they will also recommend 2571.
- Confidence: 1.0

Rule #5
Rule: If a person recommends frozenset({2571, 1198, 110}) they will also recommend 1196.
- Confidence: 1.0



In [None]:
movie_name_data = pd.read_csv(r'.\datasets\chapter_4\movies.dat', delimiter=
'::', header=None, encoding='mac-roman')

  movie_name_data = pd.read_csv(r'.\datasets\chapter_4\movies.dat', delimiter=
