In [2]:
import os
data_folder = os.path.join(os.getcwd(), "data", "movies")
ratings_filename = os.path.join(data_folder, "ratings.dat")

In [3]:
import pandas as pd
import datetime

In [4]:
all_ratings = pd.read_csv(ratings_filename, delimiter="::", header=None, names = ["UserID", "MovieID", "Rating", "Datetime"])
all_ratings["Datetime"] = pd.to_datetime(all_ratings['Datetime'],unit='s')
all_ratings[:5]

  """Entry point for launching an IPython kernel.


Unnamed: 0,UserID,MovieID,Rating,Datetime
0,1,1193,5,2000-12-31 22:12:40
1,1,661,3,2000-12-31 22:35:09
2,1,914,3,2000-12-31 22:32:48
3,1,3408,4,2000-12-31 22:04:35
4,1,2355,5,2001-01-06 23:38:11


#### There are no review for most movies, such as #213

In [5]:
# Not all reviews are favourable! Our goal is "other recommended books", so we only want favourable reviews
all_ratings["Favorable"] = all_ratings["Rating"] > 3
all_ratings[10:15]

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
10,1,595,5,2001-01-06 23:37:48,True
11,1,938,4,2000-12-31 22:29:12,True
12,1,2398,4,2000-12-31 22:38:01,True
13,1,2918,4,2000-12-31 22:35:24,True
14,1,1035,5,2000-12-31 22:29:13,True


In [7]:
all_ratings[all_ratings["UserID"] == 1][:5]

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
0,1,1193,5,2000-12-31 22:12:40,True
1,1,661,3,2000-12-31 22:35:09,False
2,1,914,3,2000-12-31 22:32:48,False
3,1,3408,4,2000-12-31 22:04:35,True
4,1,2355,5,2001-01-06 23:38:11,True


In [8]:
# Sample the dataset. Increasing the size of the sample, increases the run-time will be considerably longer
ratings = all_ratings[all_ratings['UserID'].isin(range(170))]
ratings.shape

(23947, 5)

In [9]:
favorable_ratings = ratings[ratings["Favorable"]]
favorable_ratings[:5]

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
0,1,1193,5,2000-12-31 22:12:40,True
3,1,3408,4,2000-12-31 22:04:35,True
4,1,2355,5,2001-01-06 23:38:11,True
6,1,1287,5,2000-12-31 22:33:59,True
7,1,2804,5,2000-12-31 22:11:59,True


In [10]:
favorable_reviews_by_users = dict((k, frozenset(v.values)) for k, v in favorable_ratings.groupby("UserID")["MovieID"])
len(favorable_reviews_by_users)

169

In [11]:
# Find out how many movies have favourable ratings
num_favorable_by_movie = ratings[["MovieID", "Favorable"]].groupby("MovieID").sum()
num_favorable_by_movie.sort_values("Favorable", ascending=False)[:5]

Unnamed: 0_level_0,Favorable
MovieID,Unnamed: 1_level_1
2858,87.0
2028,73.0
1196,68.0
260,67.0
3578,65.0


In [12]:
from collections import defaultdict

def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])

In [13]:
import sys
frequent_itemsets = {}
min_support = 50

# k=1 candidates have support more than min_support favourable reviews
frequent_itemsets[1] = dict((frozenset((movie_id,)), row["Favorable"])
                                for movie_id, row in num_favorable_by_movie.iterrows()
                                if row["Favorable"] > min_support)

print("There are {} movies with more than {} favorable reviews".format(len(frequent_itemsets[1]), min_support))
for k in range(2, 20):
    # Generate candidates of length k, using the frequent itemsets of length k-1
    # Only store the frequent itemsets
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1],
                                                   min_support)
    if len(cur_frequent_itemsets) == 0:
        print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    else:
        print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
        #print(cur_frequent_itemsets)
        frequent_itemsets[k] = cur_frequent_itemsets
# We aren't interested in the itemsets of length 1, so remove those
del frequent_itemsets[1]

There are 14 movies with more than 50 favorable reviews
I found 75 frequent itemsets of length 2
I found 218 frequent itemsets of length 3
I found 389 frequent itemsets of length 4
I found 436 frequent itemsets of length 5
I found 296 frequent itemsets of length 6
I found 115 frequent itemsets of length 7
I found 23 frequent itemsets of length 8
I found 2 frequent itemsets of length 9
Did not find any frequent itemsets of length 10


In [14]:
print("Found a total of {0} frequent itemsets".format(sum(len(itemsets) for itemsets in frequent_itemsets.values())))

Found a total of 1554 frequent itemsets


In [15]:
# Now we create the association rules. First, they all are candidates until the confidence has been tested
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))
print("There are {} candidate rules".format(len(candidate_rules)))

There are 7323 candidate rules


In [16]:
print(candidate_rules[:5])

[(frozenset({527}), 260), (frozenset({260}), 527), (frozenset({260}), 2762), (frozenset({2762}), 260), (frozenset({2028}), 260)]


In [17]:
#Now we compute confidence
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1
rule_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
              for candidate_rule in candidate_rules}

In [18]:
# Choose only rules above a minimum confidence level
min_confidence = 0.9

In [19]:
# Filter out the rules with poor confidence
rule_confidence = {rule: confidence for rule, confidence in rule_confidence.items() if confidence > min_confidence}
print(len(rule_confidence))

1624


In [20]:
from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)

In [21]:
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise, conclusion))
    print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")

Rule #1
Rule: If a person recommends frozenset({2571, 1198}) they will also recommend 1196
 - Confidence: 1.000

Rule #2
Rule: If a person recommends frozenset({2571, 260}) they will also recommend 1196
 - Confidence: 1.000

Rule #3
Rule: If a person recommends frozenset({2571, 1198, 318}) they will also recommend 1196
 - Confidence: 1.000

Rule #4
Rule: If a person recommends frozenset({1210, 1198, 318}) they will also recommend 1196
 - Confidence: 1.000

Rule #5
Rule: If a person recommends frozenset({480, 2571, 1198}) they will also recommend 1196
 - Confidence: 1.000



In [22]:
# We can get the movie titles themselves from the dataset
movie_name_filename = os.path.join(data_folder, "movies.dat")
movie_name_data = pd.read_csv(movie_name_filename, delimiter="::", header=None, encoding = "mac-roman")
movie_name_data.columns = ['MovieID','Title','Genre']
# movie_name_data.columns = ["MovieID", "Title", "Release Date", "Video Release", "IMDB", "<UNK>", "Action", "Adventure",
#                            "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
#                            "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]

  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
movie_name_data.head()

Unnamed: 0,MovieID,Title,Genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [24]:
def get_movie_name(movie_id):
    title_object = movie_name_data[movie_name_data["MovieID"] == movie_id]["Title"]
    title = title_object.values[0]
    return title

In [25]:
get_movie_name(4)

'Waiting to Exhale (1995)'

In [26]:
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")

Rule #1
Rule: If a person recommends Matrix, The (1999), Raiders of the Lost Ark (1981) they will also recommend Star Wars: Episode V - The Empire Strikes Back (1980)
 - Confidence: 1.000

Rule #2
Rule: If a person recommends Matrix, The (1999), Star Wars: Episode IV - A New Hope (1977) they will also recommend Star Wars: Episode V - The Empire Strikes Back (1980)
 - Confidence: 1.000

Rule #3
Rule: If a person recommends Matrix, The (1999), Raiders of the Lost Ark (1981), Shawshank Redemption, The (1994) they will also recommend Star Wars: Episode V - The Empire Strikes Back (1980)
 - Confidence: 1.000

Rule #4
Rule: If a person recommends Star Wars: Episode VI - Return of the Jedi (1983), Raiders of the Lost Ark (1981), Shawshank Redemption, The (1994) they will also recommend Star Wars: Episode V - The Empire Strikes Back (1980)
 - Confidence: 1.000

Rule #5
Rule: If a person recommends Jurassic Park (1993), Matrix, The (1999), Raiders of the Lost Ark (1981) they will also recommend

In [27]:
# Evaluation using test data
test_dataset = all_ratings[~all_ratings['UserID'].isin(range(5000))]
test_favorable = test_dataset[test_dataset["Favorable"]]
#test_not_favourable = test_dataset[~test_dataset["Favourable"]]
test_favorable_by_users = dict((k, frozenset(v.values)) for k, v in test_favorable.groupby("UserID")["MovieID"])
#test_not_favourable_by_users = dict((k, frozenset(v.values)) for k, v in test_not_favourable.groupby("UserID")["MovieID"])
#test_users = test_dataset["UserID"].unique()

In [28]:
test_dataset[:5]

Unnamed: 0,UserID,MovieID,Rating,Datetime,Favorable
831451,5000,2054,4,2000-07-03 03:54:39,True
831452,5000,2058,4,2000-07-03 03:40:55,True
831453,5000,2,5,2000-07-03 03:59:41,True
831454,5000,3005,4,2000-07-03 03:40:26,True
831455,5000,7,3,2000-07-03 04:07:10,False


In [29]:
test_dataset.shape

(168758, 5)

In [30]:
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in test_favorable_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

In [31]:
test_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
                   for candidate_rule in rule_confidence}
print(len(test_confidence))

1624


In [32]:
sorted_test_confidence = sorted(test_confidence.items(), key=itemgetter(1), reverse=True)
print(sorted_test_confidence[:5])

[((frozenset({480, 260, 2858, 527, 1210}), 1196), 1.0), ((frozenset({480, 260, 2858, 2571, 527}), 1196), 1.0), ((frozenset({480, 2571, 1198, 527, 1210, 318}), 1196), 1.0), ((frozenset({480, 260, 2571, 1198, 527, 318}), 1196), 1.0), ((frozenset({480, 260, 2571, 1198, 527, 1210}), 1196), 1.0)]


In [33]:
for index in range(10):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    print(" - Train Confidence: {0:.3f}".format(rule_confidence.get((premise, conclusion), -1)))
    print(" - Test Confidence: {0:.3f}".format(test_confidence.get((premise, conclusion), -1)))
    print("")

Rule #1
Rule: If a person recommends Matrix, The (1999), Raiders of the Lost Ark (1981) they will also recommend Star Wars: Episode V - The Empire Strikes Back (1980)
 - Train Confidence: 1.000
 - Test Confidence: 0.782

Rule #2
Rule: If a person recommends Matrix, The (1999), Star Wars: Episode IV - A New Hope (1977) they will also recommend Star Wars: Episode V - The Empire Strikes Back (1980)
 - Train Confidence: 1.000
 - Test Confidence: 0.846

Rule #3
Rule: If a person recommends Matrix, The (1999), Raiders of the Lost Ark (1981), Shawshank Redemption, The (1994) they will also recommend Star Wars: Episode V - The Empire Strikes Back (1980)
 - Train Confidence: 1.000
 - Test Confidence: 0.847

Rule #4
Rule: If a person recommends Star Wars: Episode VI - Return of the Jedi (1983), Raiders of the Lost Ark (1981), Shawshank Redemption, The (1994) they will also recommend Star Wars: Episode V - The Empire Strikes Back (1980)
 - Train Confidence: 1.000
 - Test Confidence: 0.953

Rule #