## Import of the packages is needed

In [1]:
import numpy as np
import pandas as pd
from zipfile import ZipFile, Path
from collections import defaultdict
from sklearn.model_selection import GroupShuffleSplit
from operator import itemgetter
import sys

# Importing the data

In [2]:
# specifying the zip file name
file_name = "Archive.zip"
  
# opening the zip file in READ mode
with ZipFile(file_name, 'r') as zip:
    # printing all the contents of the zip file
    zip.printdir()
  
    # extracting all the files
    print('Extracting all the files now...')
    zip.extractall()
    print('Done!')

File Name                                             Modified             Size
Books.csv                                      2019-09-07 23:10:34     23471894
Ratings.csv                                    2019-09-07 23:10:36     22633877
Users.csv                                      2019-09-07 23:10:38      2465187
Extracting all the files now...
Done!


In [3]:
# pass in the specific file name 
# to the open method
with ZipFile("Archive.zip") as myzip:
    data = myzip.open("Ratings.csv")

#Now, we can read in the data
df = pd.read_csv(data, sep=';')
df.head()

Unnamed: 0,User-ID,ISBN,Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


## Preparing the data

In [4]:
df2 = df[(df['Rating'] != 0)]
df = df2.copy()
df['Positive'] = df["Rating"] > 4 # positive ratings

## Create Train and test data

In [5]:
train_inds, test_inds = next(GroupShuffleSplit(test_size=.20, n_splits=2, random_state = 44).split(df, groups=df['User-ID']))
train = df.iloc[train_inds]
test = df.iloc[test_inds]

# Writes to a CSV file
train.to_csv('book_train.csv') 
test.to_csv('book_test.csv')

In [6]:
top_ratings_train = pd.read_csv('book_train.csv')
fav_rating = top_ratings_train[top_ratings_train["Positive"]]; fav_rating

Unnamed: 0.1,Unnamed: 0,User-ID,ISBN,Rating,Positive
0,1,276726,0155061224,5,True
2,4,276729,0521795028,6,True
3,6,276736,3257224281,8,True
4,7,276737,0600570967,6,True
5,8,276744,038550120X,7,True
...,...,...,...,...,...
345707,1149771,276704,0743211383,7,True
345708,1149773,276704,0806917695,5,True
345709,1149775,276704,1563526298,9,True
345710,1149777,276709,0515107662,10,True


In [7]:
# the books which each user has given a favorable review
postive_reviews_by_users = dict((k, frozenset(v.values))
                               for k, v in fav_rating.groupby("User-ID")["ISBN"]); postive_reviews_by_users

{9: frozenset({'0452264464'}),
 10: frozenset({'8477024456'}),
 12: frozenset({'1879384493'}),
 14: frozenset({'0061076031', '0439095026', '0689821166'}),
 16: frozenset({'0345402871'}),
 17: frozenset({'0425099148', '0553264990', '0891075275'}),
 19: frozenset({'0375759778'}),
 22: frozenset({'3404921038'}),
 26: frozenset({'0446310786', '0449005615'}),
 32: frozenset({'0060168013'}),
 39: frozenset({'0553582909', '0671888587'}),
 42: frozenset({'0553582747'}),
 44: frozenset({'0440223571'}),
 53: frozenset({'0156047624', '0245542957', '0380715899'}),
 56: frozenset({'0671623249', '0679865691'}),
 67: frozenset({'0394743741'}),
 69: frozenset({'0617683993', '1853260053'}),
 70: frozenset({'1414035004'}),
 73: frozenset({'0060938412'}),
 75: frozenset({'1558531025'}),
 81: frozenset({'0375410538'}),
 82: frozenset({'0966986105'}),
 83: frozenset({'087113375X'}),
 85: frozenset({'0340767936'}),
 87: frozenset({'0071416331', '0375509038'}),
 91: frozenset({'0316769487'}),
 92: frozenset(

In [8]:
# create a DataFrame that tells us how frequently each book has been given a favorable review
num_positive_by_book = top_ratings_train[["ISBN", "Positive"]].groupby('ISBN').sum(); num_positive_by_book
# Let's see the top five movies list
num_positive_by_book.sort_values("Positive", ascending = False)[:5]

Unnamed: 0_level_0,Positive
ISBN,Unnamed: 1_level_1
0316666343,546
0385504209,363
0312195516,293
0679781587,263
059035342X,251


## The Apriori Algorthim

In [9]:
frequent_itemsets = {}
min_support = 50 # define the minimum support needed for an itemset to be considered frequent
frequent_itemsets[1] = dict((frozenset((ISBN,)), row["Positive"])
                  for ISBN, row in num_positive_by_book.iterrows()
                  if row["Positive"] > min_support)
print(frequent_itemsets)

{1: {frozenset({'002542730X'}): 58, frozenset({'0060175400'}): 62, frozenset({'0060391626'}): 62, frozenset({'0060392452'}): 112, frozenset({'0060502258'}): 132, frozenset({'0060915544'}): 92, frozenset({'0060921145'}): 56, frozenset({'0060928336'}): 249, frozenset({'0060929871'}): 53, frozenset({'0060930535'}): 160, frozenset({'0060934417'}): 128, frozenset({'0060938455'}): 123, frozenset({'0060959037'}): 115, frozenset({'0060976845'}): 143, frozenset({'0060977493'}): 66, frozenset({'0060987103'}): 117, frozenset({'0060987529'}): 83, frozenset({'0060987561'}): 68, frozenset({'0061009059'}): 101, frozenset({'006101351X'}): 100, frozenset({'0061015725'}): 56, frozenset({'0061097101'}): 69, frozenset({'0061097314'}): 58, frozenset({'0062502182'}): 84, frozenset({'0064400557'}): 51, frozenset({'0064407667'}): 75, frozenset({'0066214122'}): 70, frozenset({'0070212570'}): 75, frozenset({'0099771519'}): 68, frozenset({'0140067477'}): 81, frozenset({'014025448X'}): 65, frozenset({'014028009X'

In [10]:
def find_frequent_itemsets(postive_reviews_by_users, k_1_itemsets, min_support):
    counts = defaultdict(int)
    # iterate over all of the users and their reviews
    for user, reviews in postive_reviews_by_users.items(): 
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_book in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_book,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])

In [11]:
for k in range(2, 20):
    cur_frequent_itemsets = find_frequent_itemsets(postive_reviews_by_users, frequent_itemsets[k-1], min_support)
    frequent_itemsets[k] = cur_frequent_itemsets
    # break out the preceding loop if we didn't find any new frequent itemsets
    if len (cur_frequent_itemsets) == 0:
        print("Didn't find any frequent itemset length {}".format(k))
        sys.stdout.flush()
        break
    else:
        print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
        sys.stdout.flush()
# these are itemsets of length one, we need at least two items - Let's delete them     
del frequent_itemsets[1]

I found 52 frequent itemsets of length 2
I found 22 frequent itemsets of length 3
I found 6 frequent itemsets of length 4
I found 1 frequent itemsets of length 5
Didn't find any frequent itemset length 6


In [12]:
frequent_itemsets[5].items()

dict_items([(frozenset({'0439139597', '0439136350', '0590353403', '043935806X', '0439064864'}), 110)])

## Mining association rules

In [13]:
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))
    print(candidate_rules[:5])

[(frozenset({'0439064864'}), '0439136350'), (frozenset({'0439136350'}), '0439064864'), (frozenset({'0439064864'}), '0590353403'), (frozenset({'0590353403'}), '0439064864'), (frozenset({'0439064864'}), '0439139597')]
[(frozenset({'0439064864'}), '0439136350'), (frozenset({'0439136350'}), '0439064864'), (frozenset({'0439064864'}), '0590353403'), (frozenset({'0590353403'}), '0439064864'), (frozenset({'0439064864'}), '0439139597')]
[(frozenset({'0439064864'}), '0439136350'), (frozenset({'0439136350'}), '0439064864'), (frozenset({'0439064864'}), '0590353403'), (frozenset({'0590353403'}), '0439064864'), (frozenset({'0439064864'}), '0439139597')]
[(frozenset({'0439064864'}), '0439136350'), (frozenset({'0439136350'}), '0439064864'), (frozenset({'0439064864'}), '0590353403'), (frozenset({'0590353403'}), '0439064864'), (frozenset({'0439064864'}), '0439139597')]
[(frozenset({'0439064864'}), '0439136350'), (frozenset({'0439136350'}), '0439064864'), (frozenset({'0439064864'}), '0590353403'), (froze

In [14]:
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)

for user, reviews in postive_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1
                
rule_confidence = {candidate_rule: correct_counts[candidate_rule]
                   /float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
                  for candidate_rule in candidate_rules}

## Get book names

In [15]:
from operator import itemgetter
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)
for index in range(20):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    print("Rule: if a person recommend {0} they will also recommend {1}".format(premise, conclusion))
    print(" - Confidence:{0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")

Rule #1
Rule: if a person recommend frozenset({'0590353403', '043935806X'}) they will also recommend 0439064864
 - Confidence:0.970

Rule #2
Rule: if a person recommend frozenset({'0590353403', '043935806X', '0439136350'}) they will also recommend 0439064864
 - Confidence:0.968

Rule #3
Rule: if a person recommend frozenset({'043935806X', '0439139597', '0439064864'}) they will also recommend 0439136350
 - Confidence:0.964

Rule #4
Rule: if a person recommend frozenset({'0590353403', '043935806X', '0439139597'}) they will also recommend 0439064864
 - Confidence:0.958

Rule #5
Rule: if a person recommend frozenset({'0590353403', '043935806X', '0439139597'}) they will also recommend 0439136350
 - Confidence:0.958

Rule #6
Rule: if a person recommend frozenset({'0590353403', '0439139597'}) they will also recommend 0439064864
 - Confidence:0.957

Rule #7
Rule: if a person recommend frozenset({'0590353403', '043935806X', '0439139597', '0439064864'}) they will also recommend 0439136350
 - Con

In [16]:
def get_book_name(book_id):
    title_object =  df_books[df_books['ISBN'] == book_id] ["Title"]
    title = title_object.values[0]
    return title

In [17]:
# pass in the specific file name 
# to the open method
with ZipFile("Archive.zip") as myzip:
    data = myzip.open("Books.csv")

#Now, we can read in the data
df_books = pd.read_csv(data, sep=';')

In [18]:
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)
for index in range(10):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_book_name(idx) for idx in premise)
    conclusion_name = get_book_name(conclusion)
    print("Rule: if a person recommend {0} they will also recommend {1}".format(premise_names, conclusion_name))
    print(" - Confidence:{0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")

Rule #1
Rule: if a person recommend Harry Potter and the Sorcerer's Stone (Book 1), Harry Potter and the Order of the Phoenix (Book 5) they will also recommend Harry Potter and the Chamber of Secrets (Book 2)
 - Confidence:0.970

Rule #2
Rule: if a person recommend Harry Potter and the Sorcerer's Stone (Book 1), Harry Potter and the Order of the Phoenix (Book 5), Harry Potter and the Prisoner of Azkaban (Book 3) they will also recommend Harry Potter and the Chamber of Secrets (Book 2)
 - Confidence:0.968

Rule #3
Rule: if a person recommend Harry Potter and the Order of the Phoenix (Book 5), Harry Potter and the Goblet of Fire (Book 4), Harry Potter and the Chamber of Secrets (Book 2) they will also recommend Harry Potter and the Prisoner of Azkaban (Book 3)
 - Confidence:0.964

Rule #4
Rule: if a person recommend Harry Potter and the Sorcerer's Stone (Book 1), Harry Potter and the Order of the Phoenix (Book 5), Harry Potter and the Goblet of Fire (Book 4) they will also recommend Harr

## Test Dataset

In [19]:
test_dataset = pd.read_csv('book_test.csv')

In [20]:
# Dataset of only favourable reviews
test_positive = test_dataset[test_dataset["Positive"]]
test_positive_by_users = dict((k, frozenset(v.values))
                                  for k, v in test_positive.groupby("User-ID")["ISBN"])

In [21]:
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in test_positive_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1

In [22]:
test_confidence = {candidate_rule: correct_counts[candidate_rule] 
                   / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
                  for candidate_rule in rule_confidence}

In [23]:
for index in range(10):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    premise_names = ", ".join(get_book_name(idx) for idx in premise)
    conclusion_name = get_book_name(conclusion)
    print("Rule: if a person recommends {0} they will also recommend {1}".format(premise_names, conclusion_name))
    print(" - Train Confidence:{0:.3f}".format(rule_confidence.get((premise, conclusion), -1)))
    print(" - Test Confidence:{0:.3f}".format(test_confidence.get((premise, conclusion), -1)))
    print("")

Rule #1
Rule: if a person recommends Harry Potter and the Sorcerer's Stone (Book 1), Harry Potter and the Order of the Phoenix (Book 5) they will also recommend Harry Potter and the Chamber of Secrets (Book 2)
 - Train Confidence:0.970
 - Test Confidence:1.000

Rule #2
Rule: if a person recommends Harry Potter and the Sorcerer's Stone (Book 1), Harry Potter and the Order of the Phoenix (Book 5), Harry Potter and the Prisoner of Azkaban (Book 3) they will also recommend Harry Potter and the Chamber of Secrets (Book 2)
 - Train Confidence:0.968
 - Test Confidence:1.000

Rule #3
Rule: if a person recommends Harry Potter and the Order of the Phoenix (Book 5), Harry Potter and the Goblet of Fire (Book 4), Harry Potter and the Chamber of Secrets (Book 2) they will also recommend Harry Potter and the Prisoner of Azkaban (Book 3)
 - Train Confidence:0.964
 - Test Confidence:0.857

Rule #4
Rule: if a person recommends Harry Potter and the Sorcerer's Stone (Book 1), Harry Potter and the Order of