In [1]:
import gzip 
import pandas as pd
import math 
import random 
import os
import json
from collections import defaultdict 

In [2]:
fp = os.path.join('data', 'goodreads_reviews_comics_graphic.json')
with open(fp) as file:
    data = file.readlines()
    data = list(map(json.loads, data))

In [3]:
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)
ratingDict = {}
for i in range(0, len(data)):
    user = data[i]['user_id']
    item = data[i]['book_id']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    ratingDict[(user,item)] = data[i]['rating']

In [4]:
userAverages = {}
itemAverages = {}

for u in itemsPerUser:
    rs = [ratingDict[(u,i)] for i in itemsPerUser[u]]
    userAverages[u] = sum(rs) / len(rs)
    
for i in usersPerItem:
    rs = [ratingDict[(u,i)] for u in usersPerItem[i]]
    itemAverages[i] = sum(rs) / len(rs)

In [5]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [6]:
def mostSimilar(i, N):
    similarities = []
    users = usersPerItem[i]
    for i2 in usersPerItem:
        if i2 == i: continue
        sim = Jaccard(users, usersPerItem[i2])
        #sim = Pearson(i, i2) # Could use alternate similarity metrics straightforwardly
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:10]

In [7]:
#QUESTION 1
query = data[0]['book_id']
ms = mostSimilar(query, 10)
ms

[(0.16666666666666666, '25334626'),
 (0.14285714285714285, '25659811'),
 (0.13793103448275862, '18369278'),
 (0.13157894736842105, '18430205'),
 (0.12903225806451613, '20299669'),
 (0.125, '17995154'),
 (0.12121212121212122, '23241671'),
 (0.12121212121212122, '23093378'),
 (0.12121212121212122, '18853527'),
 (0.11764705882352941, '26778333')]

In [8]:
#QUESTION 2 PART A
results_indices = []
for i in range(0, len(data)):
    if data[i]['user_id'] == data[0]['user_id']:
        results_indices.append(i)

results_dict = {}
for i in range(0, len(results_indices)):
    results_dict[i] = data[results_indices[i]]['rating']
    
sorted_dict = sorted(results_dict.items(), key = lambda x:x[1], reverse = True)
highest_rated_index = sorted_dict[0][0]

query = data[highest_rated_index]['book_id']
ms = mostSimilar(query, 10)
ms

[(0.16666666666666666, '25334626'),
 (0.14285714285714285, '25659811'),
 (0.13793103448275862, '18369278'),
 (0.13157894736842105, '18430205'),
 (0.12903225806451613, '20299669'),
 (0.125, '17995154'),
 (0.12121212121212122, '23241671'),
 (0.12121212121212122, '23093378'),
 (0.12121212121212122, '18853527'),
 (0.11764705882352941, '26778333')]

In [9]:
#QUESTION 2 PART B
df = pd.DataFrame(data)
df = df[['user_id','book_id','rating']]

def mostSimilarUsers(j, N):
    similarities = []
    itemsReturned = []
    items = itemsPerUser[j]
    for j2 in itemsPerUser:
        if j2 == j: continue
        sim = Jaccard(items, itemsPerUser[j2])
        #sim = Pearson(i, i2) # Could use alternate similarity metrics straightforwardly
        similarities.append((sim,j2))
    similarities.sort(reverse=True)
    for i in similarities:
        filtered = df[df['user_id'] == i[1]]
        filtered = filtered.sort_values(by = 'rating', ascending = False).reset_index()
        highest_rating = filtered.iloc[0]['rating']
        filtered = filtered[filtered['rating'] == highest_rating]
        if filtered.shape[0] == 1:
            book_retrieved = filtered.iloc[filtered.shape[0]-1]['book_id']
        else:
            book_retrieved = filtered['book_id'].min()
        if j in usersPerItem[book_retrieved]:
            continue
        elif len(itemsReturned) < N:
            itemsReturned.append((i[0], book_retrieved))
        else:
            break
    return itemsReturned

query = data[0]['user_id']
ms = mostSimilarUsers(query, 10)
ms

[(0.3333333333333333, '10767466'),
 (0.25, '17570797'),
 (0.2, '15704307'),
 (0.14285714285714285, '10138607'),
 (0.05555555555555555, '12434747'),
 (0.030303030303030304, '17995248'),
 (0.023809523809523808, '10105459'),
 (0.02040816326530612, '10997645'),
 (0.014925373134328358, '10361139'),
 (0.0136986301369863, '10264328')]

In [10]:
#QUESTION 3 PART A
def Pearson(i1, i2):
    # Between two items
    iBar1 = itemAverages[i1]
    iBar2 = itemAverages[i2]
    inter = usersPerItem[i1].intersection(usersPerItem[i2])
    numer = 0
    denom1 = 0
    denom2 = 0
    denom = 0
    for u in inter:
        numer += (ratingDict[(u,i1)] - iBar1)*(ratingDict[(u,i2)] - iBar2)
    for u in inter: #usersPerItem[i1]:
        denom1 += (ratingDict[(u,i1)] - iBar1)**2
    #for u in usersPerItem[i2]:
        denom2 += (ratingDict[(u,i2)] - iBar2)**2
        denom += denom1 * denom2
    denom = (denom) **(1/2)
    if denom == 0: return 0
    return numer / denom

def nsimilarPearson(i, N):
    similarities = []
    #users = usersPerItem[i]
    for i2 in usersPerItem:
        if i2 == i: continue
        #sim = J(users, usersPerItem[i2])
        sim = Pearson(i, i2) # Could use alternate similarity metrics straightforwardly
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:N]

query = data[0]['book_id']
ms = nsimilarPearson(query, 10)
ms

[(1.0000000000000002, '95402'),
 (1.0000000000000002, '9517'),
 (1.0000000000000002, '9516'),
 (1.0000000000000002, '9408670'),
 (1.0000000000000002, '8891233'),
 (1.0000000000000002, '8714027'),
 (1.0000000000000002, '838933'),
 (1.0000000000000002, '790192'),
 (1.0000000000000002, '72114'),
 (1.0000000000000002, '6779672')]

In [11]:
#QUESTION 3 PART B
def Pearson2(i1, i2):
    # Between two items
    iBar1 = itemAverages[i1]
    iBar2 = itemAverages[i2]
    inter = usersPerItem[i1].intersection(usersPerItem[i2])
    numer = 0
    denom1 = 0
    denom2 = 0
    for u in inter:
        numer += (ratingDict[(u,i1)] - iBar1)*(ratingDict[(u,i2)] - iBar2)
    for u in usersPerItem[i1]:
        denom1 += (ratingDict[(u,i1)] - iBar1)**2
    for u in usersPerItem[i2]:
        denom2 += (ratingDict[(u,i2)] - iBar2)**2
    denom = (denom1 * denom2) **(1/2)
    if denom == 0: return 0
    return numer / denom

def nsimilarPearson2(i, N):
    similarities = []
    for i2 in usersPerItem:
        if i2 == i: continue
        #sim = J(users, usersPerItem[i2])
        sim = Pearson2(i, i2) # Could use alternate similarity metrics straightforwardly
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:N]

query = data[0]['book_id']
ms = nsimilarPearson2(query, 10)
ms

[(0.31898549007874194, '20300526'),
 (0.1878586543136926, '13280885'),
 (0.17896391275176457, '18208501'),
 (0.16269036695641687, '25430791'),
 (0.16269036695641687, '21521612'),
 (0.1555075595594449, '1341758'),
 (0.1526351566298752, '6314737'),
 (0.1520488804816035, '4009034'),
 (0.1494406444160154, '988744'),
 (0.14632419481281994, '18430205')]

In [13]:
#QUESTION 4 
ratingMean = df['rating'].mean()
def predictRating(user,item):
    ratings = []
    similarities = []
    for i2 in itemsPerUser[user]:
        if i2 == item: continue
        ratings.append(ratingDict[(user, i2)]-itemAverages[i2])
        similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return itemAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
        # User hasn't rated any similar items
        return ratingMean
    
predictions = []
for i in range(0, 10000):
    predictions.append(predictRating(data[i]['user_id'], data[i]['book_id']))
    
MSE = sum((df['rating'][0:10000]-predictions)**2)/10000
MSE

0.7123545753840687

In [14]:
#QUESTION 5 PART A
def Cosine(i1, i2):
    # Between two items
    inter = usersPerItem[i1].intersection(usersPerItem[i2])
    numer = 0
    denom1 = 0
    denom2 = 0
    for u in inter:
        numer += ratingDict[(u,i1)]*ratingDict[(u,i2)]
    for u in usersPerItem[i1]:
        denom1 += ratingDict[(u,i1)]**2
    for u in usersPerItem[i2]:
        denom2 += ratingDict[(u,i2)]**2
    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0: return 0
    return numer / denom

def predictRatingcosine(user,item):
    ratings = []
    similarities = []
    for i2 in itemsPerUser[user]:
        if i2 == item: continue
        ratings.append(ratingDict[(user, i2)]-itemAverages[i2])
        similarities.append(Cosine(item, i2))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return itemAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
        # User hasn't rated any similar items
        return ratingMean

predictions = []
for i in range(0, 10000):
    predictions.append(predictRatingcosine(data[i]['user_id'], data[i]['book_id']))
    
MSE = sum((df['rating'][0:10000]-predictions)**2)/10000
MSE

0.7107374206362199

In [31]:
#QUESTION 5 PART B PEARSON1
def predictRatingpearson1(user,item):
    ratings = []
    similarities = []
    for i2 in itemsPerUser[user]:
        if i2 == item: continue
        ratings.append(ratingDict[(user, i2)]-itemAverages[i2])
        similarities.append(Pearson(item, i2))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return itemAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
        # User hasn't rated any similar items
        return ratingMean
    
predictions = []
for i in range(0, 10000):
    predictions.append(predictRatingpearson1(data[i]['user_id'], data[i]['book_id']))
    
MSE = sum((df['rating'][0:10000]-predictions)**2)/10000
MSE

3.90067629086688e+29

In [15]:
#Q5 Part B Pearson2
def predictRatingpearson2(user,item):
    ratings = []
    similarities = []
    for i2 in itemsPerUser[user]:
        if i2 == item: continue
        ratings.append(ratingDict[(user, i2)]-itemAverages[i2])
        similarities.append(Pearson2(item, i2))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return itemAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
        # User hasn't rated any similar items
        return ratingMean


predictions = []
for i in range(0, 10000):
    predictions.append(predictRatingpearson2(data[i]['user_id'], data[i]['book_id']))

MSE = sum((df['rating'][0:10000]-predictions)**2)/10000
MSE

9602.52533754106