In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split

## Make recommendation

In [2]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer / denom

def mostSimilarFast(i):
    similarities = []
    users = usersPerItem[i]
    candidateItems = set()
    for u in users:
        candidateItems = candidateItems.union(itemsPerUser[u])
    for j in candidateItems:
        if j == i: continue
        sim = Jaccard(users, usersPerItem[j])
        similarities.append((sim, j))
    similarities.sort(reverse=True)
    return similarities[:10]

In [3]:
data_dir = 'clean2.csv'
data = pd.read_csv(data_dir)
data.pop('Unnamed: 0') 
data.head()

Unnamed: 0,ProductId,UserId,Score
0,B001GVISJM,A18ECVX2RJ7HUE,4
1,B001GVISJM,A2MUGFV2TDQ47K,5
2,B001GVISJM,A2A9X58G2GTBLP,5
3,B001EO5QW8,A2G7B7FKP2O2PU,5
4,B001EO5QW8,AQLL2R1PPR46X,5


In [4]:
train_data, test_data = train_test_split(data, test_size = 0.2, random_state=1)

In [5]:
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)
users = set()
items = set()
for row in train_data.itertuples():
    prod, user = row[1], row[2]
    usersPerItem[prod].add(user)
    itemsPerUser[user].add(prod)
    users.add(user)
    items.add(prod)

## Collaborative-filtering-based rating estimation

In [6]:
global ratingMean
ratingMean = sum(train_data['Score'])/len(train_data)

In [7]:
def predictRating(prod, user):
    scores = []
    similarities = []
    for cur_prod, cur_score in reviewsPerUser[user]:
        if cur_prod == prod: continue
        scores.append(cur_score)
        similarities.append(Jaccard(usersPerItem[prod], usersPerItem[cur_prod]))
    if (sum(similarities) > 0):
        weightedScores = [(x*y) for x,y in zip(scores, similarities)]
        return sum(weightedScores) / sum(similarities)
    else:
        # User hasn't rated any similar items
        return ratingMean
    
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [8]:
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)
for row in train_data.itertuples():
    prod, user, score = row[1], row[2], row[3]
    reviewsPerUser[user].append((prod, score))
    #reviewsPerItem[prod].append(row[3])

In [9]:
alwaysPredictMean = [ratingMean for _ in range(len(test_data))]
cfPredictions = [predictRating(test_data['ProductId'][i], test_data['UserId'][i]) for i in test_data.index]
labels = list(test_data['Score'])

In [10]:
err_baseline = MSE(alwaysPredictMean, labels)
err_CF = MSE(cfPredictions, labels)
print('For baseline =', err_baseline)
print('The MSE of rating estimation is', err_CF)

For baseline = 1.3688432313621304
The MSE of rating estimation is 0.7419863808963807


Make Recommendation

In [11]:
recommendPerUser = defaultdict(list)
i = 0
for u in users:
    for p in items:
        pred_rating = predictRating(p, u)
        if pred_rating >= 3:
            recommendPerUser[u].append(p)
    print("user" + str(i) + "complete.")
    i+=1


user767complete.
user768complete.
user769complete.
user770complete.
user771complete.
user772complete.
user773complete.
user774complete.
user775complete.
user776complete.
user777complete.
user778complete.
user779complete.
user780complete.
user781complete.
user782complete.
user783complete.
user784complete.
user785complete.
user786complete.
user787complete.
user788complete.
user789complete.
user790complete.
user791complete.
user792complete.
user793complete.
user794complete.
user795complete.
user796complete.
user797complete.
user798complete.
user799complete.
user800complete.
user801complete.
user802complete.
user803complete.
user804complete.
user805complete.
user806complete.
user807complete.
user808complete.
user809complete.
user810complete.
user811complete.
user812complete.
user813complete.
user814complete.
user815complete.
user816complete.
user817complete.
user818complete.
user819complete.
user820complete.
user821complete.
user822complete.
user823complete.
user824complete.
user825comple

In [12]:
with open("recommendations.csv", 'w') as f:
    f.write("UserId,recommendations\n")
    for key in itemsPerUser:
        f.write("%s,%s\n"%(key,itemsPerUser[key]))
        
