In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split

## Make recommendations

In [2]:
def Jaccard(s1, s2):
    """Return the Jaccard similarity of two sets: |s1 & s2| / |s1 | s2|.

    Parameters
    ----------
    s1, s2 : set
        The two sets to compare.

    Returns
    -------
    float
        A value between 0 and 1. When both sets are empty the union is
        empty too; 0.0 is returned in that case instead of raising
        ZeroDivisionError.
    """
    denom = len(s1.union(s2))
    if denom == 0:
        # Both sets are empty: there is no overlap to measure.
        return 0.0
    numer = len(s1.intersection(s2))
    return numer / denom

def mostSimilarFast(i, N=10):
    """Return the items most similar to item ``i``.

    Similarity is the Jaccard similarity between the user sets of two
    items. Only items that share at least one user with ``i`` are scored,
    because every other item would have similarity 0.

    Reads the module-level lookups ``usersPerItem`` and ``itemsPerUser``
    and calls the module-level ``Jaccard`` function.

    Parameters
    ----------
    i : hashable
        Key of the query item in ``usersPerItem``.
    N : int, optional
        Maximum number of results to return (non-negative). Defaults to 10.

    Returns
    -------
    list of (float, item) tuples
        Sorted in descending tuple order (similarity first, then item key),
        truncated to at most ``N`` entries. Empty when ``i`` is unknown.
    """
    # .get() instead of indexing: indexing a defaultdict with an unknown key
    # would insert an empty entry into the shared lookup as a side effect.
    users = usersPerItem.get(i, set())

    # Collect every item touched by any user of item i, growing one set in
    # place rather than rebuilding it with union() on each iteration.
    candidateItems = set()
    for u in users:
        candidateItems.update(itemsPerUser.get(u, ()))
    candidateItems.discard(i)  # never recommend the query item itself

    similarities = [
        (Jaccard(users, usersPerItem.get(j, set())), j)
        for j in candidateItems
    ]
    similarities.sort(reverse=True)
    return similarities[:N]

In [3]:
# Path of the ratings CSV (this is a file path, despite the variable name).
data_dir = 'clean2.csv'
# Read the file and discard the 'Unnamed: 0' column in one chained step
# (presumably a row index written out when the CSV was saved -- TODO confirm).
data = pd.read_csv(data_dir).drop(columns='Unnamed: 0')
data.head()

Unnamed: 0,ProductId,UserId,Score
0,B001GVISJM,A18ECVX2RJ7HUE,4
1,B001GVISJM,A2MUGFV2TDQ47K,5
2,B001GVISJM,A2A9X58G2GTBLP,5
3,B001EO5QW8,A2G7B7FKP2O2PU,5
4,B001EO5QW8,AQLL2R1PPR46X,5


In [4]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=1,
)

In [5]:
# Index the training rows in both directions for fast set lookups:
#   usersPerItem[product] -> set of users with a training row for that product
#   itemsPerUser[user]    -> set of products with a training row for that user
usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)
# Select the columns by name rather than by tuple position, so the loop keeps
# working if the column order of the CSV ever changes.
for prod, user in zip(train_data['ProductId'], train_data['UserId']):
    usersPerItem[prod].add(user)
    itemsPerUser[user].add(prod)

## Collaborative-filtering-based rating estimation

In [13]:
# Mean training score: used as the constant baseline prediction and as the
# fallback inside predictRating. A `global` statement has no effect at module
# level, so the name is simply assigned here.
# Note: Series.mean() skips NaN values by default. float() keeps the result a
# plain Python float.
ratingMean = float(train_data['Score'].mean())

In [18]:
def predictRating(prod, user):
    """Estimate the score ``user`` would give ``prod``.

    The estimate is the similarity-weighted average of the scores the user
    gave to other products, where the weight of each product is its
    Jaccard similarity (over user sets) to ``prod``.

    Reads the module-level ``reviewsPerUser``, ``usersPerItem`` and
    ``ratingMean`` and calls the module-level ``Jaccard`` function.

    Parameters
    ----------
    prod : hashable
        Key of the product to predict for.
    user : hashable
        Key of the user to predict for.

    Returns
    -------
    float
        The weighted average, or ``ratingMean`` when the user has no
        reviewed product with positive similarity to ``prod``.
    """
    # .get() instead of indexing: the lookups are defaultdicts, and indexing
    # an unseen user or product would insert an empty entry into them on
    # every prediction.
    targetUsers = usersPerItem.get(prod, set())

    scores = []
    similarities = []
    for cur_prod, cur_score in reviewsPerUser.get(user, ()):
        if cur_prod == prod:
            # Skip any existing review of the target product itself.
            continue
        scores.append(cur_score)
        similarities.append(Jaccard(targetUsers, usersPerItem.get(cur_prod, set())))

    totalSimilarity = sum(similarities)
    if totalSimilarity > 0:
        weightedScores = [(x * y) for x, y in zip(scores, similarities)]
        return sum(weightedScores) / totalSimilarity

    # User hasn't rated any similar items
    return ratingMean
    
def MSE(predictions, labels):
    """Return the mean squared error between predictions and labels.

    Parameters
    ----------
    predictions, labels : iterable of numbers
        Paired element by element; both must have the same length.

    Returns
    -------
    number
        The average of ``(prediction - label) ** 2`` over all pairs.

    Raises
    ------
    ValueError
        If the two inputs differ in length. A bare ``zip`` would silently
        drop the unmatched tail and report an error over fewer pairs.
    ZeroDivisionError
        If both inputs are empty.
    """
    # Materialise first so generators are accepted and lengths can be compared.
    predictions = list(predictions)
    labels = list(labels)
    if len(predictions) != len(labels):
        raise ValueError(
            'predictions and labels must have the same length '
            '(got %d and %d)' % (len(predictions), len(labels))
        )
    differences = [(x - y) ** 2 for x, y in zip(predictions, labels)]
    return sum(differences) / len(differences)

In [15]:
# reviewsPerUser[user] -> list of (product, score) pairs from the training
# split; predictRating iterates over these.
reviewsPerUser = defaultdict(list)
# Kept declared but intentionally left empty: none of the visible cells read it.
reviewsPerItem = defaultdict(list)
# Select the columns by name rather than by tuple position, so the loop keeps
# working if the column order of the CSV ever changes.
for prod, user, score in zip(train_data['ProductId'],
                             train_data['UserId'],
                             train_data['Score']):
    reviewsPerUser[user].append((prod, score))
In [23]:
# Baseline: predict the mean training score for every held-out row.
alwaysPredictMean = [ratingMean] * len(test_data)
# Walk the two columns in lockstep. Looking each value up by index label
# (test_data[col][i]) costs a Series lookup per row and returns a Series
# instead of a scalar if an index label is ever duplicated.
cfPredictions = [
    predictRating(prod, user)
    for prod, user in zip(test_data['ProductId'], test_data['UserId'])
]
# True scores of the held-out rows, in the same row order as the predictions.
labels = test_data['Score'].tolist()

In [24]:
# Score both prediction lists against the same held-out labels:
# the constant-mean baseline first, then the collaborative-filtering estimates.
err_baseline, err_CF = (
    MSE(preds, labels) for preds in (alwaysPredictMean, cfPredictions)
)
print('For baseline =', err_baseline)
print('The MSE of rating estimation is', err_CF)

For baseline = 1.3688432313621304
The MSE of rating estimation is 0.7419863808963807


In [26]:
# Small worked example: multiply a 4x2 matrix by a 2x4 matrix, giving a 4x4 result.
A = np.array([
    [5, 3],
    [3, 4],
    [4, 5],
    [5, 2],
])
B = np.array([
    [1, 0.4, 0.5, 0.2],
    [0, 0.6, 0.5, 0.8],
])
np.matmul(A, B)

array([[5. , 3.8, 4. , 3.4],
       [3. , 3.6, 3.5, 3.8],
       [4. , 4.6, 4.5, 4.8],
       [5. , 3.2, 3.5, 2.6]])