In [1]:
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [15]:
def Jaccard(s1, s2):
    '''
    Compute the Jaccard similarity: size of the intersection over size of the union.
    :param s1: set 1                (set)
    :param s2: set 2                (set)
    :return: Jaccard coefficient in [0, 1]; 0.0 when both sets are empty    (float)
    '''
    denom = len(s1.union(s2))
    # Two empty sets have an empty union: report "no similarity" instead of dividing by zero.
    if denom == 0:
        return 0.0
    numer = len(s1.intersection(s2))
    return numer / denom


def Cosine(x, y):
    '''
    Compute cosine similarity between two rating vectors after mapping them to signs.
    Entries equal to 1 or 2 become -1, entries >= 3 become +1, and every other
    value (e.g. 0) is kept as is. The inputs themselves are not modified.
    :param x: vector 1              (1-D array-like of ratings)
    :param y: vector 2              (1-D array-like of ratings, same length as x)
    :return: Cosine coefficient; 0.0 when either mapped vector has zero norm    (float)
    '''
    def _polarize(v):
        # Work on a float copy so the caller's array is left untouched.
        v = np.array(v, dtype=float)
        # Both masks are taken from the unmapped values, so a freshly written
        # -1 / +1 can never be re-matched by a later rule.
        low = (v == 1) | (v == 2)
        high = v >= 3
        v[low] = -1
        v[high] = 1
        return v

    px, py = _polarize(x), _polarize(y)
    denom = np.linalg.norm(px) * np.linalg.norm(py)
    # A zero-norm vector would otherwise yield 0/0 == NaN.
    if denom == 0:
        return 0.0
    return (px @ py.T) / denom


def Pearson(x, y):
    '''
    Compute a Pearson-style correlation between two rating vectors.
    Each vector is centred by subtracting the mean of its non-zero entries from
    those entries only (zeros stay zero); the result is the cosine of the two
    centred vectors. The inputs themselves are not modified.
    :param x: vector 1              (1-D array-like of ratings)
    :param y: vector 2              (1-D array-like of ratings, same length as x)
    :return: Pearson coeff.; 0.0 when either centred vector has zero norm    (float)
    '''
    def _center(v):
        # Float copy: protects the caller's data and lets integer input be centred.
        v = np.array(v, dtype=float)
        nonzero = v != 0
        # Skip the mean of an empty selection (it would be NaN and emit a warning).
        if nonzero.any():
            v[nonzero] -= v[nonzero].mean()
        return v

    cx, cy = _center(x), _center(y)
    denom = np.linalg.norm(cx) * np.linalg.norm(cy)
    if denom == 0:
        return 0.0
    return (cx @ cy.T) / denom

In [3]:
def load_data(file):
    '''
    Load the data and split it into a training and a test set.
    :param file: path of a csv file.
    :return: (train_data, test_data) -- 80% / 20% of the rows, split with a
             fixed random_state so repeated calls give the same partition.
    '''
    # Accept ".csv" in any letter case.
    assert os.path.splitext(file)[-1].lower() == ".csv"

    data = pd.read_csv(file)
    # 'Unnamed: 0' is the name pandas assigns to a header-less first column
    # (presumably a saved index here). Drop it when present, but do not fail
    # on files that were written without it.
    data = data.drop(columns=['Unnamed: 0'], errors='ignore')

    train_data, test_data = train_test_split(data, test_size=0.2, random_state=1)

    return train_data, test_data

In [28]:
def _pairSimilarity(simi, target, other, memberSets, matrix):
    '''
    Similarity between two entities (two items or two users) under the chosen rule.
    :param simi:        "Jaccard", "Cosine" or "Pearson"
    :param target:      id of the entity the prediction is made for
    :param other:       id of the neighbouring entity
    :param memberSets:  dict id -> set, used by the Jaccard rule
    :param matrix:      table whose column `id` is that entity's rating vector,
                        used by the Cosine / Pearson rules
    :return: the similarity, or 0 when it cannot be computed or is not finite
    '''
    try:
        if simi == 'Jaccard':
            # .get() keeps a defaultdict from growing an entry for an unseen id.
            value = Jaccard(memberSets.get(target, set()), memberSets.get(other, set()))
        else:
            # np.array() copies the column, so the matrix stays intact even if
            # the similarity function modifies its arguments.
            vec_a = np.array(matrix[target])
            vec_b = np.array(matrix[other])
            value = Cosine(vec_a, vec_b) if simi == 'Cosine' else Pearson(vec_a, vec_b)
        # A NaN / inf similarity would poison the whole weighted average.
        return value if np.isfinite(value) else 0
    except Exception:
        # Best effort: an id missing from the training data (KeyError) or a
        # degenerate vector contributes no weight. KeyboardInterrupt and
        # SystemExit are deliberately not caught.
        return 0


def predictRating(prod, user,
                  globalMean, meanRatings,
                  usersPerItem, itemsPerUser,
                  reviewsPerItem, reviewsPerUser,
                  feature_matrix, feature_matrix_T,
                  base='item', simi='Cosine'):
    '''
    Predict the rating of a user to a product with neighbourhood collaborative filtering.
    :param prod:                 product id (str)
    :param user:                 user id (str)
    :param globalMean:           global mean rating in the training set (float)
    :param meanRatings:          mean ratings for users (dict)
    :param usersPerItem:         sets of all users of an item (dict)
    :param itemsPerUser:         sets of all items of a user (dict)
    :param reviewsPerItem:       item id -> list of (user id, score) (dict)
    :param reviewsPerUser:       user id -> list of (item id, score) (dict)
    :param feature_matrix:       rows == userID, columns == productID
    :param feature_matrix_T:     rows == productID, columns == userID
    :param base:                 type of collaborative filter ("item" or "user")
    :param simi:                 the similarity rule applied ("Jaccard" or "Cosine" or "Pearson")
    :return: predicted rating; globalMean when no neighbour carries any weight.
             The value is not clamped to the rating scale.
    '''

    assert base == 'item' or base == 'user'
    assert simi == 'Jaccard' or simi == 'Cosine' or simi == 'Pearson'

    if base == 'item':
        # Neighbours: the other items this user has rated.
        neighbours = [(p, s) for p, s in reviewsPerUser.get(user, []) if p != prod]
        scores = [s for _, s in neighbours]
        similarities = [_pairSimilarity(simi, prod, p, usersPerItem, feature_matrix)
                        for p, _ in neighbours]

        # Guard on the actual divisor: signed similarities can cancel in a plain
        # sum while their absolute values still carry weight.
        denom = np.sum(np.abs(similarities))
        if denom == 0:
            return globalMean

        weightedScores = [(x * y) for x, y in zip(scores, similarities)]
        return sum(weightedScores) / denom

    # base == 'user'. Neighbours: the other users who rated this item.
    neighbours = [(u, s) for u, s in reviewsPerItem.get(prod, []) if u != user]
    users = [u for u, _ in neighbours]
    scores = [s for _, s in neighbours]
    similarities = [_pairSimilarity(simi, user, u, itemsPerUser, feature_matrix_T)
                    for u in users]

    denom = np.sum(np.abs(similarities))
    if denom == 0:
        return globalMean

    if user not in meanRatings:
        # No personal baseline available: plain similarity-weighted average.
        weightedScores = [(x * y) for x, y in zip(scores, similarities)]
        return sum(weightedScores) / denom

    # Mean-centred form: the user's own mean plus the weighted deviation of each
    # neighbour from that neighbour's mean. Neighbours without a known mean add 0.
    weightedScores = [((x - meanRatings[u]) * y) if u in meanRatings else 0
                      for x, y, u in zip(scores, similarities, users)]
    return meanRatings[user] + sum(weightedScores) / denom

In [5]:
data_dir = 'clean2.csv'
train_data, test_data = load_data(data_dir)
labels = list(test_data['Score'])

# Rating table built from the training rows: rows == UserId, columns == ProductId.
# Missing (user, product) pairs are filled with 0.
feature_matrix = pd.pivot_table(
    train_data, values='Score', index=['UserId'], columns=['ProductId']
).fillna(0)
feature_matrix_T = feature_matrix.T

usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)
# Read the three fields by column name so the lookup tables do not depend on
# the column order of the csv file.
for prod, user, score in zip(train_data['ProductId'], train_data['UserId'], train_data['Score']):
    usersPerItem[prod].add(user)
    itemsPerUser[user].add(prod)
    reviewsPerUser[user].append((prod, score))
    reviewsPerItem[prod].append((user, score))

globalMean = float(train_data['Score'].mean())

# Mean rating per user: average each ROW of feature_matrix (rows are users),
# ignoring the 0 cells. Iterating feature_matrix.columns instead would key the
# dict by ProductId, which the user-based predictor can never look up.
meanRatings = feature_matrix.where(feature_matrix != 0).mean(axis=1).to_dict()

In [35]:
# Baseline: predict the training-set global mean for every test row.
alwaysPredictMean = [globalMean] * len(test_data)

# Collaborative-filtering prediction for every (product, user) pair of the test set.
# Iterating the two columns directly avoids a label lookup per row and stays
# correct even if the index labels are not unique.
cfPredictions = [
    predictRating(prod_id, user_id,
                  globalMean, meanRatings,
                  usersPerItem, itemsPerUser,
                  reviewsPerItem, reviewsPerUser,
                  feature_matrix, feature_matrix_T,
                  base='item', simi='Cosine')
    for prod_id, user_id in tqdm(zip(test_data['ProductId'], test_data['UserId']),
                                 total=len(test_data))
]

100%|███████████████████████████████████████████████████████████████████████████| 13603/13603 [01:07<00:00, 201.79it/s]


In [37]:
def MSE(predictions, labels):
    '''
    Mean squared error between predictions and ground-truth labels.
    :param predictions: predicted values (iterable of numbers)
    :param labels:      true values (iterable of numbers, same length)
    :return: mean of the squared differences
    :raises ValueError: if the inputs differ in length or are empty
    '''
    predictions = list(predictions)
    labels = list(labels)
    # zip() would silently drop the unmatched tail and report a misleading score.
    if len(predictions) != len(labels):
        raise ValueError('predictions and labels differ in length: %d vs %d'
                         % (len(predictions), len(labels)))
    if not predictions:
        raise ValueError('MSE is undefined for empty input')
    differences = [(x - y) ** 2 for x, y in zip(predictions, labels)]
    return sum(differences) / len(differences)

# Clamp the predictions into [1, 5] (presumably the rating scale) before scoring.
cfPredictions = np.clip(np.array(cfPredictions), 1, 5)

# Score the constant global-mean baseline and the collaborative filter on the same labels.
err_baseline, err_CF = (MSE(preds, labels) for preds in (alwaysPredictMean, cfPredictions))
print('For baseline =', err_baseline)
print('The MSE of rating estimation is', err_CF)

For baseline = 1.3688432313621304
The MSE of rating estimation is 0.9545808842338184
