In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
from sklearn import model_selection, metrics, preprocessing

In [3]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Load the JSON input into a pandas data frame

In [4]:
import json

data = []
with open('/content/drive/MyDrive/CSE272/Movies_and_TV.json', 'r') as file:
    for line in file:
        data.append(json.loads(line))

df = pd.DataFrame(data)

In [5]:
df.columns

Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'style',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'image'],
      dtype='object')

Preprocessing

In [6]:
df = df.drop(['verified', 'reviewTime', 'style', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote', 'image'], axis=1)

In [7]:
df.columns

Index(['overall', 'reviewerID', 'asin'], dtype='object')

In [8]:
# remove all users with less than 5 reviews
user_counts = df.reviewerID.value_counts()
valid_users = user_counts[user_counts >= 5].index

df = df[df.reviewerID.isin(valid_users)]

In [9]:
lbl_user = preprocessing.LabelEncoder()
lbl_product = preprocessing.LabelEncoder()
df.reviewerID = lbl_user.fit_transform(df.reviewerID.values)
df.asin = lbl_product.fit_transform(df.asin.values)

In [10]:
# define train and test data frames
grouped_df = df.groupby(df.reviewerID)

train_df = grouped_df.sample(frac=0.8, random_state=1)

test_df = df.drop(train_df.index)

In [None]:
# save data frames to save time
train_df.to_csv('/content/drive/MyDrive/CSE272/train_df.csv')
test_df.to_csv('/content/drive/MyDrive/CSE272/test_df.csv')

In [11]:
# load data frames to save time
df_train = pd.read_csv('/content/drive/MyDrive/CSE272/train_df.csv')
df_valid = pd.read_csv('/content/drive/MyDrive/CSE272/test_df.csv')
df = pd.concat([df_train,df_valid])

In [12]:
import os
from torchvision.io import read_image

class MoviesTVDataset(Dataset):
    def __init__(self, users, products, ratings):
        self.users = users
        self.products = products
        self.ratings = ratings

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        users = self.users[idx]
        products = self.products[idx]
        ratings = self.ratings[idx]

        return {
            "users": torch.tensor(users, dtype=torch.long),
            "products": torch.tensor(products, dtype=torch.long),
            "ratings": torch.tensor(ratings, dtype=torch.long),
        }

In [13]:
class MatrixFactorizationModel(nn.Module):
    def __init__(self, n_users, n_products):
        super().__init__()

        self.user_embed = nn.Embedding(n_users, 32)
        self.product_embed = nn.Embedding(n_products, 32)
        self.user_embed.weight.data.uniform_(0, 0.5)
        self.product_embed.weight.data.uniform_(0, 0.5)

    def forward(self, users, products, ratings=None):
        user_embeds = self.user_embed(users)
        product_embeds = self.product_embed(products)
        output = torch.diag(torch.mm(user_embeds,torch.transpose(product_embeds,0,1)))
        return output

In [14]:
train_dataset = MoviesTVDataset(
    users=df_train.reviewerID.values,
    products=df_train.asin.values,
    ratings=df_train.overall.values
)

valid_dataset = MoviesTVDataset(
    users=df_valid.reviewerID.values,
    products=df_valid.asin.values,
    ratings=df_valid.overall.values
)


In [15]:
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=2048,
                          shuffle=True,
                          num_workers=2) 

validation_loader = DataLoader(dataset=valid_dataset,
                          batch_size=2048,
                          shuffle=True,
                          num_workers=2) 

dataiter = iter(train_loader)
dataloader_data = next(dataiter)
print(dataloader_data)

{'users': tensor([306465, 215140, 168996,  ..., 292290,  70258, 191584]), 'products': tensor([ 94399,  55707,  98930,  ...,  92474, 133537,  32331]), 'ratings': tensor([5, 2, 1,  ..., 4, 5, 4])}


In [16]:
len(df_train)

2902609

In [17]:
from sklearn import model_selection, metrics, preprocessing

In [18]:
model = MatrixFactorizationModel(
    n_users=df.reviewerID.nunique(),
    n_products=df.asin.nunique(),
).to(device)

loss_func = nn.MSELoss()

In [19]:
def train_epocs(model, epochs=1, lr=0.01, wd=0.0):
    epochs = 1
    total_loss = 0
    plot_steps, print_steps = 5000, 5000
    step_cnt = 0
    all_losses_list = [] 
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    for epoch_i in range(epochs):
         for i, train_data in enumerate(train_loader):
            output = model(train_data["users"].to(device), 
                        train_data["products"].to(device)
                        ) 
            
            rating = train_data["ratings"].view(len(output), -1).to(torch.float32)

            loss = loss_func(output, rating.to(device))
            total_loss = total_loss + loss.sum().item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            step_cnt = step_cnt + len(train_data["users"])
            

            if(step_cnt % plot_steps == 0):
                print(i)
                avg_loss = total_loss/(len(train_data["users"]) * plot_steps)
                print(f"epoch {epoch_i} loss at step: {step_cnt} is {avg_loss}")
                all_losses_list.append(avg_loss)
                total_loss = 0 # reset total_loss

In [20]:
train_epocs(model)

  return F.mse_loss(input, target, reduction=self.reduction)


624
epoch 0 loss at step: 1280000 is 0.00014159419650677593
1249
epoch 0 loss at step: 2560000 is 0.00012659903737949208


  return F.mse_loss(input, target, reduction=self.reduction)


In [21]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

model_output_list = []
target_rating_list = []

model.eval()

with torch.no_grad():
    for i, batched_data in enumerate(validation_loader): 
        model_output = model(batched_data['users'].to(device), 
                       batched_data["products"].to(device))
        
        model_output_list.append(model_output.sum().item() / len(batched_data['users']) )


        target_rating = batched_data["ratings"]
        
        target_rating_list.append(target_rating.sum().item() / len(batched_data['users']))

        # print(f"model_output: {model_output}, target_rating: {target_rating}")


# squared If True returns MSE value, if False returns RMSE value.
rms = mean_squared_error(target_rating_list, model_output_list, squared=False)
print(f"rms: {rms}")

rms: 0.11068885851373923


In [22]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(target_rating_list, model_output_list)
print(f"mae: {mae}")

mae: 0.10611124339995893


In [26]:
from collections import defaultdict

# a dict that stores a list of predicted rating and actual rating pair for each user 
user_est_true = defaultdict(list)

with torch.no_grad():
    for i, batched_data in enumerate(validation_loader): 
        users = batched_data['users']
        products = batched_data['products']
        ratings = batched_data['ratings']
        
        model_output = model(batched_data['users'].to(device), batched_data["products"].to(device))

        for i in range(len(users)):
            user_id = users[i].item()
            product_id = products[i].item() 
            pred_rating = model_output[i].item()
            true_rating = ratings[i].item()
            
            user_est_true[user_id].append((pred_rating, true_rating))            

In [38]:
with torch.no_grad():
    precisions = dict()
    recalls = dict()

    k=1
    threshold=3.5
    lens = []
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value. 
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        lens.append(len(user_ratings))

        # get the number of actual relevant item
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # get the number of recommended item that are predicted relevent and within topk
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # get the number of recommented item that' is also actually relevant within topk
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in user_ratings[:k]
        )

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

In [46]:
from collections import Counter
counts = Counter(lens)
print(len(lens))
print(counts)

311221
Counter({1: 168848, 2: 79519, 3: 26707, 4: 12405, 5: 6776, 6: 4163, 7: 2828, 8: 1898, 9: 1445, 10: 1071, 11: 854, 12: 633, 13: 489, 14: 439, 15: 340, 16: 279, 17: 241, 18: 196, 19: 186, 21: 153, 20: 148, 22: 128, 23: 114, 24: 96, 26: 78, 27: 77, 28: 70, 25: 67, 30: 58, 29: 52, 31: 52, 32: 37, 36: 35, 33: 34, 41: 32, 38: 29, 34: 29, 42: 26, 40: 25, 46: 24, 37: 22, 39: 22, 44: 21, 47: 21, 35: 18, 49: 16, 45: 14, 48: 13, 43: 13, 51: 12, 56: 11, 60: 11, 52: 11, 53: 11, 59: 10, 55: 10, 54: 9, 50: 9, 57: 8, 62: 8, 73: 8, 72: 7, 71: 7, 65: 7, 83: 7, 81: 6, 61: 6, 90: 6, 63: 5, 80: 5, 70: 5, 76: 5, 77: 5, 64: 4, 95: 4, 82: 4, 117: 4, 74: 4, 85: 4, 66: 4, 78: 4, 69: 4, 68: 4, 58: 4, 79: 4, 134: 3, 106: 3, 125: 3, 178: 3, 124: 3, 153: 3, 127: 3, 88: 3, 93: 3, 75: 3, 89: 3, 91: 2, 107: 2, 231: 2, 120: 2, 142: 2, 97: 2, 67: 2, 139: 2, 150: 2, 333: 2, 187: 2, 105: 2, 175: 2, 86: 2, 183: 2, 113: 2, 96: 2, 84: 2, 143: 2, 135: 2, 92: 2, 111: 2, 851: 1, 242: 1, 458: 1, 152: 1, 161: 1, 224: 1, 24

In [44]:
df_valid.reviewerID.nunique()

311221

In [36]:
# Precision and recall can then be averaged over all users
print(f"precision @ {k}: {sum(prec for prec in precisions.values()) / len(precisions)}")

print(f"recall @ {k} : {sum(rec for rec in recalls.values()) / len(recalls)}")

precision @ 1: 0.6478322478238936
recall @ 1 : 0.46834764927961425


Baseline recommendation list algorithm

In [34]:
all_users = df.reviewerID.unique()
user_subset = np.random.choice(all_users, 175, replace=False)
all_products = df.asin.unique()

with open("/content/drive/MyDrive/CSE272/recs.txt", 'w') as file:
    for num_users, user in enumerate(user_subset):
        dummy_ratings = [0] * len(all_products)
        users = [user] * len(all_products)
        recs_dataset = MoviesTVDataset(
            users=users,
            products=all_products,
            ratings=dummy_ratings
        )
        recs_loader = DataLoader(dataset=recs_dataset,
                          batch_size=2048,
                          shuffle=True,
                          num_workers=2) 
        top_batch_values = []
        top_batch_products= []
        for i, batched_data in enumerate(recs_loader): 
            model_output = model(batched_data["users"].to(device), 
                        batched_data["products"].to(device))
            indices = np.argsort(model_output.cpu().detach().numpy())[-10:]
            cur_values = sorted(model_output.cpu().detach().numpy())[-10:]
            cur_products = [all_products[i] for i in indices]
            top_batch_products.extend(cur_products)
            top_batch_values.extend(cur_values)
        top_batch_indices = np.argsort(top_batch_values)[-10:]
        top_products = [top_batch_products[i] for i in top_batch_indices]
        original_user = lbl_user.inverse_transform([user])
        original_products = lbl_product.inverse_transform(top_products)
        line = f'{original_user[0]} {original_products[0]} {original_products[1]} {original_products[2]} {original_products[3]} {original_products[4]} {original_products[5]} {original_products[6]} {original_products[7]} {original_products[8]} {original_products[9]}\n'
        file.write(line)
file.close()

KeyboardInterrupt: ignored