In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from scipy.spatial import distance
from scipy.sparse import csr_matrix
from collections import defaultdict
import torch
import torch.nn as nn
import math
import pickle
np.random.seed(0)

# **LEAVE-ONE-OUT**

In [3]:
training_data = pd.read_csv('/content/drive/MyDrive/Priyanka/train_book_reviews_leave_one_out.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Priyanka/test_book_reviews_leave_one_out.csv')

training_data['rating'] = training_data['rating'] / 5.0
test_data['rating'] = test_data['rating'] / 5.0

In [4]:
training_data

Unnamed: 0,item,user,rating,timestamp
0,1101986964,A12NFV3VBMDV4A,1.0,1472169600
1,1537245856,A13KSU7ZWRL04J,1.0,1472169600
2,194642000X,A32SJS0TTSRIM5,1.0,1472169600
3,194642000X,A3KLPA5D8SX7VR,1.0,1472169600
4,194642000X,A39L46WM6GBO4Q,0.8,1472169600
...,...,...,...,...
1056899,0440000785,A2P3LKAINU8JUE,0.8,1536796800
1056900,1720013306,A28GRFK2F3TMWE,0.8,1536883200
1056901,0525483705,A2QHX2UQPXXEN6,0.6,1536969600
1056902,1732624704,A1MXW3BGAZW44D,1.0,1537056000


In [5]:
test_data

Unnamed: 0,item,user,rating,timestamp
0,1494567296,A2NMKI0ZPIE3VU,1.0,1474848000
1,1941480020,A32382F71YPPQ7,1.0,1475020800
2,B001MQA3DU,A161OJ95SYSN7A,1.0,1475020800
3,1523744812,A13VRP9X8VWM06,0.6,1475193600
4,0399176772,A2J057MQRHLWS4,0.4,1475452800
...,...,...,...,...
19401,B01FKT9TW0,A3O0898T7NA7GB,1.0,1537747200
19402,B01ENU0V5Q,AMI929LFF76I6,1.0,1537747200
19403,B01GLVJ0GQ,A20JZM5K4E54LO,1.0,1537920000
19404,B01HI9W5HQ,A1MCN1E5GNFNXJ,0.2,1538006400


In [6]:
with open("/content/drive/MyDrive/Priyanka/matrix_factorization_user_emb.pkl", "rb") as f:
  user_emb = pickle.load(f)

with open("/content/drive/MyDrive/Priyanka/matrix_factorization_item_emb.pkl", "rb") as f:
  item_emb = pickle.load(f)

In [7]:
user_item_ratings = {}
for row in training_data.itertuples():
    user_id, item_id, rating = row.user, row.item, row.rating
    if user_id not in user_item_ratings:
      user_item_ratings[user_id] = {}
    user_item_ratings[user_id][item_id] = rating

In [8]:
item_index_to_item = {item_index: item_id for item_index, item_id in enumerate(item_emb.keys())}

In [9]:
class MLP(nn.Module):
    def __init__(self):
        torch.manual_seed(0)
        np.random.seed(0)
        super(MLP, self).__init__()

        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)
        self.relu = nn.ReLU()

    def forward(self, input_emb):
        out1 = self.relu(self.fc1(input_emb))
        out2 = self.relu(self.fc2(out1))
        output = self.relu(self.output(out2))

        return output

In [10]:
mlpmodel = MLP()
print(mlpmodel)

MLP(
  (fc1): Linear(in_features=128, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (output): Linear(in_features=32, out_features=1, bias=True)
  (relu): ReLU()
)


In [11]:
def MLP_train(training_data, user_emb, item_emb, mlpmodel, epochs=80):
  torch.manual_seed(0)
  np.random.seed(0)

  input_emb = []
  for row in training_data.itertuples():
        user_emb_u = user_emb[row.user]
        item_emb_i = item_emb[row.item]
        input_emb.append(np.concatenate((user_emb_u, item_emb_i)))

  input_emb = torch.FloatTensor(np.array(input_emb))
  criterion = nn.MSELoss()
  optimizer = torch.optim.Adam(mlpmodel.parameters())

  for epoch in range(epochs):
    optimizer.zero_grad()
    output_flattened = mlpmodel.forward(input_emb).flatten()
    ground_truth = torch.FloatTensor([float(row.rating) for row in training_data.itertuples()])
    loss = criterion(output_flattened, ground_truth)
    loss.backward()
    optimizer.step()

    differences = np.array((output_flattened - ground_truth).detach())

    if epoch % 10 == 0:
      rmse = np.sqrt(np.mean(differences ** 2))
      print(f"Epoch {epoch} RMSE: {rmse:.6f}")

  return mlpmodel

In [12]:
epochs = 100
trained_MLP_model = MLP_train(training_data, user_emb, item_emb, mlpmodel, epochs)

Epoch 0 RMSE: 0.839609
Epoch 10 RMSE: 0.640015
Epoch 20 RMSE: 0.369590
Epoch 30 RMSE: 0.146076
Epoch 40 RMSE: 0.178078
Epoch 50 RMSE: 0.139446
Epoch 60 RMSE: 0.138644
Epoch 70 RMSE: 0.133970
Epoch 80 RMSE: 0.132674
Epoch 90 RMSE: 0.132646


In [13]:
def test_RMSE_of_MLP(test_data, user_emb, item_emb, trained_MLP_model):
  new_item_emb = np.mean(np.asarray(list(item_emb.values())), axis=0)

  test_input = []

  for row in test_data.itertuples():
    user_emb_of_u = user_emb[row.user]
    if row.item not in item_emb:
      item_emb_of_i = new_item_emb
    else:
      item_emb_of_i = item_emb[row.item]
    test_input.append(np.concatenate((user_emb_of_u, item_emb_of_i)))

  test_input = torch.FloatTensor(test_input)

  pred_result_flattened = trained_MLP_model.forward(test_input).flatten()
  ground_truth = torch.FloatTensor(np.array([float(row.rating) for row in test_data.itertuples()]))
  differences = np.array((pred_result_flattened - ground_truth).detach())
  test_RMSE = np.sqrt(np.mean(differences ** 2))

  return test_RMSE

In [14]:
def get_sorted_k(user_id, item_emb, trained_MLP_model, k):
  pred_ratings_for_user = []
  for item in item_emb:
    test_input = torch.FloatTensor(np.array(np.concatenate((user_emb[user_id], item_emb[item]))))
    pred_ratings_for_user.append(float(trained_MLP_model.forward(test_input).flatten().detach()))

  top_k_rec_items = []
  pred_items_user_sorted = np.argsort(-np.array(pred_ratings_for_user))
  k_curr = 0
  for item_index in pred_items_user_sorted:
    item = item_index_to_item[item_index]
    if k_curr == k:
      break
    if item in user_item_ratings[user_id]:
      continue
    else:
      top_k_rec_items.append(item)
      k_curr += 1
  top_k_rec_items = np.array(top_k_rec_items)
  return top_k_rec_items

def get_hit_at_k(test_data, user_emb, item_emb, trained_MLP_model, k=10):
  hits = 0
  total = len(test_data)

  for row in test_data.itertuples():
    user_id, item_id, actual_rating = row.user, row.item, row.rating
    if item_id not in item_emb:
      if actual_rating <= 0.6:
        hits += 1
    else:
      top_k_rec_items = (user_id, item_emb, trained_MLP_model, k)
      if ((actual_rating > 0.6) and (item_id in top_k_rec_items)) or ((actual_rating <= 0.6) and (item_id not in top_k_rec_items)):
        hits += 1

  return (hits / total)

In [15]:
test_rmse_mlp = test_RMSE_of_MLP(test_data, user_emb, item_emb, trained_MLP_model)
test_hit_at_k = get_hit_at_k(test_data, user_emb, item_emb, trained_MLP_model, 50)

  test_input = torch.FloatTensor(test_input)


In [16]:
print("RMSE: ", test_rmse_mlp)
print("HIT @ 50: ", test_hit_at_k)

RMSE:  0.14362729
HIT @ 50:  0.09703184582088015
