In [1]:
# Mount Google Drive so the CSV inputs and pickled embeddings under
# /content/drive/MyDrive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from scipy.spatial import distance
from scipy.sparse import csr_matrix
from collections import defaultdict
import pickle
# Seed NumPy's global RNG so the np.random.uniform factor initialisation
# in matrix_factorization is repeatable across runs.
np.random.seed(0)

# **LEAVE-ONE-OUT**

In [3]:
# Load the leave-one-out train/test splits.
# The id columns are pinned to str: many item ids are ISBN-like
# (e.g. "0440000785"), and letting pandas infer the dtype can parse
# all-numeric chunks as integers, which drops leading zeros and makes the
# same id compare unequal between the train and test frames.
ID_DTYPES = {'item': str, 'user': str}
train_lou_df = pd.read_csv('/content/drive/MyDrive/Priyanka/train_book_reviews_leave_one_out.csv', dtype=ID_DTYPES)
test_lou_df = pd.read_csv('/content/drive/MyDrive/Priyanka/test_book_reviews_leave_one_out.csv', dtype=ID_DTYPES)

In [4]:
# Show both freshly loaded splits; a single print with a newline separator
# emits the same text as five consecutive print calls.
print("TRAIN LOADED", train_lou_df, "\n\n", "TEST LOADED", test_lou_df, sep="\n")

TRAIN LOADED
               item            user  rating   timestamp
0        1101986964  A12NFV3VBMDV4A     5.0  1472169600
1        1537245856  A13KSU7ZWRL04J     5.0  1472169600
2        194642000X  A32SJS0TTSRIM5     5.0  1472169600
3        194642000X  A3KLPA5D8SX7VR     5.0  1472169600
4        194642000X  A39L46WM6GBO4Q     4.0  1472169600
...             ...             ...     ...         ...
1056899  0440000785  A2P3LKAINU8JUE     4.0  1536796800
1056900  1720013306  A28GRFK2F3TMWE     4.0  1536883200
1056901  0525483705  A2QHX2UQPXXEN6     3.0  1536969600
1056902  1732624704  A1MXW3BGAZW44D     5.0  1537056000
1056903  1732624704  A1CXTYWIRDPOED     5.0  1537315200

[1056904 rows x 4 columns]



TEST LOADED
             item            user  rating   timestamp
0      1494567296  A2NMKI0ZPIE3VU     5.0  1474848000
1      1941480020  A32382F71YPPQ7     5.0  1475020800
2      B001MQA3DU  A161OJ95SYSN7A     5.0  1475020800
3      1523744812  A13VRP9X8VWM06     3.0  1475193600
4 

In [5]:
# Rescale ratings by dividing by 5 (assumes a 5-star maximum - TODO confirm).
# The division is guarded by a per-frame flag stored in DataFrame.attrs so
# that re-running this cell does not divide already-normalised ratings again.
# Reloading the CSVs creates fresh frames without the flag, so they are
# normalised exactly once as well.
MAX_RATING = 5.0
for _frame in (train_lou_df, test_lou_df):
    if not _frame.attrs.get('rating_normalized', False):
        _frame['rating'] = _frame['rating'] / MAX_RATING
        _frame.attrs['rating_normalized'] = True

In [6]:
# Show both splits again now that the rating column has been rescaled.
print("TRAIN after normalizing ratings", train_lou_df, "\n\n", "TEST after normalizing ratings", test_lou_df, sep="\n")

TRAIN after normalizing ratings
               item            user  rating   timestamp
0        1101986964  A12NFV3VBMDV4A     1.0  1472169600
1        1537245856  A13KSU7ZWRL04J     1.0  1472169600
2        194642000X  A32SJS0TTSRIM5     1.0  1472169600
3        194642000X  A3KLPA5D8SX7VR     1.0  1472169600
4        194642000X  A39L46WM6GBO4Q     0.8  1472169600
...             ...             ...     ...         ...
1056899  0440000785  A2P3LKAINU8JUE     0.8  1536796800
1056900  1720013306  A28GRFK2F3TMWE     0.8  1536883200
1056901  0525483705  A2QHX2UQPXXEN6     0.6  1536969600
1056902  1732624704  A1MXW3BGAZW44D     1.0  1537056000
1056903  1732624704  A1CXTYWIRDPOED     1.0  1537315200

[1056904 rows x 4 columns]



TEST after normalizing ratings
             item            user  rating   timestamp
0      1494567296  A2NMKI0ZPIE3VU     1.0  1474848000
1      1941480020  A32382F71YPPQ7     1.0  1475020800
2      B001MQA3DU  A161OJ95SYSN7A     1.0  1475020800
3      1523744812 

In [7]:
### Index the training ratings both ways:
###   user_item_ratings[user][item] -> rating
###   item_user_ratings[item][user] -> rating
user_item_ratings = {}
item_user_ratings = {}

for row in train_lou_df.itertuples():
    user_id, item_id, rating = row.user, row.item, row.rating

    # setdefault creates the inner dict the first time a user/item is seen
    user_item_ratings.setdefault(user_id, {})[item_id] = rating
    item_user_ratings.setdefault(item_id, {})[user_id] = rating

In [8]:
# Position -> item id lookup, following the insertion order of item_user_ratings.
item_index_to_item = dict(enumerate(item_user_ratings))

In [None]:
def get_rmse_error(sample_datafrme, U, V):
  """Return the RMSE of the factor model over a frame of ratings.

  Each row's predicted rating is the dot product of the user's and the
  item's factor vectors.

  Args:
    sample_datafrme: DataFrame exposing `user`, `item` and `rating` columns.
    U: mapping from user id to that user's factor vector.
    V: mapping from item id to that item's factor vector.

  Returns:
    The square root of the mean squared difference between the actual and
    the predicted ratings.
  """
  n_rows = len(sample_datafrme)
  squared_error_total = 0.0

  # Accumulate row by row, in frame order.
  for record in sample_datafrme.itertuples():
    residual = record.rating - np.dot(U[record.user], V[record.item])
    squared_error_total += residual ** 2

  return np.sqrt(squared_error_total / n_rows)

In [None]:
def gradient_descent_update(U, V, K, mu=0.001, lambda_value=0.001,
                            user_ratings=None, item_ratings=None):
  """Run one full-batch gradient step over all user and item factors.

  User vectors are updated first; the item pass then uses the already
  updated user vectors. Factor arrays are modified in place.

  Args:
    U: mapping from user id to factor vector (array of length K).
    V: mapping from item id to factor vector (array of length K).
    K: number of latent factors.
    mu: learning rate applied to the accumulated error term.
    lambda_value: regularisation coefficient.
    user_ratings: user -> {item: rating}; defaults to the notebook-level
      `user_item_ratings`.
    item_ratings: item -> {user: rating}; defaults to the notebook-level
      `item_user_ratings`.

  Returns:
    The same (U, V) mappings that were passed in, after the update.
  """
  # Fall back to the notebook globals so existing calls keep working.
  if user_ratings is None:
    user_ratings = user_item_ratings
  if item_ratings is None:
    item_ratings = item_user_ratings

  for user in U.keys():
    updates = np.zeros(K)
    for item, rating in user_ratings[user].items():
      error = rating - np.inner(U[user], V[item])
      updates += error * V[item]
    # NOTE(review): the regularisation term is not multiplied by mu, unlike a
    # textbook gradient step; kept as-is so previously trained embeddings are
    # reproducible - confirm this is intended.
    final_updates = 2 * mu * updates - 2 * lambda_value * U[user]
    U[user] += final_updates

  for item in V.keys():
    updates = np.zeros(K)
    for user, rating in item_ratings[item].items():
      error = rating - np.inner(U[user], V[item])
      updates += error * U[user]
    final_updates = 2 * mu * updates - 2 * lambda_value * V[item]
    V[item] += final_updates

  return U, V

In [None]:
def matrix_factorization(training_data, K=10, epochs=100):
  """Learn user and item factor vectors by repeated gradient updates.

  Note: `training_data` is only used to scale the random initialisation.
  The set of users/items comes from the notebook-level `user_item_ratings`
  and `item_user_ratings` dictionaries, which `gradient_descent_update`
  also reads, so those must describe the same ratings.

  Args:
    training_data: DataFrame with a `rating` column.
    K: number of latent factors; must be at least 1.
    epochs: number of calls to `gradient_descent_update`.

  Returns:
    (U, V): dicts mapping user id / item id to a length-K factor array.

  Raises:
    ValueError: if K is smaller than 1.
  """
  if K < 1:
    raise ValueError("K must be a positive integer, got %r" % (K,))

  # Upper bound for the uniform initialisation, derived from the mean rating and K.
  max_value = np.sqrt(training_data['rating'].mean() / K)

  U = {user_id: np.random.uniform(low=0.0, high=max_value, size=K) for user_id in user_item_ratings.keys()}
  V = {item_id: np.random.uniform(low=0.0, high=max_value, size=K) for item_id in item_user_ratings.keys()}

  for _ in range(epochs):
    U, V = gradient_descent_update(U, V, K)
  return U, V

In [None]:
# Train the factor model and report its RMSE on the training ratings.
k, epochs = 64, 200

W, H = matrix_factorization(train_lou_df, K=k, epochs=epochs)
print(get_rmse_error(train_lou_df, W, H))

0.12267543567231005


In [None]:
# Persist the learned user (W) and item (H) factor dictionaries to Drive.
for emb_path, emb in (
    ('/content/drive/MyDrive/Priyanka/matrix_factorization_user_emb.pkl', W),
    ('/content/drive/MyDrive/Priyanka/matrix_factorization_item_emb.pkl', H),
):
    with open(emb_path, 'wb') as f:
        pickle.dump(emb, f)

In [9]:
# Reload the saved user (W) and item (H) embeddings so the evaluation cells
# below can run without retraining.
# NOTE: pickle.load can execute arbitrary code - only load files you wrote yourself.
with open('/content/drive/MyDrive/Priyanka/matrix_factorization_user_emb.pkl', 'rb') as f:
    W = pickle.load(f)

with open('/content/drive/MyDrive/Priyanka/matrix_factorization_item_emb.pkl', 'rb') as f:
    H = pickle.load(f)

In [10]:
def get_pred_and_top_k(user_id, item_id, k):
  """Score one (user, item) pair and list the user's best unseen items.

  Reads the notebook-level W, H, item_user_ratings, item_index_to_item and
  user_item_ratings.

  Args:
    user_id: user whose factor vector W[user_id] is used for scoring.
    item_id: item whose predicted rating is returned.
    k: maximum number of recommended items to return.

  Returns:
    A tuple (predicted rating for item_id, np.array of up to k item ids in
    descending predicted-rating order, skipping items present in
    user_item_ratings[user_id]).
  """
  # One score per catalogue item, in item_user_ratings order, so positions
  # line up with item_index_to_item.
  scores = np.asarray([np.dot(W[user_id], H[candidate]) for candidate in item_user_ratings.keys()])
  target_score = np.dot(W[user_id], H[item_id])

  recommended = []
  # argsort of the negated scores walks item positions from best to worst
  for position in np.argsort(-scores):
    candidate = item_index_to_item[position]
    if len(recommended) == k:
      break
    if candidate not in user_item_ratings[user_id]:
      recommended.append(candidate)

  return (target_score, np.array(recommended))

In [17]:
def compute_RMSE_and_HIT_at_k(sample_datafrme, k=50, threshold=0.6):
  """Compute RMSE and the notebook's HIT@k score over a frame of ratings.

  A row counts as a hit when the recommendation agrees with the rating:
  the item is rated above `threshold` and appears in the user's top-k list,
  or it is rated at or below `threshold` and does not appear. Items absent
  from `item_user_ratings` get a predicted rating of 0.0 and are treated as
  not recommended.

  Args:
    sample_datafrme: DataFrame exposing `user`, `item` and `rating` columns.
    k: length of the recommendation list requested from get_pred_and_top_k.
    threshold: rating above which an item counts as liked.

  Returns:
    (rmse, hit_rate), where hit_rate is hits divided by the number of rows.

  Raises:
    ValueError: if `sample_datafrme` has no rows (both metrics are undefined).
  """
  total_sample_items = len(sample_datafrme)
  if total_sample_items == 0:
    raise ValueError("sample_datafrme is empty; RMSE and HIT@k are undefined")

  sum_of_squares = 0.0
  hit_at_k = 0

  for row in sample_datafrme.itertuples():
    user_id, item_id, actual_rating = row.user, row.item, row.rating
    if item_id in item_user_ratings:
      pred_rating, top_k_rec_items = get_pred_and_top_k(user_id, item_id, k)
      recommended = item_id in top_k_rec_items
    else:
      # Item never seen in training: no embedding, so predict 0 and never recommend.
      pred_rating = 0.0
      recommended = False

    # Both comparisons are kept explicit so a NaN rating matches neither branch.
    if ((actual_rating > threshold) and recommended) or ((actual_rating <= threshold) and not recommended):
      hit_at_k += 1
    sum_of_squares += (actual_rating - pred_rating) ** 2

  rmse_val = np.sqrt(sum_of_squares / total_sample_items)
  hit_at_k = hit_at_k / total_sample_items

  return rmse_val, hit_at_k

In [18]:
# Evaluate on the held-out split. get_pred_and_top_k scores every catalogue
# item for each test row, so this cell is likely slow on the full data.
test_RMSE, test_hit_at_k = compute_RMSE_and_HIT_at_k(test_lou_df)

In [19]:
# Report the held-out metrics, one labelled value per line.
for label, value in (("RMSE: ", test_RMSE), ("HIT @ 50: ", test_hit_at_k)):
  print(label, value)

RMSE:  0.1687581192542109
HIT @ 50:  0.10084509945377718
