In [1]:
# Mount Google Drive at /content/drive (Colab only); the CSV paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from scipy.spatial import distance
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# **LEAVE-ONE-OUT**

In [3]:
# Read the leave-one-out train and test CSV files from the mounted Drive.
train_lou_path = '/content/drive/MyDrive/Priyanka/train_book_reviews_leave_one_out.csv'
test_lou_path = '/content/drive/MyDrive/Priyanka/test_book_reviews_leave_one_out.csv'
train_lou_df, test_lou_df = (
    pd.read_csv(csv_path) for csv_path in (train_lou_path, test_lou_path)
)

In [4]:
# Show both splits as loaded. Ratings are still on their original scale here;
# the division by 5.0 only happens in the next code cell, so the label says "before".
print("TRAIN before normalizing ratings")
print(train_lou_df)
print("\n\n")
print("TEST before normalizing ratings")
print(test_lou_df)

TRAIN after normalizing ratings
               item            user  rating   timestamp
0        1101986964  A12NFV3VBMDV4A     5.0  1472169600
1        1537245856  A13KSU7ZWRL04J     5.0  1472169600
2        194642000X  A32SJS0TTSRIM5     5.0  1472169600
3        194642000X  A3KLPA5D8SX7VR     5.0  1472169600
4        194642000X  A39L46WM6GBO4Q     4.0  1472169600
...             ...             ...     ...         ...
1056899  0440000785  A2P3LKAINU8JUE     4.0  1536796800
1056900  1720013306  A28GRFK2F3TMWE     4.0  1536883200
1056901  0525483705  A2QHX2UQPXXEN6     3.0  1536969600
1056902  1732624704  A1MXW3BGAZW44D     5.0  1537056000
1056903  1732624704  A1CXTYWIRDPOED     5.0  1537315200

[1056904 rows x 4 columns]



TEST after normalizing ratings
             item            user  rating   timestamp
0      1494567296  A2NMKI0ZPIE3VU     5.0  1474848000
1      1941480020  A32382F71YPPQ7     5.0  1475020800
2      B001MQA3DU  A161OJ95SYSN7A     5.0  1475020800
3      1523744812 

In [5]:
# Rescale the rating column of both splits by dividing by 5.0.
# NOTE: the column is overwritten in place, so running this cell twice divides twice.
train_lou_df['rating'] = train_lou_df['rating'].div(5.0)
test_lou_df['rating'] = test_lou_df['rating'].div(5.0)

In [6]:
# Show both splits again now that the rating column has been rescaled.
# Each print joins its pieces with newlines; the "\n\n" piece keeps the blank gap between the two tables.
print("TRAIN after normalizing ratings", train_lou_df, "\n\n", sep="\n")
print("TEST after normalizing ratings", test_lou_df, sep="\n")

TRAIN after normalizing ratings
               item            user  rating   timestamp
0        1101986964  A12NFV3VBMDV4A     1.0  1472169600
1        1537245856  A13KSU7ZWRL04J     1.0  1472169600
2        194642000X  A32SJS0TTSRIM5     1.0  1472169600
3        194642000X  A3KLPA5D8SX7VR     1.0  1472169600
4        194642000X  A39L46WM6GBO4Q     0.8  1472169600
...             ...             ...     ...         ...
1056899  0440000785  A2P3LKAINU8JUE     0.8  1536796800
1056900  1720013306  A28GRFK2F3TMWE     0.8  1536883200
1056901  0525483705  A2QHX2UQPXXEN6     0.6  1536969600
1056902  1732624704  A1MXW3BGAZW44D     1.0  1537056000
1056903  1732624704  A1CXTYWIRDPOED     1.0  1537315200

[1056904 rows x 4 columns]



TEST after normalizing ratings
             item            user  rating   timestamp
0      1494567296  A2NMKI0ZPIE3VU     1.0  1474848000
1      1941480020  A32382F71YPPQ7     1.0  1475020800
2      B001MQA3DU  A161OJ95SYSN7A     1.0  1475020800
3      1523744812 

In [7]:
# Distinct user IDs and item IDs that occur in the training split
users = train_lou_df['user'].unique()
items = train_lou_df['item'].unique()

# Map every ID to its position inside `users` / `items`
user_to_index = dict(zip(users, range(len(users))))
item_to_index = dict(zip(items, range(len(items))))

# Nested dictionary: user ID -> {item ID -> rating}.
# Every training user gets an entry; a repeated (user, item) pair keeps the rating seen last.
user_item_ratings = {user_id: {} for user_id in users}

for user_id, item_id, rating in zip(train_lou_df['user'],
                                    train_lou_df['item'],
                                    train_lou_df['rating']):
    user_item_ratings[user_id][item_id] = rating

In [8]:
# Item-by-user rating matrix in sparse (CSR) form.
# Row i is the item `items[i]` (same numbering as `item_to_index`), column j is the
# user `users[j]` (same numbering as `user_to_index`); unrated cells are implicit zeros.
#
# The matrix is assembled straight from the ratings that actually exist instead of
# first materialising one dense row of len(users) values per item, which costs
# len(items) * len(users) dictionary lookups and a dense intermediate of the same size.
# The triples come from `user_item_ratings`, which holds at most one rating per
# (user, item) pair, so no two triples address the same cell.
rating_rows, rating_cols, rating_values = [], [], []
for user_id, rated_items in user_item_ratings.items():
  user_col = user_to_index[user_id]
  for item_id, rating in rated_items.items():
    rating_rows.append(item_to_index[item_id])
    rating_cols.append(user_col)
    rating_values.append(rating)

item_embeddings = csr_matrix(
    (rating_values, (rating_rows, rating_cols)),
    shape=(len(items), len(users)),
    dtype=np.float64,
)
# A rating that is exactly 0 must not be stored: a zero cell means "unrated" downstream.
item_embeddings.eliminate_zeros()
item_embeddings

<18028x19406 sparse matrix of type '<class 'numpy.float64'>'
	with 1051590 stored elements in Compressed Sparse Row format>

In [9]:
# Pairwise cosine similarity between item rows, kept sparse from start to finish.
# dense_output=False makes cosine_similarity return a sparse result for sparse input,
# so no dense item-by-item array (and no second dense copy for the absolute value) is built.
item_similarity_scores = cosine_similarity(item_embeddings, dense_output=False)
# Absolute value as before, then normalise to CSR and drop any explicitly stored zeros
# so only non-zero similarities are stored.
item_similarity_scores = csr_matrix(abs(item_similarity_scores))
item_similarity_scores.eliminate_zeros()
item_similarity_scores

<18028x18028 sparse matrix of type '<class 'numpy.float64'>'
	with 52395906 stored elements in Compressed Sparse Row format>

# PREDICTION

In [10]:
def get_top_k(user_index, item_index, k):
  """Predict one user's rating for an item and pick up to k unrated items to recommend.

  Reads the module-level `item_embeddings` (item x user rating matrix, sparse) and
  `item_similarity_scores` (item x item similarity matrix).

  The predicted rating of every item is the similarity-weighted average of the
  ratings the user has given:
      sum(sim[i, j] * rating[j]) / sum(sim[i, j])   over items j rated by the user.
  Items with no similarity to anything the user rated (zero denominator) get 0.

  Args:
    user_index: column of `item_embeddings` that belongs to the user.
    item_index: row index of the item whose predicted rating is returned.
    k: maximum number of recommended items (non-negative).

  Returns:
    A tuple (pred_rating_item, top_k_rec_items): the predicted rating for
    `item_index`, and an array of at most k item indices the user has not rated,
    ordered by decreasing predicted rating. Ties are broken by the lower item
    index (stable sort), so the result is deterministic.
  """
  # Dense 1-D copy of the user's column; item_embeddings itself is never modified.
  user_ratings = np.asarray(item_embeddings[:, user_index].toarray(), dtype=float).ravel()
  rated_mask = user_ratings != 0

  numerator = np.asarray(item_similarity_scores.dot(user_ratings)).ravel()
  denominator = np.asarray(item_similarity_scores.dot(rated_mask.astype(float))).ravel()

  # Divide only where the denominator is non-zero; everything else stays 0,
  # which avoids producing NaN and patching it up afterwards.
  pred_ratings_for_user = np.zeros_like(numerator, dtype=float)
  np.divide(numerator, denominator, out=pred_ratings_for_user, where=denominator != 0)
  pred_rating_item = pred_ratings_for_user[item_index]

  # Rank all items by predicted rating (best first), drop the ones already rated,
  # and keep the first k of what is left.
  ranked_items = np.argsort(-pred_ratings_for_user, kind="stable")
  top_k_rec_items = ranked_items[~rated_mask[ranked_items]][:k]

  return (pred_rating_item, top_k_rec_items)

In [11]:
# Evaluate on the held-out rows: RMSE of the predicted rating and a hit rate at TOP_K.
TOP_K = 10              # length of the recommendation list requested from get_top_k
RATING_THRESHOLD = 0.6  # ratings above this value are treated as "liked"

total_test_items = len(test_lou_df)

test_RMSE = 0.0
sum_of_squares = 0.0

hit_at_k = 0

for row in test_lou_df.itertuples():
  user_id, item_id, actual_rating = row.user, row.item, row.rating
  liked = actual_rating > RATING_THRESHOLD
  disliked = actual_rating <= RATING_THRESHOLD

  # Dictionary lookup instead of `item_id in items`, which scans the whole
  # NumPy array of item IDs for every test row.
  if item_id in item_to_index:
    target_index = item_to_index[item_id]
    # Assumes every test user also occurs in the training split (KeyError otherwise).
    pred_rating, top_k_rec_items = get_top_k(user_to_index[user_id], target_index, TOP_K)
    recommended = target_index in top_k_rec_items
  else:
    # Item never seen in training: predict 0 and treat it as not recommended.
    pred_rating = 0.0
    recommended = False

  # A row counts as a hit when the held-out item was liked and recommended,
  # or was not liked and not recommended.
  if (liked and recommended) or (disliked and not recommended):
    hit_at_k += 1
  sum_of_squares += (actual_rating - pred_rating) ** 2

test_RMSE = np.sqrt(sum_of_squares / total_test_items)
hit_at_k = hit_at_k / total_test_items

In [12]:
# Report the two evaluation metrics computed in the previous cell, one per line.
for metric_label, metric_value in (("RMSE: ", test_RMSE), ("HIT @ 10: ", hit_at_k)):
  print(metric_label, metric_value)

RMSE:  0.1611099491786153
HIT @ 10:  0.09723796763887457


In [13]:
# Number of test rows whose (rescaled) rating is at most 0.6
int((test_lou_df['rating'] <= 0.6).sum())

1883