In [1]:
# Import libraries
import gzip
import random
import scipy
import tensorflow as tf
from collections import defaultdict
from implicit import bpr
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split

2023-11-02 22:34:24.938460: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
# Ratings file: one tab-separated "user item rating" triple per line
reader = Reader(sep='\t', line_format='user item rating')
data = Dataset.load_from_file(file_path="goodreads_fantasy.tsv", reader=reader)

In [12]:
# Standard latent-factor model (Surprise's SVD with its default hyperparameters).
# A fixed random_state removes the model's own run-to-run variation, so the
# fitted factors are repeatable for a given training set.
model = SVD(random_state=0)

In [13]:
# Built-in helper to split the ratings: 25% are held out for testing.
# random_state is pinned so the same train/test partition is produced on
# every run instead of a fresh random one.
trainset, testset = train_test_split(data, test_size=.25, random_state=0)

In [14]:
# Train on the training split, then score every held-out (user, item) pair.
# fit() returns the fitted algorithm, so the two steps can be chained.
predictions = model.fit(trainset).test(testset)

In [16]:
# Estimated rating (`est`) of the first prediction made on the test set
predictions[0].est

4.425306431009143

In [17]:
# Mean squared error on the test set: the average squared gap between the
# true rating (r_ui) and the model's estimate (est)
sse = sum((pred.r_ui - pred.est) ** 2 for pred in predictions)

print(sse / len(predictions))

1.1816900803597814


In [18]:
# Bayesian Personalized Ranking (Implicit)
def parseData(fname, drop=('review_text',)):
    """Lazily yield one review dict per non-blank line of a gzipped file.

    Parameters
    ----------
    fname : path of the gzip-compressed file; every non-blank line must hold
        one dict literal.
    drop : keys removed from each record before it is yielded. Defaults to
        the review text, which is discarded to save memory when it is not
        used. Keys that a record does not contain are ignored.

    Yields
    ------
    dict : the parsed record without the dropped keys.
    """
    # The context manager closes the file once the generator is exhausted
    # or closed, instead of leaking the handle.
    with gzip.open(fname) as f:
        for l in f:
            if not l.strip():
                continue  # tolerate blank / trailing empty lines
            # SECURITY NOTE(review): eval() executes arbitrary code found in
            # the file. Only run this on trusted data; consider json.loads or
            # ast.literal_eval if the file format allows it.
            d = eval(l)
            for key in drop:
                d.pop(key, None)
            yield d

In [20]:
# Load the complete set of Goodreads fantasy reviews into memory (fairly
# memory-hungry; a smaller file could be substituted)
data = [review for review in parseData("goodreads_reviews_fantasy_paranormal.json.gz")]

In [21]:
# Shuffle in place with a dedicated, seeded generator so the record order
# (and the sequential IDs assigned from it) is the same on every fresh run,
# without touching the global `random` state.
random.Random(0).shuffle(data)

In [22]:
# Example record from the (shuffled) dataset: a dict of review metadata
data[0]

{'user_id': '38c2c8bd48c2861806389ab126767f02',
 'book_id': '22852890',
 'review_id': '40ca3690f7398a2b9209cccbc7ad8b50',
 'rating': 4,
 'date_added': 'Tue Sep 02 12:30:34 -0700 2014',
 'date_updated': 'Mon Oct 12 10:50:16 -0700 2015',
 'read_at': 'Mon Oct 05 00:00:00 -0700 2015',
 'started_at': '',
 'n_votes': 0,
 'n_comments': 0}

In [23]:
# Map raw user / book identifiers to dense, sequential integer indices
userIDs, itemIDs = {}, {}

for review in data:
    # setdefault only inserts on first sight, so each new key receives the
    # current dict size as its index
    userIDs.setdefault(review['user_id'], len(userIDs))
    itemIDs.setdefault(review['book_id'], len(itemIDs))

nUsers, nItems = len(userIDs), len(itemIDs)

In [24]:
# Number of distinct users and distinct books found in the reviews
nUsers, nItems

(256088, 258212)

In [31]:
# Sparse (and very large) interaction matrix recording which items each user
# has interacted with. Only positive feedback (i.e., a rated item) is stored.
# Xui is filled with items as rows and users as columns ...
Xui = scipy.sparse.lil_matrix((nItems, nUsers))
for review in data:
    item_row = itemIDs[review['book_id']]
    user_col = userIDs[review['user_id']]
    Xui[item_row, user_col] = 1

# ... and then transposed into a users-by-items CSR matrix.
Xui_csr = scipy.sparse.csr_matrix(Xui.transpose())

In [32]:
# Bayesian Personalized Ranking model with 5 latent factors.
# Objective term: sigma(mu_u mu_i - mu_u mu_j), i.e. a seen item i should
# score higher than an unseen item j for user u.
# random_state is pinned for reproducibility.
# NOTE(review): multi-threaded training may still introduce small run-to-run
# differences — confirm against the implicit docs if exact repeatability matters.
model = bpr.BayesianPersonalizedRanking(factors = 5, random_state = 0)

In [40]:
# Train BPR on the users-by-items interaction matrix (progress bar shown)
model.fit(Xui_csr, show_progress=True)

100%|██████████| 100/100 [00:26<00:00,  3.73it/s, train_auc=90.35%, skipped=1.82%]


In [41]:
# Recommendations for user 0, given that user's row of the interaction matrix
recommended = model.recommend(userid=0, user_items=Xui_csr[0])
# Items closest to item 0 (note: an item index, not a user index)
related = model.similar_items(itemid=0)

In [42]:
# Items most similar to item 0: a pair of arrays (item indices, similarity scores)
related

(array([     0,  11409,  47904, 208448, 159572,  51893,  95044,  31137,
        193223, 101784], dtype=int32),
 array([0.99999994, 0.9924968 , 0.9871873 , 0.98678166, 0.9867017 ,
        0.9847294 , 0.9845206 , 0.9829978 , 0.981845  , 0.9815782 ],
       dtype=float32))

In [43]:
# Pull the learned item and user factor matrices out of the fitted model
itemFactors, userFactors = model.item_factors, model.user_factors

In [44]:
# Learned factor vector for item 0.
# NOTE(review): the vector shown has 6 entries although factors=5 was
# requested — presumably the library appends a bias term; confirm.
itemFactors[0]

array([-0.11903579,  0.53265977, -1.5387731 ,  0.31080228, -0.38170338,
       -0.1698095 ], dtype=float32)

In [None]:
# Latent factor model (Tensorflow)
