# Reference
https://github.com/ethen8181/machine-learning/blob/master/recsys/4_bpr.ipynb

https://towardsdatascience.com/recommender-system-using-bayesian-personalized-ranking-d30e98bba0b9

In [0]:
import sys
import numpy as np
import pandas as pd
from math import ceil
from tqdm import trange
from subprocess import call
from itertools import islice
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix, dok_matrix
import json

In [0]:
with open("Musical_Instruments.json") as f:
  review_json_data = [json.loads(line) for line in f]

In [0]:
AppliancesDf = pd.DataFrame(review_json_data)

In [0]:
subset_ratings=AppliancesDf[['asin','reviewTime','reviewerID','verified','overall','reviewText']]

In [7]:
subset_ratings=subset_ratings[subset_ratings.verified==True]
subset_ratings['reviewDate']=pd.to_datetime(subset_ratings['reviewTime'])
subset_ratings.head()

Unnamed: 0,asin,reviewTime,reviewerID,verified,overall,reviewText,reviewDate
1,470536454,"04 6, 2017",A29OWR79AM796H,True,4.0,Very helpful...,2017-04-06
2,470536454,"03 14, 2017",AUPWU27A7X5F6,True,5.0,EASY TO UNDERSTAND AND A PROMPT SERVICE TOO,2017-03-14
3,470536454,"02 14, 2017",A1N69A47D4JO6K,True,4.0,My girlfriend use quite often,2017-02-14
4,470536454,"01 29, 2017",AHTIQUMVCGBFJ,True,5.0,Arrived as described. Very happy.,2017-01-29
5,470536454,"01 4, 2017",A1J8LQ7HVLR9GU,True,5.0,Love the Dummies Series. Never fails.,2017-01-04


In [0]:
subset_ratings=subset_ratings[subset_ratings.reviewDate>='2018-01-01']
subset_ratings=subset_ratings.drop_duplicates(subset=['asin','reviewerID','reviewDate'],keep='last')
subset_ratings=subset_ratings[['reviewerID','asin','overall']]
subset_ratings.shape

In [11]:
no_of_products_reviewed=subset_ratings.asin.value_counts()
no_of_products_reviewed.describe([.95,.97,.99])

count    26104.000000
mean         4.778501
std         16.379560
min          1.000000
50%          1.000000
95%         16.000000
97%         24.000000
99%         60.000000
max        482.000000
Name: asin, dtype: float64

In [12]:
review_per_user_counts=subset_ratings.reviewerID.value_counts()
review_per_user_counts.describe([.95,.97,.99]) # unique user count is 300070

count    94918.000000
mean         1.314166
std          1.038811
min          1.000000
50%          1.000000
95%          3.000000
97%          4.000000
99%          6.000000
max         57.000000
Name: reviewerID, dtype: float64

In [0]:
filtered_users=review_per_user_counts[review_per_user_counts>=3].reset_index()
filtered_users=filtered_users['index'].tolist()

In [0]:
subset_backup=subset_ratings.copy()
subset_ratings=subset_ratings[subset_ratings.reviewerID.isin(filtered_users)]
subset_ratings

In [18]:
val=subset_ratings.reviewerID.value_counts()
val=val.reset_index()
owners_10ormore=val[val.reviewerID>=10]
owners_10ormore=owners_10ormore.drop_duplicates(subset=['index'])
owners_10ormore=list(owners_10ormore['index'])

['AO21QAFMIK124',
 'A3EATLL9FW7DJL',
 'AQDUE9PHA7QRP',
 'A215GPS2IFRI7G',
 'A7BTFA87UO4TZ',
 'ASCOQL6HNN0OE',
 'A39YJK1XG5KFDA',
 'A19SHNFST11PNO',
 'A1MHPGANXXGLC8',
 'A2PVAOS0G22ERX',
 'A2D6IKTQ6ET3MC',
 'A34KC9EY3ZQ5T8',
 'A3B1FQR6HV6H42',
 'A13WLZVMT0TKA4',
 'ADP33IFGFSL6',
 'A15LAJVR0RX15T',
 'A265SU6RW4ZAPY',
 'A59G70843Q9AI',
 'A2EBN8LPMN45UA',
 'A3CQ3L2B6MIFJ4',
 'AWY308ST6QBNK',
 'A1VD0NBDD7YD83',
 'A3E57Q7V46X6X2',
 'A3NEN8FB3DTTUY',
 'A2ZMKE5T8OQRTX',
 'A2VPN54YKRUGYW',
 'A1T0W8JXB4USYB',
 'A2EOHFX3N0D3PN',
 'A38QAJSI21WSBZ',
 'A15Z0OF1PUK405',
 'AYYCQUNL175D1',
 'A2IFSHR0Q1N2AQ',
 'A2LE9QQY1WZ48O',
 'A9SINTLKCQK90',
 'A1CU7PD07KWYMG',
 'A2V4J1HSCDDW0U',
 'A2ZIC3YV7NMK3D',
 'A29HZN0M869FUF',
 'A3VHXTBI5AJLUC',
 'AQC9HSJXMQ9YA',
 'A293CWG23K191L',
 'A2JPECIQW26LCX',
 'A32BY4BXZSITRP',
 'A2NF3AZ6GSIBEY',
 'A3S28GU71U1QQU',
 'A34YRFDBVMO68Y',
 'A1L6P9CTC27ZNS',
 'A2S21P9G28MK2M',
 'A1DE9TVBOVWXP1',
 'A2FD0FNAKPYWLQ',
 'A3UJRQF8Z39J3S',
 'AZQIL47UT2WV5',
 'A315XZ2MJJ4SDH',
 'AQR

In [19]:
subset_ratings=subset_ratings[subset_ratings.reviewerID.isin(owners_10ormore)]
subset_ratings.shape

(2774, 3)

In [0]:
def create_matrix(rating_data, users_column, items_column, ratings_column, threshold=None):
    """
    creates the sparse user-item interaction matrix.
    """
    if threshold is not None:
        rating_data = rating_data[rating_data[ratings_column] >= threshold]
        rating_data[ratings_column] = 1
    
    for column in (items_column, users_column, ratings_column):
        rating_data[column] = rating_data[column].astype('category')

    ratings = csr_matrix((rating_data[ratings_column],
                          (rating_data[users_column].cat.codes, rating_data[items_column].cat.codes)))
    ratings.eliminate_zeros()
    return ratings, rating_data


In [21]:
items_column = 'asin'
users_column = 'reviewerID'
ratings_column = 'overall'
threshold = 3
X, df = create_matrix(subset_ratings, users_column, items_column, ratings_column, threshold)
X


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


<200x1964 sparse matrix of type '<class 'numpy.longlong'>'
	with 2629 stored elements in Compressed Sparse Row format>

In [0]:
def create_train_test(ratings_data, test_size = 0.2, seed = 1234):
    assert test_size < 1 and test_size > 0

    train_data = ratings_data.copy().todok()
    test_data = dok_matrix(train_data.shape)

    random_state = np.random.RandomState(seed)
    for u in range(ratings_data.shape[0]):
        split_index = ratings_data[u].indices
        n_splits = ceil(test_size * split_index.shape[0])
        test_index = random_state.choice(split_index, size = n_splits, replace = False)
        test_data[u, test_index] = ratings_data[u, test_index]
        train_data[u, test_index] = 0
    
    train_data, test_data = train_data.tocsr(), test.tocsr()
    return train_data, test_data

In [23]:
X_train, X_test = create_train_test(X, test_size = 0.2, seed = 1234)
print(X_train)

  (0, 306)	1
  (0, 357)	1
  (0, 484)	1
  (0, 622)	1
  (0, 930)	1
  (0, 1069)	1
  (0, 1301)	1
  (0, 1320)	1
  (1, 152)	1
  (1, 154)	1
  (1, 243)	1
  (1, 244)	1
  (1, 370)	1
  (1, 428)	1
  (1, 590)	1
  (1, 1118)	1
  (1, 1227)	1
  (1, 1322)	1
  (1, 1366)	1
  (2, 59)	1
  (2, 321)	1
  (2, 730)	1
  (2, 818)	1
  (2, 1263)	1
  (2, 1360)	1
  :	:
  (197, 80)	1
  (197, 206)	1
  (197, 395)	1
  (197, 582)	1
  (197, 1138)	1
  (197, 1434)	1
  (198, 72)	1
  (198, 215)	1
  (198, 466)	1
  (198, 610)	1
  (198, 724)	1
  (198, 919)	1
  (198, 1192)	1
  (198, 1704)	1
  (199, 90)	1
  (199, 93)	1
  (199, 94)	1
  (199, 205)	1
  (199, 279)	1
  (199, 324)	1
  (199, 940)	1
  (199, 1048)	1
  (199, 1122)	1
  (199, 1139)	1
  (199, 1735)	1


In [0]:
class BPR:

    def __init__(self, learning_rate = 0.01, n_factors = 15, no_of_iterations = 10, 
                 batch_size = 1000, reg = 0.01, seed = 1234):
        
        self.reg = reg
        self.seed = seed
        self.no_of_iterations = no_of_iterations
        self.n_factors = n_factors
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self._prediction = None
        
    def fit(self, ratings):

        indptr = ratings.indptr
        indices = ratings.indices
        no_of_users, no_of_items = ratings.shape
        
        batch_size = self.batch_size
        if no_of_users < batch_size:
            batch_size = no_of_users
            sys.stderr.write('WARNING: Batch size is greater than number of users, switching to a batch size of {}\n'.format(no_of_users))

        no_of_iterations = no_of_users // batch_size
        
        random_state = np.random.RandomState(self.seed)
        self.user_factors = random_state.normal(size = (no_of_users, self.n_factors))
        self.item_factors = random_state.normal(size = (no_of_items, self.n_factors))
        
        loop = range(self.no_of_iterations)
        
        for _ in loop:
            for _ in range(no_of_iterations):
                sampled_users, sampled_positive_items, sampled_negative_items = self._sample(no_of_users, no_of_items, indices, indptr)
                self._update(sampled_users, sampled_positive_items, sampled_negative_items)

        return self
    
    def _sample(self, no_of_users, no_of_items, indices, indptr):
        
        sampled_positive_items = np.zeros(self.batch_size, dtype = np.int)
        sampled_negative_items = np.zeros(self.batch_size, dtype = np.int)
        sampled_users = np.random.choice(no_of_users, size = self.batch_size, replace = False)

        for idx, user in enumerate(sampled_users):
            pos_items = indices[indptr[user]:indptr[user + 1]]
            if not pos_items.any():
              continue

            pos_item = np.random.choice(pos_items)
            neg_item = np.random.choice(no_of_items)
            while neg_item in pos_items:
                neg_item = np.random.choice(no_of_items)

            sampled_positive_items[idx] = pos_item
            sampled_negative_items[idx] = neg_item

        return sampled_users, sampled_positive_items, sampled_negative_items
                
    def _update(self, u, i, j):

        user_u = self.user_factors[u]
        item_i = self.item_factors[i]
        item_j = self.item_factors[j]
        
        r_uij = np.sum(user_u * (item_i - item_j), axis = 1)
        sigmoid = np.exp(-r_uij) / (1.0 + np.exp(-r_uij))
        
        sigmoid_tiled = np.tile(sigmoid, (self.n_factors, 1)).T
        grad_u = sigmoid_tiled * (item_j - item_i) + self.reg * user_u
        grad_i = sigmoid_tiled * -user_u + self.reg * item_i
        grad_j = sigmoid_tiled * user_u + self.reg * item_j
        self.user_factors[u] -= self.learning_rate * grad_u
        self.item_factors[i] -= self.learning_rate * grad_i
        self.item_factors[j] -= self.learning_rate * grad_j
        return self

    def predict(self):

        if self._prediction is None:
            self._prediction = self.user_factors.dot(self.item_factors.T)

        return self._prediction

    def _predict_user(self, user):
        
        user_pred = self.user_factors[user].dot(self.item_factors.T)
        return user_pred

    def recommend(self, ratings, N = 5):

        no_of_users = ratings.shape[0]
        recommendation = np.zeros((no_of_users, N), dtype = np.uint32)
        for user in range(no_of_users):
            top_n = self._recommend_user(ratings, user, N)
            recommendation[user] = top_n

        return recommendation

    def _recommend_user(self, ratings, user, N):

        scores = self._predict_user(user)
        liked = set(ratings[user].indices)
        count = N + len(liked)
        if count < scores.shape[0]:
            ids = np.argpartition(scores, -count)[-count:]
            best_ids = np.argsort(scores[ids])[::-1]
            best = ids[best_ids]
        else:
            best = np.argsort(scores)[::-1]

        top_n = list(islice((rec for rec in best if rec not in liked), N))
        return top_n
    
    def get_similar_items(self, N = 5, item_ids = None):

        normed_factors = normalize(self.item_factors)
        knn = NearestNeighbors(n_neighbors = N + 1, metric = 'euclidean')
        knn.fit(normed_factors)

        if item_ids is not None:
            normed_factors = normed_factors[item_ids]

        _, items = knn.kneighbors(normed_factors)
        similar_items = items[:, 1:].astype(np.uint32)
        return similar_items

In [25]:
bpr_params = {'reg': 0.01,
              'learning_rate': 0.1,
              'no_of_iterations': 50,
              'n_factors': 15,
              'batch_size': 20}

bpr = BPR(**bpr_params)
bpr.fit(X_train)

BPR:  36%|███▌      | 18/50 [00:00<00:00, 179.65it/s]

Ratings    (0, 306)	1
  (0, 357)	1
  (0, 484)	1
  (0, 622)	1
  (0, 930)	1
  (0, 1069)	1
  (0, 1301)	1
  (0, 1320)	1
  (1, 152)	1
  (1, 154)	1
  (1, 243)	1
  (1, 244)	1
  (1, 370)	1
  (1, 428)	1
  (1, 590)	1
  (1, 1118)	1
  (1, 1227)	1
  (1, 1322)	1
  (1, 1366)	1
  (2, 59)	1
  (2, 321)	1
  (2, 730)	1
  (2, 818)	1
  (2, 1263)	1
  (2, 1360)	1
  :	:
  (197, 80)	1
  (197, 206)	1
  (197, 395)	1
  (197, 582)	1
  (197, 1138)	1
  (197, 1434)	1
  (198, 72)	1
  (198, 215)	1
  (198, 466)	1
  (198, 610)	1
  (198, 724)	1
  (198, 919)	1
  (198, 1192)	1
  (198, 1704)	1
  (199, 90)	1
  (199, 93)	1
  (199, 94)	1
  (199, 205)	1
  (199, 279)	1
  (199, 324)	1
  (199, 940)	1
  (199, 1048)	1
  (199, 1122)	1
  (199, 1139)	1
  (199, 1735)	1
---------------------
Loop  BPR:   0%|          | 0/50 [00:00<?, ?it/s]
200 1964 [ 306  357  484 ... 1122 1139 1735] [   0    8   19   27   38   55   61   72   88   96  110  118  126  134
  142  162  170  180  188  201  209  218  230  241  249  257  265  275
  285  293  301

BPR: 100%|██████████| 50/50 [00:00<00:00, 167.39it/s]


<__main__.BPR at 0x7f59d5071b00>

In [0]:
def auc_score(model, ratings):

    auc = 0.0
    no_of_users, no_of_items = ratings.shape
    for user, row in enumerate(ratings):
        y_pred = model._predict_user(user)
        y_true = np.zeros(no_of_items)
        y_true[row.indices] = 1
        auc += roc_auc_score(y_true, y_pred)

    auc /= no_of_users
    return auc

In [27]:
print(auc_score(bpr, X_train))
print(auc_score(bpr, X_test))

0.8148883142438771
0.5213257226387825


In [0]:
ans = bpr.get_similar_items(N = 10)

In [0]:
asin_codes = df["asin"].cat.codes
user_codes = df["reviewerID"].cat.codes

In [0]:

item_rmc = {}
for index, value in enumerate(ans):
  key_item = asin_codes[asin_codes == index].index.tolist()
  if not key_item:
    continue
  key_item = key_item[0]
  key_item = df.loc[[key_item]]

  key = key_item["asin"].values[0]
  rcm = []
  for item in value:
      item_value = asin_codes[asin_codes == item].index.tolist()
      if not item_value:
        continue

      item_value = item_value[0]
      item_value = df.loc[[item_value]]
      rcm.append(item_value.asin.values[0])
  item_rmc[key] = rcm

In [0]:
ans_user = bpr.recommend(X_train, N=10)

In [0]:

user_rmc = {}
for index, value in enumerate(ans_user):
  key_item = asin_codes[user_codes == index].index.tolist()
  if not key_item:
    continue
  key_item = key_item[0]
  key_item = df.loc[[key_item]]

  key = key_item["reviewerID"].values[0]
  rcm = []
  for item in value:
      item_value = asin_codes[asin_codes == item].index.tolist()
      if not item_value:
        continue

      item_value = item_value[0]
      item_value = df.loc[[item_value]]
      rcm.append(item_value.asin.values[0])
  user_rmc[key] = rcm