In [2]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen
from sklearn.model_selection import train_test_split
import tensorflow.compat.v1 as tf
import numpy as np
import math
import heapq
from tqdm import tqdm
import random

In [3]:
!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Digital_Music_5.json.gz

--2022-05-06 20:57:42--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Digital_Music_5.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19408584 (19M) [application/octet-stream]
Saving to: ‘Digital_Music_5.json.gz.3’


2022-05-06 20:57:43 (27.8 MB/s) - ‘Digital_Music_5.json.gz.3’ saved [19408584/19408584]



I used Amazon music review dataset. This script is also build on the given example for dataset 

In [4]:
### load the meta data

data = []
with gzip.open('Digital_Music_5.json.gz') as f:
    for l in f:
        data.append(json.loads(l.strip()))
    
# total length of list, this number equals total number of products
print(len(data))

# first row of the list
print(data[0])

169781
{'overall': 5.0, 'vote': '3', 'verified': True, 'reviewTime': '06 3, 2013', 'reviewerID': 'A2TYZ821XXK2YZ', 'asin': '3426958910', 'style': {'Format:': ' Audio CD'}, 'reviewerName': 'Garrett', 'reviewText': 'This is awesome to listen to, A must-have for all Slayer fans..sadly needed to be a triple disc set..They have so many hits!!', 'summary': 'Slayer Rules!', 'unixReviewTime': 1370217600}


In [5]:
# convert list into pandas dataframe

df = pd.DataFrame.from_dict(data)

print(len(df))

169781


In [6]:
df.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
0,5.0,3.0,True,"06 3, 2013",A2TYZ821XXK2YZ,3426958910,{'Format:': ' Audio CD'},Garrett,"This is awesome to listen to, A must-have for ...",Slayer Rules!,1370217600,
1,5.0,,True,"10 11, 2014",A3OFSREZADFUDY,3426958910,{'Format:': ' Audio CD'},Ad,bien,Five Stars,1412985600,
2,5.0,,True,"02 11, 2014",A2VAMODP8M77NG,3426958910,{'Format:': ' Audio CD'},JTGabq,It was great to hear the old stuff again and I...,SLAYER!!!!!!!!!!!!!!!!!!!!!,1392076800,
3,4.0,3.0,False,"12 7, 2013",AAKSLZ9IDTEH0,3426958910,{'Format:': ' Audio CD'},john F&#039;n doe,well best of's are a bit poison normally but t...,slayer greatest hits! you mean everything righ...,1386374400,
4,5.0,,True,"06 12, 2016",A3OH43OZJLKI09,5557706259,{'Format:': ' Audio CD'},melinda a goodman,What can I say? This is Casting Crowns!!!This ...,"This is a good, blessing filled",1465689600,


In [7]:
# convert ids to numbers
df['reviewer_id_number'] = df['reviewerID'].astype("category").cat.codes
df['item_id_number'] = df['asin'].astype("category").cat.codes

In [8]:
# create lookups
item_lookup = df[['item_id_number', 'asin']].drop_duplicates()
item_lookup['item_id_number'] = item_lookup.item_id_number.astype(str)

reviewer_lookup = df[['reviewer_id_number', 'reviewerID']].drop_duplicates()
reviewer_lookup['reviewer_id_number'] = reviewer_lookup.reviewer_id_number.astype(str)

In [9]:
# drop unnecessary columns
df =df [["reviewer_id_number", "item_id_number", "overall"]]

In [10]:
# Create training and test sets.
df_train, df_test = train_test_split(df)

In [11]:
# Create lists of all unique users and artists
reviewers = list(np.sort(df.reviewer_id_number.unique()))
items = list(np.sort(df.item_id_number.unique()))


In [38]:
# Get the rows, columns and values for our matrix.
rows = df.reviewer_id_number.astype(int)
cols = df.item_id_number.astype(int)

values = list(df_train.overall)

# Get all user ids and item ids.
rids = np.array(rows.tolist())
iids = np.array(cols.tolist())
zipped = set(zip(rids, iids))

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import activations


In [15]:
# Create the model
reviewer_input = keras.Input(shape =(None,1))
item_input = keras.Input(shape =(None,1))

reviewer_embedded = tf.keras.layers.Embedding(200000, 1000, input_length=1)(reviewer_input)
item_embedded = tf.keras.layers.Embedding(200000, 1000, input_length=1)(item_input)
x = keras.layers.Concatenate()([reviewer_embedded, item_embedded])
x = layers.Dense(300, activation=activations.relu)(x)
x = layers.Dense(100, activation=activations.relu)(x)
x = layers.Dense(50, activation=activations.relu)(x)
y = layers.Dense(1)(x)

model = keras.Model(inputs=[reviewer_input, item_input] , outputs=y)

There are some hyperparameters for the model. I choose them using my own previous knowledge. I decrease the dense layers step by step to one. 

These parameters are also can be optimized using hyperparameter tunning implementations like sklearn.model_selection.GridSearchCV. But these methods takes too much time and computational power. So they are not always feasible.

In [17]:
model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer='adam', metrics=tf.keras.metrics.MeanSquaredError())
model.fit([df_train[["reviewer_id_number"]].to_numpy(),df_train[["item_id_number"]].to_numpy()], df_train[[ "overall"]].to_numpy(), epochs=2, batch_size=1000, verbose=1, validation_split=0.2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f6461617290>

I used mean square error as loss function. Since I am trying to predict review rating mean square error is reasonably between true ratings and predicted ratings.

In [18]:
model.evaluate([df_test[["reviewer_id_number"]].to_numpy(),df_test[["item_id_number"]].to_numpy()], df_test[[ "overall"]].to_numpy())



[0.5114557147026062, 0.5113393664360046]

In [19]:
predictionForReviewer = pd.DataFrame(items)
predictionForReviewer.rename(columns = {0:"item_id_number"}, inplace = True)
predictionForReviewer["reviewer_id_number"] = 1
predictionForReviewer.head()

Unnamed: 0,item_id_number,reviewer_id_number
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1


I run the model for one user and find 5 recomendations for him/her. To decrease the computations, instead of looking for all items we can randomly choose from them.

In [20]:
predictions = model.predict([predictionForReviewer[["reviewer_id_number"]].to_numpy(),predictionForReviewer[[ "item_id_number"]].to_numpy()])

In [22]:
predictionForReviewer["predictions"]= predictions.tolist()

In [27]:
predictionForReviewer.sort_values(by=['predictions'],ascending=False,ignore_index=True, inplace=True)

In [42]:
i=0
j=0
recomendations =[]
while(i<5):
  while (1,predictionForReviewer.iloc[j]["item_id_number"]) in zipped:
    j=j+1
  recomendations.append(predictionForReviewer.iloc[j]["item_id_number"])
  i=i+1
  j=j+1

In [43]:
recomendations

[4250, 5674, 665, 10160, 6126]

For further improvements, first user-item pairs without review can be add as negative review. Thus, model will have the knowledge of which items are reviewed which are not. Right now it only uses the given ratings information so in a way every other item is assumed to be given average rating of the user.

Another improvement could be combining this model with matrix factorization.