In [1]:
import numpy as np
import json
import pickle

In [13]:
import tensorflow.keras as keras

# This method will function as a baseline model to compare with our current embedding method.

In [2]:
# Load the precomputed user embeddings from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open('./data/user_embedding.firefire', "rb") as file:
    # The `with` block closes the file on exit, so no explicit close() is needed.
    user_embedding = pickle.load(file)

In [3]:
# Number of users that have an embedding (presumably a dict keyed by user id).
len(user_embedding)

51184

In [4]:
# Load the precomputed book embeddings from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open('./data/book_embedding.firefire', "rb") as file:
    # The `with` block closes the file on exit, so no explicit close() is needed.
    book_embedding = pickle.load(file)

In [5]:
# Number of books that have an embedding (presumably a dict keyed by book id).
len(book_embedding)

86276

# Let's generate training and testing samples

In [6]:
# Map every user key to a dense integer index (its position in `users`).
users = list(user_embedding.keys())
user_lookup = {user_key: position for position, user_key in enumerate(users)}

In [7]:
# Map every book key to a dense integer index (its position in `books`).
books = list(book_embedding.keys())
book_lookup = {book_key: position for position, book_key in enumerate(books)}

In [8]:
# Drop the temporary key lists from the namespace.
del users
del books

In [9]:
# Accumulators for the split: parallel lists of book index, user index and rating.
test_x_book, test_x_user, test_y = [], [], []
train_x_book, train_x_user, train_y = [], [], []

In [10]:
# Build a reproducible train/test split of the rated reviews.
#
# Every review with a non-zero rating whose book and user are both present in
# the lookup tables is assigned a pre-drawn random flag in {0, 1, 2, 3, 4}:
# flag 4 sends it to the test lists, any other flag to the training lists.
np.random.seed(9001)
# 515595 flags are drawn up front. This must be at least the number of accepted
# reviews, otherwise rand_nums[index] raises IndexError -- TODO confirm this
# bound if the dataset changes.
rand_nums = [np.random.randint(5) for _ in range(515595)]
with open('./data/goodreads_reviews_comics_graphic.json','r') as file:
    # `index` counts accepted reviews only, so skipped reviews consume no flag.
    index = 0
    for review in file:
        record = json.loads(review)
        if record['rating'] == 0:
            continue  # a zero rating is skipped

        # Skip reviews whose book or user is missing from the lookup tables.
        # Only KeyError is caught, so real problems (bad data, Ctrl-C) surface
        # instead of being silently swallowed by a bare `except`.
        try:
            book_id = book_lookup[record['book_id']]
            user_id = user_lookup[record['user_id']]
        except KeyError:
            continue

        flag = rand_nums[index]
        if flag == 4:
            # treat it as test data
            test_x_book.append(book_id)
            test_x_user.append(user_id)
            test_y.append(record['rating'])
        else:
            # keep this pair for training
            train_x_book.append(book_id)
            train_x_user.append(user_id)
            train_y.append(record['rating'])
        index += 1
    # The `with` block closes the file on exit; no explicit close() is needed.

# Model with bias and regularization

Note: Keras layers can add a bias automatically, but we also want to regularize the bias terms, so we add them explicitly (as one-dimensional embeddings).

In [12]:
# Model hyperparameters.
num_dimension = 100  # size of each latent embedding vector
alpha = 0.01         # coefficient handed to the L2 regularizers

# Vocabulary sizes, taken from the index lookup tables.
num_users = len(user_lookup)
num_books = len(book_lookup)

In [20]:
# Matrix-factorisation baseline:
#     prediction = dot(book_vector, user_vector) + book_bias + user_bias
# All four embedding tables carry an L2 activity regularizer weighted by `alpha`.

# Book branch: integer book index -> latent vector of size num_dimension.
input1 = keras.Input(shape=(None,), name="book_input")
input_embe1 = keras.layers.Embedding( num_books, num_dimension, activity_regularizer=keras.regularizers.l2(alpha))(input1)
flat_embe1 = keras.layers.Flatten()(input_embe1)

# User branch: integer user index -> latent vector of size num_dimension.
input2 = keras.Input(shape=(None,),name="user_input")
input_embe2 = keras.layers.Embedding( num_users, num_dimension, activity_regularizer=keras.regularizers.l2(alpha))(input2)
flat_embe2 = keras.layers.Flatten()(input_embe2)

# Interaction term: dot product of the two latent vectors.
dot = keras.layers.Dot(axes=1)([flat_embe1,flat_embe2])

# Bias terms, implemented as one-dimensional embeddings so they can be regularized.
bias1 = keras.layers.Embedding( num_books, 1, activity_regularizer=keras.regularizers.l2(alpha))(input1)
flat_bias1 = keras.layers.Flatten()(bias1)
# The user bias is looked up by user index, so its table must be sized with
# num_users (it was previously sized with num_books by copy-paste mistake).
bias2 = keras.layers.Embedding( num_users, 1, activity_regularizer=keras.regularizers.l2(alpha))(input2)
flat_bias2 = keras.layers.Flatten()(bias2)


# The final prediction is the sum of the dot product and the two bias terms.
output = keras.layers.Add()([dot,flat_bias1,flat_bias2])

model= keras.models.Model(inputs=[input1,input2],outputs=[output])
model.compile(loss="mse", optimizer="adam")

In [21]:
# Save the model to disk, keeping only the best one seen so far.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    filepath="keras_factor_model.h5",
    save_best_only=True,
)

# Stop after 5 epochs without improvement and roll back to the best weights.
early_stopping_cb = keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=5,
)

In [None]:
def _as_column(values):
    """Return `values` as a NumPy array with a single column."""
    return np.array(values).reshape((-1, 1))


# Inputs are (book indices, user indices); targets are the ratings.
train_inputs = (_as_column(train_x_book), _as_column(train_x_user))
train_targets = _as_column(train_y)

# NOTE(review): the test split is also used as the validation set here, so it
# drives both checkpointing and early stopping.
validation_inputs = (_as_column(test_x_book), _as_column(test_x_user))
validation_targets = _as_column(test_y)

model.fit(
    train_inputs,
    train_targets,
    batch_size=64,
    epochs=20,
    validation_data=(validation_inputs, validation_targets),
    callbacks=[checkpoint_cb, early_stopping_cb],
)

Train on 412369 samples, validate on 103223 samples
Epoch 1/20
 93824/412369 [=====>........................] - ETA: 9:45 - loss: 14.9656