In [1]:
#import libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler

In [2]:
#import data
#the BX files are ';'-separated latin-1 text and contain some rows with too many fields;
#on_bad_lines='warn' skips those rows and reports them (replaces the deprecated error_bad_lines=False)
read_options = {'sep': ';', 'on_bad_lines': 'warn', 'encoding': 'latin-1'}
rating = pd.read_csv('data/BX-Book-Ratings.csv', **read_options)
user = pd.read_csv('data/BX-Users.csv', **read_options)
book = pd.read_csv('data/BX-Books.csv', **read_options)

#specify columns to drop
cols = ['Year-Of-Publication', 'Publisher', 'Book-Author', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L']

#merge books with their ratings using ISBN, then drop the unused columns
#(no inplace mutation, so re-running the cell always starts from the freshly merged frame)
book_rating = pd.merge(rating, book, on='ISBN').drop(columns=cols)
book_rating.head()

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title
0,276725,034545104X,0,Flesh Tones: A Novel
1,2313,034545104X,5,Flesh Tones: A Novel
2,6543,034545104X,0,Flesh Tones: A Novel
3,8680,034545104X,5,Flesh Tones: A Novel
4,10314,034545104X,9,Flesh Tones: A Novel


In [None]:
#count how many ratings each Book-Title received
#result has one row per title with columns Book-Title and RatingCount_book
rating_count = (
    book_rating
    .groupby('Book-Title')['Book-Rating']
    .count()
    .rename('RatingCount_book')
    .reset_index()
)

In [10]:
#preview the first 20 per-title rating counts
rating_count.head(20)

Unnamed: 0,Book-Title,RatingCount_book
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
5,Clifford Visita El Hospital (Clifford El Gran...,1
6,Dark Justice,1
7,Deceived,2
8,Earth Prayers From around the World: 365 Pray...,10
9,Final Fantasy Anthology: Official Strategy Gu...,4


In [12]:
#keep only books with at least `threshold` ratings
threshold = 15
rating_count = rating_count.loc[rating_count['RatingCount_book'] >= threshold]
rating_count.head(20)

Unnamed: 0,Book-Title,RatingCount_book
75,'Salem's Lot,47
203,10 Lb. Penalty,61
422,101 Dalmatians,37
673,"14,000 Things to Be Happy About",28
697,16 Lighthouse Road,65
764,1984,284
818,1st to Die: A Novel,509
913,2001: A Space Odyssey,25
946,2010: Odyssey Two,90
955,204 Rosewood Lane,71


In [13]:
#attach every individual rating to the books that passed the rating-count filter
#(left join on Book-Title, rating_count on the left)
user_rating = rating_count.merge(book_rating, on='Book-Title', how='left')

In [16]:
#preview the first 10 rows of the merged ratings
user_rating.head(10)

Unnamed: 0,Book-Title,RatingCount_book,User-ID,ISBN,Book-Rating
0,'Salem's Lot,47,8936,067103975X,0
1,'Salem's Lot,47,172245,067103975X,0
2,'Salem's Lot,47,189835,067103975X,5
3,'Salem's Lot,47,9226,0451168089,0
4,'Salem's Lot,47,33283,0451168089,10
5,'Salem's Lot,47,37950,0451168089,0
6,'Salem's Lot,47,55734,0451168089,0
7,'Salem's Lot,47,56044,0451168089,8
8,'Salem's Lot,47,59727,0451168089,0
9,'Salem's Lot,47,60263,0451168089,10


In [17]:
#count how many ratings each User-ID has given
#result has one row per user with columns User-ID and RatingCount_user
user_count = (
    user_rating
    .groupby('User-ID')['Book-Rating']
    .count()
    .rename('RatingCount_user')
    .reset_index()
)

In [18]:
#preview the first 20 per-user rating counts
user_count.head(20)

Unnamed: 0,User-ID,RatingCount_user
0,8,2
1,9,2
2,10,1
3,14,1
4,16,2
5,17,4
6,19,1
7,23,1
8,26,2
9,32,2


In [19]:
#keep only users with at least `threshold` ratings
threshold = 15

user_count = user_count.loc[user_count['RatingCount_user'] >= threshold]
user_count.head()

Unnamed: 0,User-ID,RatingCount_user
52,243,68
54,254,139
69,383,17
89,487,21
96,507,61


In [20]:
#inner join on User-ID: keeps only the ratings made by users that passed the filter,
#so each remaining user appears with all the books they rated
combined = pd.merge(user_rating, user_count, on='User-ID', how='inner')

In [21]:
#preview the first 10 rows of the filtered user/book ratings
combined.head(10)

Unnamed: 0,Book-Title,RatingCount_book,User-ID,ISBN,Book-Rating,RatingCount_user
0,'Salem's Lot,47,8936,067103975X,0,177
1,1st to Die: A Novel,509,8936,0446610038,0,177
2,A Case of Need,236,8936,0451210638,0,177
3,A Perfect Stranger,54,8936,0440168724,0,177
4,Accident,126,8936,0440217547,0,177
5,All I Need Is You,60,8936,0380762609,0,177
6,All That Remains (Kay Scarpetta Mysteries (Pap...,184,8936,0380718332,9,177
7,BODY FARM,50,8936,0684195976,0,177
8,Bag of Bones,195,8936,067102423X,0,177
9,Best Of Enemies,37,8936,1551662779,8,177


In [22]:
#report how many distinct books and users remain after filtering
for label, column in (('Number of unique books: ', 'Book-Title'),
                      ('Number of unique users: ', 'User-ID')):
    print(label, combined[column].nunique())

Number of unique books:  5854
Number of unique users:  4121


In [23]:
#prepare the ratings for min-max normalisation

#cast book ratings to float
combined['Book-Rating'] = combined['Book-Rating'].astype(float)

#scaler that is fitted on the ratings in a later cell
scaler = MinMaxScaler()

In [24]:
#check that Book-Rating is now stored as float
combined.head(20)

Unnamed: 0,Book-Title,RatingCount_book,User-ID,ISBN,Book-Rating,RatingCount_user
0,'Salem's Lot,47,8936,067103975X,0.0,177
1,1st to Die: A Novel,509,8936,0446610038,0.0,177
2,A Case of Need,236,8936,0451210638,0.0,177
3,A Perfect Stranger,54,8936,0440168724,0.0,177
4,Accident,126,8936,0440217547,0.0,177
5,All I Need Is You,60,8936,0380762609,0.0,177
6,All That Remains (Kay Scarpetta Mysteries (Pap...,184,8936,0380718332,9.0,177
7,BODY FARM,50,8936,0684195976,0.0,177
8,Bag of Bones,195,8936,067102423X,0.0,177
9,Best Of Enemies,37,8936,1551662779,8.0,177


In [28]:
#fit the scaler on the ratings as a single-column 2-D array and keep the scaled values in a one-column frame
rating_scaled = pd.DataFrame(
    scaler.fit_transform(combined[['Book-Rating']].to_numpy())
)

In [29]:
#overwrite the raw ratings with the scaled ones (column 0 of rating_scaled, aligned on the row index)
combined['Book-Rating'] = rating_scaled[0]

In [31]:
#(rows, columns) of the long-format ratings table
combined.shape

(279092, 6)

In [33]:
#build user-book matrix

#pivot needs each (user, title) pair to be unique, so keep the first row of every pair
combined = combined.drop_duplicates(subset=['User-ID', 'Book-Title'])

#long -> wide: one row per user, one column per title; pairs without a rating become 0
user_book_matrix = (
    combined
    .pivot(index='User-ID', columns='Book-Title', values='Book-Rating')
    .fillna(0)
)

#row and column labels, in matrix order
users = user_book_matrix.index.tolist()
books = user_book_matrix.columns.tolist()

In [36]:
#create matrix
#np.asarray accepts both the pivoted DataFrame and an ndarray, so re-running this cell
#does not fail the way .values does once the name has been rebound to an array
user_book_matrix = np.asarray(user_book_matrix)

In [37]:
#display the dense user-by-book rating array
user_book_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [38]:
#build model

#switch to the TensorFlow 1.x compatibility API; this rebinds the name tf imported in the first cell
import tensorflow.compat.v1 as tf
#turn off TF2 behaviour so the placeholder/Session style code in the following cells can be used
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [39]:
#fix the graph-level seed before any random op is created, so that the random
#initial weights are the same on every fresh-kernel run of the notebook
SEED = 42
tf.set_random_seed(SEED)

#layer sizes: one input unit per book title, then two narrower hidden layers
num_input = combined['Book-Title'].nunique()
num_hidden_1 = 10
num_hidden_2 = 5

#input placeholder: a batch of user rows, one column per book
X = tf.placeholder(tf.float64, [None, num_input])

#weight matrices, keyed by layer; shapes mirror each other: input -> h1 -> h2 -> h1 -> input
weights = {
    'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1], dtype=tf.float64)),
    'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2], dtype=tf.float64)),
    'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1], dtype=tf.float64)),
    'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input], dtype=tf.float64)),
}

#bias vectors, one per layer, sized to that layer's output width
biases = {
    'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2], dtype=tf.float64)),
    'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'decoder_b2': tf.Variable(tf.random_normal([num_input], dtype=tf.float64)),
}

In [40]:
#build encoder and decoder
def _sigmoid_layer(inputs, weight_key, bias_key):
    """Return sigmoid(inputs @ weights[weight_key] + biases[bias_key])."""
    pre_activation = tf.add(tf.matmul(inputs, weights[weight_key]), biases[bias_key])
    return tf.nn.sigmoid(pre_activation)


def encoder(x):
    """Pass x through the two encoder layers and return the second layer's output."""
    hidden = _sigmoid_layer(x, 'encoder_h1', 'encoder_b1')
    return _sigmoid_layer(hidden, 'encoder_h2', 'encoder_b2')


def decoder(x):
    """Pass x through the two decoder layers and return the second layer's output."""
    hidden = _sigmoid_layer(x, 'decoder_h1', 'decoder_b1')
    return _sigmoid_layer(hidden, 'decoder_h2', 'decoder_b2')

In [41]:
#wire the autoencoder: the input placeholder is both the network input and the reconstruction target
y_true = X

encoder_op = encoder(y_true)
decoder_op = decoder(encoder_op)

#the decoder output is the model's prediction
y_pred = decoder_op

In [42]:
#define loss function, optimiser, and evaluation metrics

#mean squared error between the input and its reconstruction
loss = tf.losses.mean_squared_error(labels=y_true, predictions=y_pred)

#RMSProp training step with learning rate 0.03
optimizer = tf.train.RMSPropOptimizer(learning_rate=0.03).minimize(loss)

#integer placeholders feeding a streaming precision metric
eval_x = tf.placeholder(tf.int32)
eval_y = tf.placeholder(tf.int32)
pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [43]:
#create empty data frame to store top 10 results
pred_data = pd.DataFrame()

#ops that initialise the global variables and the local variables of the graph
init = tf.global_variables_initializer()
local_init = tf.local_variables_initializer()

In [44]:
#train model
epochs = 100
batch_size = 35

#at least one batch, so np.array_split is never asked for 0 sections when
#there are fewer users than batch_size
num_batches = max(1, user_book_matrix.shape[0] // batch_size)

#keep the batches under their own name: user_book_matrix stays a single 2-D
#array, so the cell can be re-run even after an interrupted training run
batches = np.array_split(user_book_matrix, num_batches)

with tf.Session() as session:
    session.run(init)
    session.run(local_init)

    for i in range(epochs):

        #accumulate the loss of every batch, then report the epoch average
        avg_cost = 0
        for batch in batches:
            _, batch_loss = session.run([optimizer, loss], feed_dict={X: batch})
            avg_cost += batch_loss

        avg_cost /= num_batches

        print("epoch: {} Loss: {}".format(i + 1, avg_cost))

    #reconstruct every user's row with the trained autoencoder
    preds = session.run(decoder_op, feed_dict={X: user_book_matrix})

#long format: one row per (user, book) with the predicted rating.
#the frame is built from scratch (DataFrame.append is gone in pandas 2.0 and made
#re-runs accumulate rows) and labelled up front, so no per-row lookup is needed
pred_data = (
    pd.DataFrame(
        preds,
        index=pd.Index(users, name='User-ID'),
        columns=pd.Index(books, name='Book-Title'),
    )
    .stack()
    .reset_index(name='Book-Rating')
)

keys = ['User-ID', 'Book-Title']
index_1 = pred_data.set_index(keys).index
index_2 = combined.set_index(keys).index

#remove books that user has already rated
top_ten_ranked = pred_data[~index_1.isin(index_2)]
#sort by user, then by predicted rating (highest first)
top_ten_ranked = top_ten_ranked.sort_values(['User-ID', 'Book-Rating'], ascending=[True, False])
#keep the 10 highest-scoring unseen books per user
top_ten_ranked = top_ten_ranked.groupby('User-ID').head(10)

2021-09-06 23:50:49.385232: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


epoch: 1 Loss: 0.3524068408032768
epoch: 2 Loss: 0.1997118009429457
epoch: 3 Loss: 0.003909733309609513
epoch: 4 Loss: 0.0026441452354113134
epoch: 5 Loss: 0.0025320035299986727
epoch: 6 Loss: 0.002500510814352932
epoch: 7 Loss: 0.00248336023850064
epoch: 8 Loss: 0.0024725630537121212
epoch: 9 Loss: 0.002465053243586459
epoch: 10 Loss: 0.002372878230511187
epoch: 11 Loss: 0.0022858258647223315
epoch: 12 Loss: 0.0022826530506961746
epoch: 13 Loss: 0.0022801690283191637
epoch: 14 Loss: 0.0022781517885975605
epoch: 15 Loss: 0.002276483107294537
epoch: 16 Loss: 0.0022750826817180994
epoch: 17 Loss: 0.002273892727083503
epoch: 18 Loss: 0.0022728700284312805
epoch: 19 Loss: 0.0022719820889715967
epoch: 20 Loss: 0.002271204078410808
epoch: 21 Loss: 0.0022705169434412424
epoch: 22 Loss: 0.002269905801806758
epoch: 23 Loss: 0.0022693592448058166
epoch: 24 Loss: 0.002268868299503612
epoch: 25 Loss: 0.0022684260128209223
epoch: 26 Loss: 0.002268026540111591
epoch: 27 Loss: 0.0022676646804962405
e

In [57]:
#list the unique user IDs in the filtered data, to pick one to test
combined['User-ID'].unique()

array([  8936, 189835,  33283, ...,   7841,  14180, 163851])

In [59]:
#show the top 10 recommended books for user 14180
top_ten_ranked[top_ten_ranked['User-ID'] == 14180]

Unnamed: 0,User-ID,Book-Title,Book-Rating
988230,14180,The Lovely Bones: A Novel,0.066331
987777,14180,The Da Vinci Code,0.054736
985311,14180,Harry Potter and the Chamber of Secrets (Book 2),0.040342
985315,14180,Harry Potter and the Prisoner of Azkaban (Book 3),0.039456
988535,14180,The Secret Life of Bees,0.03888
984174,14180,Bridget Jones's Diary,0.035969
985880,14180,Life of Pi,0.035051
985317,14180,Harry Potter and the Sorcerer's Stone (Harry P...,0.034827
985313,14180,Harry Potter and the Goblet of Fire (Book 4),0.033937
988459,14180,The Red Tent (Bestselling Backlist),0.033574


In [58]:
#list everything user 14180 has rated, highest rating first, to compare with the recommendations
book_rating[book_rating['User-ID'] == 14180].sort_values('Book-Rating', ascending=False)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title
530225,14180,0812564375,9,Mount Dragon: A Novel
324198,14180,0446531421,9,Still Life With Crows
166702,14180,0812542835,9,Reliquary
197812,14180,0440235502,9,October Sky: A Memoir
377680,14180,0446608378,9,Thunderhead
530175,14180,0446523364,8,Riptide
479412,14180,0553276328,8,Pacific Vortex! (Dirk Pitt Adventures (Paperba...
401657,14180,0671519816,8,Inca Gold (Clive Cussler)
362985,14180,0688147259,8,Notes from a Small Island
102137,14180,0345353145,7,Sphere
