In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf 
from sklearn.preprocessing import MinMaxScaler


2022-03-18 13:51:10.545457: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-18 13:51:10.545494: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
book = pd.read_csv('clubs_book.csv', encoding="latin-1")
user = pd.read_csv('clubs_user.csv', encoding="latin-1")
rating = pd.read_csv('ratings.csv', sep=';', encoding="latin-1")

### merging data and filtering by rating

In [4]:
# merge user rating and book data
book_rating = pd.merge(rating, book, on='ISBN')

In [5]:
book_rating.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,id,title,author,publisher,publication_year
0,276964,451180054,0,58,Trial by Fire,Nancy Taylor Rosenberg,Signet Book,1996
1,6323,451180054,0,58,Trial by Fire,Nancy Taylor Rosenberg,Signet Book,1996
2,11676,451180054,7,58,Trial by Fire,Nancy Taylor Rosenberg,Signet Book,1996
3,32195,451180054,0,58,Trial by Fire,Nancy Taylor Rosenberg,Signet Book,1996
4,36606,451180054,0,58,Trial by Fire,Nancy Taylor Rosenberg,Signet Book,1996


In [6]:
# filtering books that have had => 15 ratings

rating_count = (book_rating.
     groupby(by = ['title'])['Book-Rating'].
     count().
     reset_index().
     rename(columns = {'Book-Rating': 'RatingCount_book'})
     [['title', 'RatingCount_book']]
    )

threshold = 15
rating_count = rating_count.query('RatingCount_book >= @threshold')

In [7]:
book_rating.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,id,title,author,publisher,publication_year
0,276964,451180054,0,58,Trial by Fire,Nancy Taylor Rosenberg,Signet Book,1996
1,6323,451180054,0,58,Trial by Fire,Nancy Taylor Rosenberg,Signet Book,1996
2,11676,451180054,7,58,Trial by Fire,Nancy Taylor Rosenberg,Signet Book,1996
3,32195,451180054,0,58,Trial by Fire,Nancy Taylor Rosenberg,Signet Book,1996
4,36606,451180054,0,58,Trial by Fire,Nancy Taylor Rosenberg,Signet Book,1996


In [8]:
# filtering users that are actively rating

user_rating = pd.merge(rating_count, book_rating, left_on='title', right_on='title', how='left')

user_count = (user_rating.
     groupby(by = ['User-ID'])['Book-Rating'].
     count().
     reset_index().
     rename(columns = {'Book-Rating': 'RatingCount_user'})
     [['User-ID', 'RatingCount_user']]
    )
    
threshold = 1
user_count = user_count.query('RatingCount_user >= @threshold')

combined = user_rating.merge(user_count, left_on = 'User-ID', right_on = 'User-ID', how = 'inner')

print('Number of unique books: ', combined['title'].nunique())
print('Number of unique users: ', combined['User-ID'].nunique())

Number of unique books:  16
Number of unique users:  344


In [9]:
# normalise the rating feature

scaler = MinMaxScaler()
combined['Book-Rating'] = combined['Book-Rating'].values.astype(float)
rating_scaled = pd.DataFrame(scaler.fit_transform(combined['Book-Rating'].values.reshape(-1,1)))
combined['Book-Rating'] = rating_scaled

### create user - book matrix

In [10]:

# creating user - book matrix 

combined = combined.drop_duplicates(['User-ID', 'title'])
user_book_matrix = combined.pivot(index='User-ID', columns='title', values='Book-Rating')
user_book_matrix.fillna(0, inplace=True)
users = user_book_matrix.index.tolist()
books = user_book_matrix.columns.tolist()
user_book_matrix = user_book_matrix.to_numpy()

In [11]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [12]:
# set up of network parameters, such as dimensions of each hidden layer 

num_input = combined['title'].nunique()
num_hidden_1 = 10
num_hidden_2 = 5

# initialise the TensorFlow placeholder
X = tf.placeholder(tf.float64, [None, num_input])

# weights and biases are randomly initialised
weights = {
    'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1], dtype=tf.float64)),
    'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2], dtype=tf.float64)),
    'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1], dtype=tf.float64)),
    'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input], dtype=tf.float64)),
}

biases = {
    'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2], dtype=tf.float64)),
    'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1], dtype=tf.float64)),
    'decoder_b2': tf.Variable(tf.random_normal([num_input], dtype=tf.float64)),
}

### build the encode / decode model & create predictions

In [13]:
# building the encode & decode model 

def encoder(x):
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['encoder_h1']), biases['encoder_b1']))
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['encoder_h2']), biases['encoder_b2']))
    return layer_2

def decoder(x):
    layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, weights['decoder_h1']), biases['decoder_b1']))
    layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']), biases['decoder_b2']))
    return layer_2

In [14]:
# construct model and create predictions

encoder_op = encoder(X)
decoder_op = decoder(encoder_op)
y_pred = decoder_op
y_true = X

In [15]:
# define loss function and optimiser, minimise the squared error, and define the evaluation metrics
loss = tf.losses.mean_squared_error(y_true, y_pred)
optimizer = tf.train.RMSPropOptimizer(0.03).minimize(loss)
eval_x = tf.placeholder(tf.int32, )
eval_y = tf.placeholder(tf.int32, )
pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [16]:
# initialising placeholders and variables
init = tf.global_variables_initializer()
local_init = tf.local_variables_initializer()

# df for storing predictions
pred_data = pd.DataFrame() 

### training the model

In [17]:
with tf.Session() as session:
    epochs = 10
    batch_size = 2

    session.run(init)
    session.run(local_init)

    num_batches = int(user_book_matrix.shape[0] / batch_size)
    user_book_matrix = np.array_split(user_book_matrix, num_batches)
    
    for i in range(epochs):

        avg_cost = 0
        for batch in user_book_matrix:
            _, l = session.run([optimizer, loss], feed_dict={X: batch})
            avg_cost += l

        avg_cost /= num_batches

        print("epoch: {} Loss: {}".format(i + 1, avg_cost))

    user_book_matrix = np.concatenate(user_book_matrix, axis=0)

    preds = session.run(decoder_op, feed_dict={X: user_book_matrix})

    pred_data = pred_data.append(pd.DataFrame(preds))

    pred_data = pred_data.stack().reset_index(name='Book-Rating')
    pred_data.columns = ['User-ID', 'title', 'Book-Rating']
    pred_data['User-ID'] = pred_data['User-ID'].map(lambda value: users[value])
    pred_data['title'] = pred_data['title'].map(lambda value: books[value])
    
    keys = ['User-ID', 'title']
    index_1 = pred_data.set_index(keys).index
    index_2 = combined.set_index(keys).index

2022-03-18 13:51:19.653846: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-03-18 13:51:19.654856: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-03-18 13:51:19.655155: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (chiaseed): /proc/driver/nvidia/version does not exist
2022-03-18 13:51:19.658529: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


epoch: 1 Loss: 0.13777965161750408
epoch: 2 Loss: 0.01646940456335277
epoch: 3 Loss: 0.014739177476490853
epoch: 4 Loss: 0.013246198771052741
epoch: 5 Loss: 0.011827640052996325
epoch: 6 Loss: 0.010399301171610417
epoch: 7 Loss: 0.008893033049029919
epoch: 8 Loss: 0.007623793237054236
epoch: 9 Loss: 0.006580519071518253
epoch: 10 Loss: 0.00593684148824925


  pred_data = pred_data.append(pd.DataFrame(preds))


### testing the model

In [18]:
# top ten books by user
top_ten_ranked = pred_data[~index_1.isin(index_2)] 
top_ten_ranked = top_ten_ranked.sort_values(['User-ID', 'Book-Rating'], ascending=[True, False])
top_ten_ranked = top_ten_ranked.groupby('User-ID').head(10)

In [19]:
# top ten for chosen user
top_ten_ranked.loc[top_ten_ranked['User-ID'] == 882]

Unnamed: 0,User-ID,title,Book-Rating
3,882,Commitments (Vintage Contemporaries),0.007767
15,882,Trial by Fire,0.007218
11,882,Scandal,0.006234
10,882,Paranoia : A Novel,0.004398
8,882,Mr. X,0.003733
1,882,Chicken Soup for the Kid's Soul : 101 Stories ...,0.003024
7,882,Midnight Sun,0.000874
6,882,Little Women (Illustrated Junior Library),0.000654
9,882,Oliver Twist (Wordsworth Classics),0.000232
4,882,Handyman,0.000156


In [20]:
# other books rated by chosen user

book_rating.loc[book_rating['User-ID'] == 882].sort_values(by=['Book-Rating'], ascending=False)

Unnamed: 0,User-ID,ISBN,Book-Rating,id,title,author,publisher,publication_year
56,882,671644475,0,166,The Conquest,Jude Deveraux,Pocket,1991


In [21]:
pred_data.head(235)

Unnamed: 0,User-ID,title,Book-Rating
0,882,Along Came a Spider,0.000010
1,882,Chicken Soup for the Kid's Soul : 101 Stories ...,0.003024
2,882,Coffee Will Make You Black,0.000006
3,882,Commitments (Vintage Contemporaries),0.007767
4,882,Handyman,0.000156
...,...,...,...
230,16795,Little Women (Illustrated Junior Library),0.000654
231,16795,Midnight Sun,0.000874
232,16795,Mr. X,0.003733
233,16795,Oliver Twist (Wordsworth Classics),0.000232


In [22]:
# the often read books from prediction df
most_read_books = pd.DataFrame(pred_data["title"].value_counts())
most_read_books

Unnamed: 0,title
Along Came a Spider,344
"Chicken Soup for the Kid's Soul : 101 Stories of Courage, Hope and Laughter (Chicken Soup for the Soul (Paperback Health Communications))",344
Coffee Will Make You Black,344
Commitments (Vintage Contemporaries),344
Handyman,344
Human Stain,344
Little Women (Illustrated Junior Library),344
Midnight Sun,344
Mr. X,344
Oliver Twist (Wordsworth Classics),344
