In [1]:
# libraries
import pandas as pd
import numpy as np
from matrix_factorization_tf import MF

pd.set_option('display.max_columns', 500)

In [2]:
##### import data
print('Reading Data')
df_ratings = pd.read_csv('data/ratings.csv')
df_books = pd.read_csv('data/books.csv')
df_books = df_books[['book_id', 'isbn', 'isbn13', 'authors', 'original_publication_year', 'original_title']].copy()

#### preprocess
print('Preprocessing')
cnt = df_ratings.groupby('user_id')['book_id'].count()
lower = cnt.mean() - 2*cnt.std() # lower bound of books read
users = cnt[(cnt>lower)].index # select users who've read gt lower bound
df_ratings = df_ratings[df_ratings.user_id.isin(users)]

drop_books = (df_books[df_books.original_title.isna()]['book_id']).tolist() # drop books with missing title
df_books = df_books[~df_books.book_id.isin(drop_books)].copy()
df_books.reset_index(inplace=True, drop=True)

df_ratings = df_ratings[~df_ratings.book_id.isin(drop_books)].copy()
df_ratings.reset_index(inplace=True, drop=True)

Reading Data
Preprocessing


In [3]:
R = df_ratings.pivot(index='user_id', columns='book_id', values='rating') # user rating pivoted df
df_R = R.copy()
R.fillna(0, inplace=True)
R = np.array(R)

In [4]:
sparsity = len(R[R==0]) / (R.shape[0] * R.shape[1])
sparsity

0.9882459882684972

In [5]:
#### train
recommender = MF(k = 20, rating_matrix=R, epochs = 20, learning_rate = 0.1)
recommender_train = recommender.train()

2022-03-13 21:46:56.028796: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [6]:
print(recommender.model.summary())

Model: "matrix_factorization"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 item (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 user_embedding (Embedding)     (None, 1, 20)        1040480     ['user[0][0]']                   
                                                                                                  
 item_embedding (Embedding)     (None, 1, 20)        188300      ['item[0][0]']                   
                                                                               

In [7]:
print(recommender.history.history)

{'loss': [9.489164352416992, 0.977279007434845, 0.7429345846176147, 0.7138020396232605, 0.6926698088645935, 0.6717773675918579, 0.650764524936676, 0.6300435662269592, 0.6107507944107056, 0.5940210223197937, 0.5799860954284668, 0.5682482719421387, 0.558096170425415, 0.5492408275604248, 0.5412793755531311, 0.5340884327888489, 0.5275207161903381, 0.5215716361999512, 0.5160977840423584, 0.5109972357749939], 'mae': [2.6244306564331055, 0.7654615044593811, 0.6750440001487732, 0.6603385806083679, 0.6491198539733887, 0.6379140019416809, 0.62665194272995, 0.6153124570846558, 0.6047202348709106, 0.5951724052429199, 0.587103545665741, 0.5805130004882812, 0.5748023390769958, 0.5696890354156494, 0.5652161836624146, 0.5611690282821655, 0.5575564503669739, 0.5542729496955872, 0.5512025952339172, 0.5483783483505249], 'mse': [9.489164352416992, 0.977279007434845, 0.7429345846176147, 0.7138020396232605, 0.6926698088645935, 0.6717773675918579, 0.650764524936676, 0.6300435662269592, 0.6107507944107056, 0.

In [8]:
pred_rm, pred_um, pred_im, sim_matrix = recommender.get_sim_matrix() # read predictions from checkpoint



In [9]:
entered_book_name = input("Enter book:") # Harry Potter and the Philosopher's Stone
number_of_book_recos = input("Enter # of recos:")

Enter book:great gatsby
Enter # of recos:10


In [10]:
chosen_book_name = df_books[df_books.original_title.str.lower().str.contains(entered_book_name.lower())].original_title.values[0]
chosen_book_id = df_books[df_books.original_title.str.lower().str.contains(entered_book_name.lower())].book_id.values[0]
chosen_book_index = df_R.columns.get_loc(chosen_book_id)
recommended_book_indices = list(pd.Series(sim_matrix[chosen_book_index]).sort_values(ascending=False).index[:int(number_of_book_recos)].values)
cosine_sim_scores = list(pd.Series(sim_matrix[chosen_book_index]).sort_values(ascending=False)[:int(number_of_book_recos)].values)

In [11]:
print("You've selected:", chosen_book_name, "\nBook ID:", chosen_book_id)

You've selected: The Great Gatsby 
Book ID: 5


In [12]:
recommended_book_ids = list(df_R.columns[recommended_book_indices])

In [13]:
final_recos = df_books[df_books.book_id.isin(recommended_book_ids)][['original_title', 'authors', 'original_publication_year', 'isbn']]
final_recos['cosine_sim_score'] = cosine_sim_scores
final_recos

Unnamed: 0,original_title,authors,original_publication_year,isbn,cosine_sim_score
4,The Great Gatsby,F. Scott Fitzgerald,1925.0,743273567,0.999999
143,The Goldfinch,Donna Tartt,2013.0,316055433,0.980207
178,In Cold Blood,Truman Capote,1965.0,679745580,0.980119
447,A Streetcar Named Desire,Tennessee Williams,1947.0,822210894,0.978924
512,The Things They Carried,Tim O'Brien,1990.0,767902890,0.978628
645,Dreams from My Father,Barack Obama,1995.0,1921351438,0.978479
749,The Autobiography of Malcolm X,"Malcolm X, Alex Haley",1965.0,345350685,0.978257
922,The Hours,Michael Cunningham,1998.0,312305060,0.978247
988,The Interestings,Meg Wolitzer,2013.0,1594488398,0.978017
1009,A Moveable Feast,Ernest Hemingway,1964.0,99285045,0.977866
