# Collaborative Filtering Book Recommender System

In [22]:
# import dependencies
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [23]:
# read datasets
books_df = pd.read_csv('/Users/reddragon/Documents/SMU/Y2S2/IS215 Digital Business/project /book_recommendation_dataset/Books.csv', nrows=10000)
users_df = pd.read_csv('/Users/reddragon/Documents/SMU/Y2S2/IS215 Digital Business/project /book_recommendation_dataset/Users.csv')
ratings_df = pd.read_csv('/Users/reddragon/Documents/SMU/Y2S2/IS215 Digital Business/project /book_recommendation_dataset/Ratings.csv')

In [24]:
books_df.head()
# users_df.head()
# ratings_df.head()
books_df.shape

(10000, 8)

In [25]:
user_ratings_df = ratings_df.merge(users_df, left_on = 'User-ID', right_on = 'User-ID')
user_ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Location,Age
0,276725,034545104X,0,"tyler, texas, usa",
1,276726,0155061224,5,"seattle, washington, usa",
2,276727,0446520802,0,"h, new south wales, australia",16.0
3,276729,052165615X,3,"rijeka, n/a, croatia",16.0
4,276729,0521795028,6,"rijeka, n/a, croatia",16.0


<h4> Collaborative Filtering </h4>

In [26]:
book_user_rating = books_df.merge(user_ratings_df, left_on = 'ISBN',right_on = 'ISBN')
book_user_rating = book_user_rating[['ISBN', 'Book-Title', 'Book-Author', 'User-ID', 'Book-Rating']]
book_user_rating.reset_index(drop=True, inplace = True)

In [27]:
d ={}
for i,j in enumerate(book_user_rating.ISBN.unique()):
    d[j] =i
book_user_rating['unique_id_book'] = book_user_rating['ISBN'].map(d)

In [28]:
users_books_pivot_matrix_df = book_user_rating.pivot(index='User-ID', 
                                                          columns='unique_id_book', 
                                                          values='Book-Rating').fillna(0)

In [29]:
users_books_pivot_matrix_df.head()

unique_id_book,0,1,2,3,4,5,6,7,8,9,...,9983,9984,9985,9986,9987,9988,9989,9990,9991,9992
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
users_books_pivot_matrix_df = users_books_pivot_matrix_df.values
users_books_pivot_matrix_df

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [31]:
from scipy.sparse.linalg import svds

NUMBER_OF_FACTORS_MF = 15

#Performs matrix factorization of the original user item matrix
U, sigma, Vt = svds(users_books_pivot_matrix_df, k = NUMBER_OF_FACTORS_MF)

In [32]:
sigma = np.diag(sigma)
sigma.shape

(15, 15)

In [33]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
all_user_predicted_ratings

array([[ 7.80610543e-30, -2.49862865e-15, -7.28339294e-18, ...,
        -1.85062842e-15, -1.03978747e-18, -3.49894426e-18],
       [ 4.25230376e-18,  4.12103238e-03,  3.63375236e-07, ...,
         3.08583294e-03,  2.85026509e-06,  9.55878571e-06],
       [-2.56221522e-18,  4.45679132e-03,  9.16016634e-06, ...,
         2.95656178e-03,  2.39785202e-06,  8.91554893e-06],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-6.12574174e-18,  1.04995894e-02,  1.54975135e-05, ...,
         7.76245426e-03,  6.13067882e-06,  2.10459545e-05],
       [-6.60955579e-17, -1.72873501e-03,  4.31139482e-05, ...,
        -1.43453494e-03, -5.90788780e-06, -2.46316681e-05]])

<h4> Functions </h4>

In [34]:
def top_cosine_similarity(data, book_id, top_n=10):
    index = book_id 
    book_row = data[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(book_row, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]

def similar_books(book_user_rating, book_id, top_indexes):
    print('Recommendations for {0}: \n'.format(
    book_user_rating[book_user_rating.unique_id_book == book_id]['Book-Title'].values[0]))
    for id in top_indexes + 1:
        print(book_user_rating[book_user_rating.unique_id_book == id]['Book-Title'].values[0])

<h4> Output </h4>

In [35]:
k = 50
movie_id =1234  
top_n = 3
sliced = Vt.T[:, :k] # representative data

similar_books(book_user_rating, movie_id, top_cosine_similarity(sliced, movie_id, top_n))

Recommendations for Der Fanger Im Roggen: 

Demian. Die Geschichte von Emil Sinclairs Jugend.
Schattenkinder.
Girlfriend in a Coma
