# Recommender System: Collaborative Filtering with Goodreads Dataset From Kaggle

Collaborative filtering is a recommender-system method that aims at learning predictive models of user preferences, interests or behavior from community data, that is, a database of available user preferences.

Source: Thomas Hofmann, "Latent Semantic Models for Collaborative Filtering", ACM Transactions on Information Systems, 2004.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the book metadata and the individual user ratings from their CSV files.
df_books = pd.read_csv("books.csv")
df_ratings = pd.read_csv("ratings.csv")

# Join the ratings onto the book metadata (pandas merges on the columns the two
# frames have in common) and discard the metadata columns that are not used
# further down in the notebook.
df_ratings = pd.merge(df_books, df_ratings).drop(
    columns=[
        'best_book_id',
        'books_count',
        'isbn',
        'isbn13',
        'authors',
        'original_publication_year',
        'original_title',
        'language_code',
        'average_rating',
        'ratings_count',
        'work_ratings_count',
        'work_text_reviews_count',
        'ratings_1',
        'ratings_2',
        'ratings_3',
        'ratings_4',
        'ratings_5',
        'image_url',
        'small_image_url',
    ]
)

# Lift the row-display limit so that long results are printed in full.
pd.set_option('display.max_rows', None)

In [3]:
# Preview the first five rows of the merged book/rating table.
df_ratings.head(n=5)

Unnamed: 0,book_id,goodreads_book_id,work_id,title,user_id,rating
0,1,2767052,2792775,"The Hunger Games (The Hunger Games, #1)",2886,5
1,1,2767052,2792775,"The Hunger Games (The Hunger Games, #1)",6158,5
2,1,2767052,2792775,"The Hunger Games (The Hunger Games, #1)",3991,4
3,1,2767052,2792775,"The Hunger Games (The Hunger Games, #1)",5281,5
4,1,2767052,2792775,"The Hunger Games (The Hunger Games, #1)",5721,5


In [4]:
# Print a summary of the columns of the merged table.
df_ratings.info()

print('========================')

# Count the missing values in each column; as the last expression of the cell
# this Series is what gets rendered as the cell output.
df_ratings.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5976479 entries, 0 to 5976478
Data columns (total 6 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   book_id            int64 
 1   goodreads_book_id  int64 
 2   work_id            int64 
 3   title              object
 4   user_id            int64 
 5   rating             int64 
dtypes: int64(5), object(1)
memory usage: 319.2+ MB


book_id              0
goodreads_book_id    0
work_id              0
title                0
user_id              0
rating               0
dtype: int64

In [5]:
# Reshape the long rating table into a user x title matrix: one row per user,
# one column per book title, NaN wherever a user has not rated a book.
# pivot_table averages the ratings if a (user, title) pair occurs more than once.
user_ratings = df_ratings.pivot_table(
    values='rating',
    index='user_id',
    columns='title',
)

In [6]:
# Keep only the books (columns) that have a rating from at least 15 users.
user_ratings = user_ratings.dropna(axis='columns', thresh=15)

In [7]:
# Replace every remaining missing rating (NaN) with 0.
user_ratings = user_ratings.fillna(value=0)

In [8]:
# Preview the first five users of the filled user x title matrix.
user_ratings.head(n=5)

title,"Angels (Walsh Family, #3)","""حكايات فرغلي المستكاوي ""حكايتى مع كفر السحلاوية",#GIRLBOSS,'Salem's Lot,"'Tis (Frank McCourt, #2)","1,000 Places to See Before You Die",1/4 جرام,"10% Happier: How I Tamed the Voice in My Head, Reduced Stress Without Losing My Edge, and Found Self-Help That Actually Works","100 Bullets, Vol. 1: First Shot, Last Call",100 Love Sonnets,...,محال,مخطوطة بن إسحاق: مدينة الموتى,نادي السيارات,هشت کتاب,هيبتا,واحة الغروب,يوتوبيا,ڤيرتيجو,キスよりも早く1 [Kisu Yorimo Hayaku 1] (Faster than a Kiss #1),美少女戦士セーラームーン新装版 1 [Bishōjo Senshi Sailor Moon Shinsōban 1]
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Item-item similarity matrix: the Pearson correlation between every pair of
# book columns in the user x title matrix.
books_similarity = user_ratings.corr(
    method='pearson',
)

# Preview the first five rows of the similarity matrix.
books_similarity.head(n=5)

In [None]:
def sim_books(title, rating, neutral_rating=2.5):
    """Score every book by its similarity to ``title``, weighted by ``rating``.

    The similarity column of ``title`` is taken from the notebook-level
    ``books_similarity`` matrix and multiplied by ``rating - neutral_rating``.
    A rating above the neutral point therefore pushes similar books up the
    ranking, while a rating below it pushes similar books down.

    Parameters
    ----------
    title : str
        Book title; must match a column label of ``books_similarity`` exactly.
    rating : int or float
        The rating the user gave to ``title``.
    neutral_rating : int or float, default 2.5
        Rating treated as "neither liked nor disliked". The default keeps the
        value that was previously hard-coded in the function body.

    Returns
    -------
    pandas.Series
        Weighted similarity scores indexed by book title, sorted from the
        highest score to the lowest.

    Raises
    ------
    KeyError
        If ``title`` is not a column of ``books_similarity``.
    """
    # Fail with an explanatory message instead of a bare KeyError; a title that
    # differs by a single character is the most likely way to end up here.
    if title not in books_similarity.columns:
        raise KeyError(
            "{!r} is not a column of books_similarity; the title must match "
            "a column label exactly".format(title)
        )

    weight = rating - neutral_rating
    return (books_similarity[title] * weight).sort_values(ascending=False)

In [None]:
# Ratings given by the user we want recommendations for, as (title, rating)
# pairs. Every title must match a column label of books_similarity exactly;
# the closing parenthesis of the title was missing here, which made the lookup
# inside sim_books fail with a KeyError.
new_user = [("The Hunger Games (The Hunger Games, #1)", 5)]

# One row of weighted similarity scores per rated book. The rows are collected
# first and the frame is built once: DataFrame.append was removed in pandas 2.0
# and growing a frame inside a loop copies it on every iteration.
score_rows = [sim_books(title, rating) for title, rating in new_user]
similar_books = pd.DataFrame(score_rows).reset_index(drop=True)

# Total score of every candidate book across all rated books, best match first.
# NOTE(review): the rated books themselves are still part of this ranking.
similar_books.sum().sort_values(ascending=False)