# Recommendation System for Books
Dataset accessed from: https://www.kaggle.com/datasets/zygmunt/goodbooks-10k

In [None]:
# Importing libraries
import pandas as pd
from math import sqrt
import numpy as np

In [None]:
# Loading the two goodbooks-10k tables with pandas.
# The paths are relative, so both CSV files must be in the current working directory.
books_df = pd.read_csv('books.csv')
ratings_df = pd.read_csv('ratings.csv')

In [None]:
# Inspecting the books table: column names, non-null counts and dtypes
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         10000 non-null  int64  
 1   book_id                    10000 non-null  int64  
 2   best_book_id               10000 non-null  int64  
 3   work_id                    10000 non-null  int64  
 4   books_count                10000 non-null  int64  
 5   isbn                       9300 non-null   object 
 6   isbn13                     9415 non-null   float64
 7   authors                    10000 non-null  object 
 8   original_publication_year  9979 non-null   float64
 9   original_title             9415 non-null   object 
 10  title                      10000 non-null  object 
 11  language_code              8916 non-null   object 
 12  average_rating             10000 non-null  float64
 13  ratings_count              10000 non-null  int6

In [None]:
# Keeping only the two columns the recommender needs: the book id and its title
neededColumns = ['id', 'title']
books2_df = books_df.loc[:, neededColumns]
books2_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10000 non-null  int64 
 1   title   10000 non-null  object
dtypes: int64(1), object(1)
memory usage: 156.4+ KB


In [None]:
# Inspecting the ratings table: column names, non-null counts and dtypes
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 981756 entries, 0 to 981755
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   book_id  981756 non-null  int64
 1   user_id  981756 non-null  int64
 2   rating   981756 non-null  int64
dtypes: int64(3)
memory usage: 22.5 MB


In [None]:
# Ratings given by the user we are recommending for, as (title, rating) pairs
ratedTitles = [
    ('The Little Prince', 3),
    ('Emma', 5),
    ('The Catcher in the Rye', 4),
    ('The Fault in Our Stars', 2),
    ('Pride and Prejudice', 5),
]

# One record per book, then a dataframe with the columns `title` and `rating`
userInput = [{'title': bookTitle, 'rating': score} for bookTitle, score in ratedTitles]
inputBooks = pd.DataFrame(userInput)
inputBooks

Unnamed: 0,title,rating
0,The Little Prince,3
1,Emma,5
2,The Catcher in the Rye,4
3,The Fault in Our Stars,2
4,Pride and Prejudice,5


In [None]:
# Looking up the rows of books2_df whose title is one of the titles the user rated
wantedTitles = inputBooks['title'].tolist()
titleMatches = books2_df['title'].isin(wantedTitles)
inputId = books2_df[titleMatches]

# Merging on the shared column(s) so that every user rating carries its book id.
# Titles that are not found in books2_df drop out of inputBooks here.
inputBooks = pd.merge(inputId, inputBooks)
inputBooks

Unnamed: 0,id,title,rating
0,6,The Fault in Our Stars,2
1,8,The Catcher in the Rye,4
2,10,Pride and Prejudice,5
3,80,The Little Prince,3
4,171,Emma,5


In [None]:
# Keeping only the ratings (from any user) of the books that appear in inputBooks
ratedIds = inputBooks['id'].tolist()
userSubset = ratings_df.loc[ratings_df['book_id'].isin(ratedIds)]

# Number of rating rows per input book
userSubset.groupby('book_id').count()

Unnamed: 0_level_0,user_id,rating
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6,100,100
8,100,100
10,100,100
80,100,100
171,100,100


In [None]:
# Create sub dataframes using groupby
# Groups the rows of userSubset by user_id, one sub dataframe per user.
# The column name is passed directly (not wrapped in a one-element list) so that
# iterating over the groups yields plain user ids as keys; with ['user_id'],
# pandas >= 2.0 yields 1-tuples such as (9246,) instead, and those would end up
# as the user_id values used further down.
userSubsetGroup = userSubset.groupby('user_id')

# Sort key for the (user_id, ratings) pairs produced by the groupby
def take_5_elem(x):
    """Return the length of the second item of a ``(key, group)`` pair.

    Used as the ``key`` when sorting the groups, so that users with more
    rating rows in their group sort ahead of users with fewer.
    """
    group_rows = x[1]
    return len(group_rows)

# Ordering the (user_id, ratings) pairs so that the users with the most
# rating rows for the input books come first
userSubsetGroup = list(userSubsetGroup)
userSubsetGroup.sort(key=take_5_elem, reverse=True)

# Keeping the first 100 users only and previewing the first five
userSubsetGroup = userSubsetGroup[:100]
userSubsetGroup[:5]

[(9246,       book_id  user_id  rating
  511         6     9246       2
  712         8     9246       4
  910        10     9246       2
  7920       80     9246       3), (11868,        book_id  user_id  rating
  718          8    11868       2
  920         10    11868       5
  7932        80    11868       2
  17021      171    11868       5), (12874,        book_id  user_id  rating
  721          8    12874       4
  925         10    12874       4
  7937        80    12874       4
  17025      171    12874       3), (17228,        book_id  user_id  rating
  731          8    17228       4
  936         10    17228       5
  7946        80    17228       5
  17039      171    17228       5), (21228,       book_id  user_id  rating
  537         6    21228       4
  743         8    21228       4
  949        10    21228       4
  7953       80    21228       4)]

In [None]:
# Pearson Correlation Dictionary
# Key: user id
# Value: coefficient
pearsonCorrelationDict = {}

# Sorting the input ratings by id once; the result is the same for every user,
# so it is done before the loop instead of on every iteration
inputBooks = inputBooks.sort_values(by='id')

# For loop that goes through every user in the subset
for name, group in userSubsetGroup:

    # Grouping by a one-element list makes pandas >= 2.0 yield 1-tuples as keys;
    # unwrapping them keeps plain user ids as dictionary keys
    userId = name[0] if isinstance(name, tuple) else name

    # Pairing each of this user's ratings with the input rating of the same book.
    # Merging on the book id keeps the two rating lists aligned row by row, even
    # if the user has more than one rating row for the same book (sorting both
    # sides and zipping them would silently shift the pairs in that case)
    paired = group.merge(inputBooks[['id', 'rating']], left_on='book_id', right_on='id',
                         suffixes=('_user', '_input'))

    # Getting the N value for the formula
    nRatings = len(paired)

    # Without any common book there is nothing to correlate
    if nRatings == 0:
        pearsonCorrelationDict[userId] = 0
        continue

    # Storing the review scores in another variable
    tempRatingList = paired['rating_input'].tolist()
    tempGroupList = paired['rating_user'].tolist()

    # Calculating the pearson correlation
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    # Sxx and Syy cannot be negative mathematically, so "> 0" only differs from
    # "!= 0" when rounding produces a tiny negative value, which would otherwise
    # reach sqrt() and give a meaningless coefficient or a math domain error
    if Sxx > 0 and Syy > 0:
        pearsonCorrelationDict[userId] = Sxy/sqrt(Sxx*Syy)

    # If either set of ratings has no variance then there is no correlation
    else:
        pearsonCorrelationDict[userId] = 0

In [None]:
# Turning the {user id: coefficient} dictionary into a dataframe with one row
# per user: the coefficient goes to `similarityIndex`, the dictionary key to
# `user_id`, and the rows are renumbered from 0
pearsonDF = (
    pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
    .rename(columns={0: 'similarityIndex'})
    .rename_axis('user_id')
    .reset_index()
    [['similarityIndex', 'user_id']]
)
pearsonDF.head()

Unnamed: 0,similarityIndex,user_id
0,0.13484,9246
1,0.904534,11868
2,-0.522233,12874
3,0.174078,17228
4,0.0,21228


In [None]:
# Ranking the users from most to least similar and keeping the first 50
rankedUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False)
topUsers = rankedUsers.head(50)
topUsers.head()

Unnamed: 0,similarityIndex,user_id
44,1.0,28824
99,1.0,22534
92,1.0,19729
78,1.0,10335
80,1.0,11691


In [None]:
# Attaching every rating made by the top users (for any book) to their similarity index
topUsersRating = pd.merge(topUsers, ratings_df, on='user_id', how='inner')
topUsersRating.head(100)

Unnamed: 0,similarityIndex,user_id,book_id,rating
0,1.0,28824,10,4
1,1.0,28824,11,1
2,1.0,28824,14,3
3,1.0,28824,16,4
4,1.0,28824,17,3
...,...,...,...,...
95,1.0,28824,514,3
96,1.0,28824,515,4
97,1.0,28824,529,3
98,1.0,28824,530,4


In [None]:
# Weighting each rating by how similar its author is to the input user
topUsersRating['weightedRating'] = topUsersRating['rating'].mul(topUsersRating['similarityIndex'])
topUsersRating.head()

Unnamed: 0,similarityIndex,user_id,book_id,rating,weightedRating
0,1.0,28824,10,4,4.0
1,1.0,28824,11,1,1.0
2,1.0,28824,14,3,3.0
3,1.0,28824,16,4,4.0
4,1.0,28824,17,3,3.0


In [None]:
# Per book: the sum of the similarity indexes and the sum of the weighted ratings
# of the top users who rated it. Only the two needed columns are aggregated.
tempTopUsersRating = (
    topUsersRating
    .groupby('book_id')[['similarityIndex', 'weightedRating']]
    .sum()
    .rename(columns={'similarityIndex': 'sum_similarityIndex',
                     'weightedRating': 'sum_weightedRating'})
)
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,9.215702,40.686345
2,17.217446,75.367202
3,11.491932,26.911173
4,19.39239,80.833133
5,16.932493,66.296259


In [None]:
# Weighted average recommendation score per book:
# sum of the weighted ratings divided by the sum of the similarity indexes
weightedAverage = tempTopUsersRating['sum_weightedRating'].div(tempTopUsersRating['sum_similarityIndex'])

# Dataframe for the recommendations, holding the score and the book id
# (the book id is also the index, copied into a regular column)
recommendation_df = weightedAverage.to_frame(name='weighted average recommendation score')
recommendation_df['book_id'] = recommendation_df.index
recommendation_df

Unnamed: 0_level_0,weighted average recommendation score,book_id
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.414894,1
2,4.377374,2
3,2.341745,3
4,4.168292,4
5,3.915328,5
...,...,...
9963,4.000000,9963
9966,1.000000,9966
9977,,9977
9988,5.000000,9988


In [None]:
# Sorting the recommendations by the weighted average recommendation score
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)

# Dropping the books without a usable score: NaN (0 divided by 0) and +/-inf
# (a non-zero sum of weighted ratings divided by a similarity sum of 0).
# dropna() on its own keeps inf, which would sort to the very top of the list
recommendation_df = recommendation_df.replace([np.inf, -np.inf], np.nan).dropna()

# Removing the books the user already rated before cutting the list,
# so that they cannot use up places in the top 10
recommendation_df = recommendation_df[~recommendation_df['book_id'].isin(inputBooks['id'])]

# Taking the top 10 recommended books
recommendation_df = recommendation_df.head(10)
recommendation_df

Unnamed: 0_level_0,weighted average recommendation score,book_id
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1211,5.0,1211
2689,5.0,2689
3625,5.0,3625
3628,5.0,3628
490,5.0,490
3650,5.0,3650
3660,5.0,3660
3228,5.0,3228
3151,5.0,3151
3135,5.0,3135


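Some weighted average recommendation scores are NaN because some books have 0.0 for both `sum_similarityIndex` and `sum_weightedRating` (typically books rated only by users whose similarity index is 0). Dividing 0 by 0 is undefined, so pandas returns NaN for those books; they are removed with `dropna()` before the top 10 is taken.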

In [None]:
# Looking up the titles of the recommended books in books2_df (book id and title)
isRecommended = books2_df['id'].isin(recommendation_df['book_id'])

# Books the input user has already rated must not be recommended again
alreadyRated = books2_df['id'].isin(userSubset['book_id'])

# Recommended books, listed in the row order of books2_df
recommended_book = books2_df.loc[isRecommended & ~alreadyRated]
recommended_book

Unnamed: 0,id,title
489,490,Maus I: A Survivor's Tale: My Father Bleeds Hi...
1210,1211,The Little Engine That Could
2688,2689,The Napping House
3134,3135,"One Summer: America, 1927"
3150,3151,Curious George Goes to the Hospital
3227,3228,Salt to the Sea
3624,3625,"The End of Faith: Religion, Terror, and the Fu..."
3627,3628,The Complete Calvin and Hobbes
3649,3650,A Supposedly Fun Thing I'll Never Do Again: E...
3659,3660,The Wake (The Sandman #10)
