# Goodreads books recommender system

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gc
from ast import literal_eval
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

In [2]:
books_df = pd.read_csv('books.csv',converters={"genres": literal_eval})
ratings_df = pd.read_csv('ratings.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## Books

In [3]:
genres = set(books_df["genres"].explode())
genres

{'art',
 'biography',
 'books',
 'business',
 'chick-lit',
 'christian',
 'classics',
 'comics',
 'contemporary',
 'cookbooks',
 'crime',
 'fantasy',
 'fiction',
 'gay-and-lesbian',
 'graphic-novels',
 'historical-fiction',
 'history',
 'horror',
 'humor-and-comedy',
 'manga',
 'memoir',
 'music',
 'mystery',
 'nonfiction',
 'paranormal',
 'philosophy',
 'poetry',
 'psychology',
 'religion',
 'romance',
 'science',
 'science-fiction',
 'self-help',
 'spirituality',
 'sports',
 'suspense',
 'thriller',
 'travel',
 'young-adult'}

In [4]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9185 entries, 0 to 9184
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   authors                    9185 non-null   object 
 1   average_rating             9185 non-null   float64
 2   book_id                    9185 non-null   int64  
 3   books_count                9185 non-null   int64  
 4   genres                     9185 non-null   object 
 5   isbn                       8603 non-null   object 
 6   isbn13                     8687 non-null   float64
 7   language_code              9185 non-null   object 
 8   original_publication_year  9172 non-null   float64
 9   original_title             8687 non-null   object 
 10  pages                      9133 non-null   float64
 11  publishDate                9178 non-null   object 
 12  ratings_1                  9185 non-null   int64  
 13  ratings_2                  9185 non-null   int64

In [None]:
# most rated books
books_df.sort_values('ratings_count', ascending = False)[["title","authors","average_rating", "ratings_count"]].iloc[0:20]

In [None]:
# authors of top 100 rated books
top_100 = books_df.sort_values(['average_rating'], ascending = False).iloc[0:100,:]

# transform authors to list and get top 15 authors that appear in top 100 books
authors_top_100 = pd.Series([x for item in top_100['authors'] for x in item]).value_counts().to_frame('counts').reset_index()
authors_top_100 = authors_top_100.iloc[0:15,:]

In [None]:
sns.set_style('darkgrid')
sns.set_palette('cividis')

In [None]:
ax = sns.histplot(data=books_df, x="average_rating", bins=20)
ax.set_title("Distribution of Book Ratings")
ax.set_xlabel("Average Rating")
ax.set_ylabel("Number of books")
plt.show()

In [None]:
temp_books = pd.DataFrame()
temp_books["ratings_count_log"]=np.log(books_df.ratings_count)
ax = sns.histplot(data = temp_books, x = "ratings_count_log")
ax.set_title("Distribution of Book Ratings Count")
ax.set_xlabel("Log of number of ratings")
ax.set_ylabel("NUmber of books")
sns.despine()
plt.show()

In [None]:
ax = sns.scatterplot(data = books_df, x = "ratings_count", y = "average_rating")
ax.set(xlim=(0, 1000000))
ax.set_title("Scatterplot of average book rating vs number of ratings")
ax.set_xlabel("Number of ratings")
ax.set_ylabel("Rating")
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f'{(x/1000000)}M'))
sns.regplot(data=books_df, x="ratings_count", y="average_rating", scatter=False, color='r')

In [None]:
columns_to_plot = ['average_rating', 'ratings_count']
fig, axes = plt.subplots(ncols=len(columns_to_plot))
for column, axis in zip(columns_to_plot, axes):
    if column == 'ratings_count':
        sns.boxplot(data=books_df[column], ax=axis).set(ylim=(0, 100000))
        axis.set_title('Number of ratings')
    else:
        sns.boxplot(data=books_df[column], ax=axis)
        axis.set_title('Average rating')
plt.tight_layout()
plt.show()

## Ratings

In [None]:
ratings_df.head()

In [None]:
ratings_df.info()

In [None]:
len(ratings_df["book_id"].unique())

In [None]:
ratings_df[["rating"]].describe()

In [None]:
ratings_df.groupby('user_id')['book_id'].count().head()

In [None]:
ax = sns.boxplot(y=ratings_df.groupby('user_id')['book_id'].count(), orient="v")
ax.set_ylim(0, 100)
ax.set_ylabel("Number of ratings")
ax.set_title("Number of ratings by user")

## Naive Baseline

In [None]:
import math

mean = train.groupby('book_id').mean()
id_score_dict = mean['rating'].to_dict()

sum = 0
n = 0
for index, row in test.iterrows():
    id = row['book_id']
    if id in id_score_dict:
        sum += (row['rating'] - id_score_dict[id])**2
        n += 1

math.sqrt(sum/n)

## Collaborative filtering

In [None]:
len(books_df)

9185

In [6]:
len(train["user_id"].unique())

54440

In [8]:
book_review_df = books_df.merge(test, on='book_id')
book_users_df = book_review_df.groupby('book_id')['user_id'].apply(list).reset_index(name='user_ids')

In [9]:
book_users_df

Unnamed: 0,book_id,user_ids
0,1,"[e5645006eb9dc54b37c742ec842e3ec9, ad238a3aaea..."
1,2,"[5a152597df16e2b9d387a767480af601, d8755cbb00e..."
2,3,"[e2f930d586b780501ef795e8593be700, 74d655d91b6..."
3,5,"[eef49e1e3c6233e9d86fd10ff4700b83, 9453d59ff8e..."
4,6,"[5a152597df16e2b9d387a767480af601, ad238a3aaea..."
...,...,...
9174,31538635,"[12707977a55df3ec3deaf86e874fe3ad, b130d74091d..."
9175,31538647,"[b130d74091d1340b1fa44c6f48c4b0d6, 79a62a7cf08..."
9176,31845516,"[d67d2ed6c7a0260e50dca7052af5f0ff, 3d1b305a2b9..."
9177,32075671,"[eb8c4639c4271c719b9e8bb9241fb2d3, 2b8eb89cb40..."


In [12]:
test.loc([0,1])

TypeError: unhashable type: 'list'