# Goodreads books recommender system

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gc
from ast import literal_eval
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import math
from scipy.stats import pearsonr


In [2]:
books_df = pd.read_csv('books.csv',converters={"genres": literal_eval})
ratings_df = pd.read_csv('ratings.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

## Books

In [None]:
genres = set(books_df["genres"].explode())
genres

In [None]:
books_df.info()

In [None]:
# most rated books
books_df.sort_values('ratings_count', ascending = False)[["title","authors","average_rating", "ratings_count"]].iloc[0:20]

In [None]:
# authors of top 100 rated books
top_100 = books_df.sort_values(['average_rating'], ascending = False).iloc[0:100,:]

# transform authors to list and get top 15 authors that appear in top 100 books
authors_top_100 = pd.Series([x for item in top_100['authors'] for x in item]).value_counts().to_frame('counts').reset_index()
authors_top_100 = authors_top_100.iloc[0:15,:]

In [None]:
sns.set_style('darkgrid')
sns.set_palette('cividis')

In [None]:
ax = sns.histplot(data=books_df, x="average_rating", bins=20)
ax.set_title("Distribution of Book Ratings")
ax.set_xlabel("Average Rating")
ax.set_ylabel("Number of books")
plt.show()

In [None]:
temp_books = pd.DataFrame()
temp_books["ratings_count_log"]=np.log(books_df.ratings_count)
ax = sns.histplot(data = temp_books, x = "ratings_count_log")
ax.set_title("Distribution of Book Ratings Count")
ax.set_xlabel("Log of number of ratings")
ax.set_ylabel("NUmber of books")
sns.despine()
plt.show()

In [None]:
ax = sns.scatterplot(data = books_df, x = "ratings_count", y = "average_rating")
ax.set(xlim=(0, 1000000))
ax.set_title("Scatterplot of average book rating vs number of ratings")
ax.set_xlabel("Number of ratings")
ax.set_ylabel("Rating")
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f'{(x/1000000)}M'))
sns.regplot(data=books_df, x="ratings_count", y="average_rating", scatter=False, color='r')

In [None]:
columns_to_plot = ['average_rating', 'ratings_count']
fig, axes = plt.subplots(ncols=len(columns_to_plot))
for column, axis in zip(columns_to_plot, axes):
    if column == 'ratings_count':
        sns.boxplot(data=books_df[column], ax=axis).set(ylim=(0, 100000))
        axis.set_title('Number of ratings')
    else:
        sns.boxplot(data=books_df[column], ax=axis)
        axis.set_title('Average rating')
plt.tight_layout()
plt.show()

## Ratings

In [None]:
ratings_df.head()

In [None]:
ratings_df.info()

In [None]:
len(ratings_df["book_id"].unique())

In [None]:
ratings_df[["rating"]].describe()

In [None]:
ratings_df.groupby('user_id')['book_id'].count().head()

In [None]:
ax = sns.boxplot(y=ratings_df.groupby('user_id')['book_id'].count(), orient="v")
ax.set_ylim(0, 100)
ax.set_ylabel("Number of ratings")
ax.set_title("Number of ratings by user")

## Naive Baseline

In [41]:
import math

mean = train.groupby('book_id').mean()
id_score_dict = mean['rating'].to_dict()

sum = 0
n = 0
for index, row in test.iterrows():
    id = row['book_id']
    sum += (row['rating'] - id_score_dict[id])**2
    n += 1

math.sqrt(sum/n)

1.1846826946244475

RMSE=1.184

## Collaborative filtering

We can use the p-value to determine better the similarity between the users.

[4,4], [4,4] has larger similarity than [4,4,4,3,3,4,5], [4,5,4,3,3,4,5], but the second is more preferable.

It is slow as it can be, and i does not work well (:

In [4]:
# dictionary from books to list of users which rated the given book
book_user_dict = {}
grouped_data = train.groupby('book_id')
for book_id, group in grouped_data:
    user_generator = (x for x in group['user_id'])
    book_user_dict[book_id] = list(user_generator)
    
# dictionary from users to dictionary of books to rating given by the user
user_book_rating_dict = {}
grouped_data = train.groupby('user_id')
for user_id, group in grouped_data:
    book_rating_dict = {book_id: rating for book_id, rating in zip(group['book_id'], group['rating'])}
    user_book_rating_dict[user_id] = book_rating_dict

In [48]:
number_of_books_required = 10
k_neighbours = 10
corr_threshold = 0.5

def user_similarity(r1, r2):
    vals1, vals2 = [], []
    for key in r1:
        if key in r2:
            vals1.append(r1[key])
            vals2.append(r2[key])
    if len(vals1) < number_of_books_required:
        return None
    corr, _= pearsonr(vals1, vals2)
    if corr < corr_threshold:
        return None
    return corr

def predict(user_id, book_id):
    similarities = []
    for user_id2 in book_user_dict[book_id]:
        similarity = user_similarity(user_book_rating_dict[user_id], user_book_rating_dict[user_id2])
        if similarity == None or np.isnan(similarity):
            continue
        similarities.append((similarity, user_book_rating_dict[user_id2][book_id]))
    if len(similarities) < k_neighbours:
        return id_score_dict[book_id]
    similarities = sorted(similarities, key=lambda x: x[0])
    summ = 0
    weights = 0
    for i in range(k_neighbours):
        summ += similarities[i][1] * similarities[i][0]
        weights += similarities[i][0]
    
    return summ / weights

In [49]:
%%time
n = 0 # n is a counter so I can stop it early
s = 0
for _, row in test.iterrows():
    predicted = predict(row['user_id'], row['book_id'])
    s += (row['rating']-predicted)  ** 2
    n += 1
    if n % 5000 == 0:
        print(math.sqrt(s/n))
math.sqrt(s/n)

1.2121093843719974
1.1793559530071667
1.2049727133184478
1.2144514171267955
1.200410092768059
1.186977704247621
1.1965369733111204
1.1920271797160136
1.1880756753883532
1.1941289727405884
1.1903178849944884
1.190064200615332
1.187525684564741
1.1861782150412765
1.191628740518786
1.1943002568063679
1.2030055860742754
1.2000890654708891
1.2005934750746672
1.1997399595504885
1.1936268525472529
1.1909315749301925
1.1883960168062608
1.1894093753993666
1.1913638214704512
1.1920080921804708
1.1915270367743402
1.1898785427719272
1.1909575292204204
1.1958179115571281
1.1974357768813075
1.1956552087104522
1.1947537837962419
1.197737989386511
1.1989914102013095
1.1980059528499067
1.1988703090027244
1.1988771821075783
1.1983726003827715
1.2023245376696772
1.2038564906383458
1.2042909491479457
1.2048591770306194
1.2063378524185893
1.2056011169104257
1.2046798690867753
1.204997721511733
1.2052849705751323
1.2048735890604323
1.2074869301429698
1.2071470371233055
1.2070313133744432
1.2110211851229327


1.188589830308722

1.1885

In [None]:
len(ratings_df["user_id"].unique())

In [43]:
len(test)

635993