# Goodreads books recommender system

In [48]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gc
from ast import literal_eval
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import math
from scipy.stats import pearsonr


In [49]:
books_df = pd.read_csv('books.csv',converters={"genres": literal_eval})
ratings_df = pd.read_csv('ratings.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [50]:
# shuffling the test set
np.random.seed(0)
test = test.sample(frac = 1)

## Books

In [None]:
genres = set(books_df["genres"].explode())
genres

In [None]:
books_df.info()

In [None]:
# most rated books
books_df.sort_values('ratings_count', ascending = False)[["title","authors","average_rating", "ratings_count"]].iloc[0:20]

In [None]:
# authors of top 100 rated books
top_100 = books_df.sort_values(['average_rating'], ascending = False).iloc[0:100,:]

# transform authors to list and get top 15 authors that appear in top 100 books
authors_top_100 = pd.Series([x for item in top_100['authors'] for x in item]).value_counts().to_frame('counts').reset_index()
authors_top_100 = authors_top_100.iloc[0:15,:]

In [None]:
sns.set_style('darkgrid')
sns.set_palette('cividis')

In [None]:
ax = sns.histplot(data=books_df, x="average_rating", bins=20)
ax.set_title("Distribution of Book Ratings")
ax.set_xlabel("Average Rating")
ax.set_ylabel("Number of books")
plt.show()

In [None]:
temp_books = pd.DataFrame()
temp_books["ratings_count_log"]=np.log(books_df.ratings_count)
ax = sns.histplot(data = temp_books, x = "ratings_count_log")
ax.set_title("Distribution of Book Ratings Count")
ax.set_xlabel("Log of number of ratings")
ax.set_ylabel("NUmber of books")
sns.despine()
plt.show()

In [None]:
ax = sns.scatterplot(data = books_df, x = "ratings_count", y = "average_rating")
ax.set(xlim=(0, 1000000))
ax.set_title("Scatterplot of average book rating vs number of ratings")
ax.set_xlabel("Number of ratings")
ax.set_ylabel("Rating")
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f'{(x/1000000)}M'))
sns.regplot(data=books_df, x="ratings_count", y="average_rating", scatter=False, color='r')

In [None]:
columns_to_plot = ['average_rating', 'ratings_count']
fig, axes = plt.subplots(ncols=len(columns_to_plot))
for column, axis in zip(columns_to_plot, axes):
    if column == 'ratings_count':
        sns.boxplot(data=books_df[column], ax=axis).set(ylim=(0, 100000))
        axis.set_title('Number of ratings')
    else:
        sns.boxplot(data=books_df[column], ax=axis)
        axis.set_title('Average rating')
plt.tight_layout()
plt.show()

## Ratings

In [None]:
ratings_df.head()

In [None]:
ratings_df.info()

In [None]:
len(ratings_df["book_id"].unique())

In [None]:
ratings_df[["rating"]].describe()

In [None]:
ratings_df.groupby('user_id')['book_id'].count().head()

In [None]:
ax = sns.boxplot(y=ratings_df.groupby('user_id')['book_id'].count(), orient="v")
ax.set_ylim(0, 100)
ax.set_ylabel("Number of ratings")
ax.set_title("Number of ratings by user")

## Naive Baselines

### Global average

In [51]:
mean = train['rating'].mean()

sum = 0
n = 0
for index, row in test.iterrows():
    id = row['book_id']
    sum += (row['rating'] - mean)**2
    n += 1
math.sqrt(sum/n)

1.2200771565806698

### Book average

In [52]:
mean = train.groupby('book_id').mean()
book_average_dict = mean['rating'].to_dict()

sum = 0
n = 0
for index, row in test.iterrows():
    id = row['book_id']
    sum += (row['rating'] - book_average_dict[id])**2
    n += 1
math.sqrt(sum/n)

1.1837386913818102

### User average

In [53]:
mean = train.groupby('user_id').mean()
user_average_dict = mean['rating'].to_dict()

sum = 0
n = 0
for index, row in test.iterrows():
    id = row['user_id']
    sum += (row['rating'] - user_average_dict[id])**2
    n += 1
math.sqrt(sum/n)

1.129027829510486

## Collaborative filtering

We can use the p-value to determine better the similarity between the users.

[4,4], [4,4] has larger similarity than [4,4,4,3,3,4,5], [4,5,4,3,3,4,5], but the second is more preferable.

It is slow as it can be, and it does not work well (:

### Precomputing the data structures

In [6]:
# dictionary from books to list of users which rated the given book
book_user_dict = {}
grouped_data = train.groupby('book_id')
for book_id, group in grouped_data:
    user_generator = (x for x in group['user_id'])
    book_user_dict[book_id] = list(user_generator)
    
# dictionary from users to dictionary of books to rating given by the user
user_book_rating_dict = {}
grouped_data = train.groupby('user_id')
for user_id, group in grouped_data:
    book_rating_dict = {book_id: rating for book_id, rating in zip(group['book_id'], group['rating'])}
    user_book_rating_dict[user_id] = book_rating_dict

### The Algorithm

In [103]:
number_of_books_required = 10
k_neighbours = 10
corr_threshold = 0.5
p_val = True
r2 = False

def user_similarity(r1, r2):
    vals1, vals2 = [], []
    for key in r1:
        if key in r2:
            vals1.append(r1[key])
            vals2.append(r2[key])
    if len(vals1) < number_of_books_required:
        return None
    corr, p = pearsonr(vals1, vals2)
    if (p_val and p > 0.05) or np.isnan(corr) or corr < corr_threshold:
        return None
    return corr

def find_the_similars(user_id, book_id):
    similarities = []
    for user_id2 in book_user_dict[book_id]:
        similarity = user_similarity(user_book_rating_dict[user_id], user_book_rating_dict[user_id2])
        if similarity == None:
            continue
        similarities.append((similarity, user_book_rating_dict[user_id2][book_id], user_id2))
    return sorted(similarities, key=lambda x: x[0], reverse=True)

def predict(user_id, book_id):
    similarities = find_the_similars(user_id, book_id)
    if len(similarities) < k_neighbours:
        return book_average_dict[book_id], False

    summ = 0
    weights = 0
    for i in range(k_neighbours):
        sim = similarities[i][0]
        if r2:
            sim = sim * sim
        summ += (similarities[i][1]-book_average_dict[book_id]) * sim
        weights += sim
    
    return book_average_dict[book_id]+(summ / weights), True

### Debugging

In [105]:
# harry potter book
# TODO graph the values of similarities
find_the_similars('c3d033014bbcc966551905f1565137ad', 3)

[(1.0, 5, 'c3d033014bbcc966551905f1565137ad'),
 (1.0, 5, '13e399819abaa7c1e910403462153504'),
 (0.9999999999999998, 5, '52d2d23cd693ce2d304ecb65d6c2b5bf'),
 (0.9726921182982429, 4, '287a22a7ce9bddca901c63f698fbce90'),
 (0.9720615110512387, 5, '8b5996465689dad1aba3a270629abafd'),
 (0.9551094400921768, 5, 'd2e61dff9a8fbfd5f282aa8739e45222'),
 (0.9529216659791809, 5, '60e2ee724204574e04fc32517d630d65'),
 (0.9510441892119875, 5, 'fe6932eec9ecd789ef370cdca9a25e5f'),
 (0.9505863757867168, 5, 'cca791aaec9d57a183d961e0aee72208'),
 (0.9370425713316359, 5, 'fc18cfc845e161e9a7e2794014111875'),
 (0.931551641906374, 5, 'fed4694fbfe70cdbb4bb2eb207e39047'),
 (0.9211323729436764, 5, 'ee8fd39bfa6572a82bedd0b41ba5a820'),
 (0.8728715609439694, 5, '8e22737448aab764821f2520a8d5f6b6'),
 (0.8728715609439694, 5, '46dfaeb0a0fcb8cefb0ba6198c69d06f'),
 (0.8640987597877146, 5, '066a5ff58f200a53ad99bcd4e0477ce7'),
 (0.8595324232523596, 5, '405b974ff931822bbae0ec183c6c7ede'),
 (0.8563488385776754, 5, '60ec9a3f8c879

### Testing

In [100]:
%%time
# n is a counter so I can stop it early
n = 0 
s = 0
successful = 0
for _, row in test.iterrows():
    predicted, ok = predict(row['user_id'], row['book_id'])
    if ok:
       successful += 1 
    s += (row['rating']-predicted)  ** 2
    n += 1
    if n % 1000 == 0:
        print(math.sqrt(s/n), n, successful)
    if n == 10000:
        break
print(math.sqrt(s/n), n, successful)

1.180574642994932 1000 255
1.1692927395164918 2000 521
1.1667888187821824 3000 771
1.1933272072658454 4000 1023
1.1941568657014219 5000 1266
1.1916823502226026 6000 1522
1.199829544996664 7000 1771
1.196730893255919 8000 2035
1.193132893181081 9000 2293
1.1908221090836189 10000 2552
1.1908221090836189 10000 2552
CPU times: user 1min 35s, sys: 23.9 ms, total: 1min 35s
Wall time: 1min 35s
