# Goodreads books recommender system

## Preprocessing

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gc
from ast import literal_eval
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

### Books

#### Cleaning

In [None]:
# Load the enriched goodbooks-10k book metadata. The first CSV column is used
# as the index, and the stringified "genres" column is parsed with literal_eval.
books_df = pd.read_csv(
    'https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/books_enriched.csv',
    index_col=[0],
    converters={"genres": literal_eval},
)

In [None]:
# Replace the arbitrary book_id with the Goodreads book id.
books_df["book_id"] = books_df["goodreads_book_id"]

In [None]:
# Remove columns that are not used later on; errors='ignore' skips any column
# that is already absent, so the cell can be re-run safely.
books_df = books_df.drop(
    columns=[
        "description",
        "image_url",
        "small_image_url",
        "index",
        "authors_2",
        "best_book_id",
        "goodreads_book_id",
    ],
    errors='ignore',
)

In [None]:
# Flatten the per-book genre lists and collect the distinct genre values.
genres = {*books_df["genres"].explode()}
genres

In [None]:
# Restrict the catalogue to rows whose language_code is exactly 'eng'.
books_df = books_df.loc[books_df["language_code"].eq('eng')]

In [None]:
# When several rows share a title, keep only the first occurrence.
books_df = books_df.drop_duplicates(subset='title', keep='first')

In [None]:
# Transform the stringified author lists into real Python lists.
def _parse_authors(raw):
    """Convert one raw ``authors`` cell into a list of author names.

    The previous strip/replace/split approach removed every apostrophe, so a
    name such as "O'Connor" became "OConnor", and it left stray double quotes
    whenever the list repr used them. Parsing the cell as a Python literal
    keeps the names intact.

    Args:
        raw: The cell value; expected to be the string repr of a list of
            names. A value that is already a list is returned unchanged so
            the cell can be re-run.

    Returns:
        A list of author-name strings.
    """
    if isinstance(raw, list):
        return raw
    try:
        parsed = literal_eval(raw)
    except (ValueError, SyntaxError):
        # Not a valid Python literal: fall back to the original plain split.
        return raw.strip('[]').replace("'", "").split(", ")
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return [str(parsed)]


books_df['authors'] = books_df['authors'].apply(_parse_authors)

#### EDA

In [None]:
# Preview the first rows of the cleaned books table.
books_df.head()

In [None]:
# Print the summary (columns, dtypes, non-null counts) of the books table.
books_df.info()

In [None]:
# The 20 books with the highest ratings_count, showing a few descriptive columns.
(
    books_df
    .sort_values(by='ratings_count', ascending=False)
    .loc[:, ["title", "authors", "average_rating", "ratings_count"]]
    .head(20)
)

In [None]:
# The 100 books with the highest average_rating.
top_100 = books_df.sort_values(by='average_rating', ascending=False).head(100)

# Flatten the per-book author lists, count how often each author appears among
# those 100 books, and keep the 15 most frequent authors.
authors_top_100 = (
    pd.Series([name for author_list in top_100['authors'] for name in author_list])
    .value_counts()
    .to_frame('counts')
    .reset_index()
    .head(15)
)

In [None]:
# Set the seaborn style and colour palette for the plots that follow.
sns.set_style('darkgrid')
sns.set_palette('cividis')

In [None]:
# Histogram of the per-book average rating, using 20 bins.
ax = sns.histplot(data=books_df, x="average_rating", bins=20)
ax.set(
    title="Distribution of Book Ratings",
    xlabel="Average Rating",
    ylabel="Number of books",
)
plt.show()

In [None]:
# Histogram of the natural log of each book's ratings count.
temp_books = pd.DataFrame()
temp_books["ratings_count_log"] = np.log(books_df.ratings_count)
ax = sns.histplot(data=temp_books, x="ratings_count_log")
ax.set_title("Distribution of Book Ratings Count")
ax.set_xlabel("Log of number of ratings")
# Fixed the axis-label typo ("NUmber" -> "Number").
ax.set_ylabel("Number of books")
sns.despine()
plt.show()

In [None]:
# Scatter of average rating against number of ratings, with the x axis capped
# at one million ratings and tick labels expressed in millions.
ax = sns.scatterplot(data=books_df, x="ratings_count", y="average_rating")
ax.set(
    xlim=(0, 1000000),
    title="Scatterplot of average book rating vs number of ratings",
    xlabel="Number of ratings",
    ylabel="Rating",
)
ax.xaxis.set_major_formatter(
    ticker.FuncFormatter(lambda x, pos: f'{(x/1000000)}M')
)
# Overlay a red regression line (no extra scatter points) on the same axes.
sns.regplot(
    data=books_df,
    x="ratings_count",
    y="average_rating",
    scatter=False,
    color='r',
    ax=ax,
)

In [None]:
# Side-by-side boxplots: one for average_rating, one for ratings_count.
columns_to_plot = ['average_rating', 'ratings_count']
fig, axes = plt.subplots(ncols=len(columns_to_plot))
for column, axis in zip(columns_to_plot, axes):
    sns.boxplot(data=books_df[column], ax=axis)
    if column != 'ratings_count':
        axis.set_title('Average rating')
        continue
    # Cap the y axis of the ratings_count plot at 100000.
    axis.set(ylim=(0, 100000))
    axis.set_title('Number of ratings')
plt.tight_layout()
plt.show()

### Ratings

In [None]:
# Load the user ratings from a CSV file in the working directory.
ratings_df = pd.read_csv('reviews_updated2.csv')

In [None]:
# Remove the index column that was saved into the CSV. errors='ignore' makes
# the cell safe to re-run after the column has already been dropped, which
# matches how the books columns are dropped above.
ratings_df = ratings_df.drop(columns=["Unnamed: 0"], errors='ignore')

In [None]:
# Preview the first rows of the ratings table.
ratings_df.head()

In [None]:
# Print the summary (columns, dtypes, non-null counts) of the ratings table.
ratings_df.info()

In [None]:
# Number of distinct user_id values (a missing value, if any, counts as one).
ratings_df["user_id"].nunique(dropna=False)

~35k users

In [None]:
# Summary statistics of the rating column.
ratings_df[["rating"]].describe()

In [None]:
# Count of non-null book_id entries per user, previewing the first users.
ratings_df.groupby('user_id')['book_id'].count().head()

In [None]:
# Vertical boxplot of how many ratings each user has, with the y axis limited
# to the 0-100 range.
ax = sns.boxplot(
    y=ratings_df.groupby('user_id')['book_id'].count(),
    orient="v",
)
ax.set_ylim(0, 100)
ax.set(ylabel="Number of ratings", title="Number of ratings by user")

### Other tables
It is not yet clear whether these tables will be needed for our purpose:
- **tags** - Shelf name (genre) and count of the books in that shelf
- **book_tags** - Shelves with the count for each book
- **to_read** - Books each user has marked as "to read"

In [None]:
# Load the tags table from the goodbooks-10k repository.
tags_df = pd.read_csv('https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/tags.csv')

In [None]:
# Load the book_tags table from the goodbooks-10k repository.
book_tags_df = pd.read_csv('https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/book_tags.csv')

In [None]:
# Load the to_read table from the goodbooks-10k repository.
to_read_df = pd.read_csv('https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/to_read.csv')

## Splits

There is no temporal data about the reviews, so a random split is the best we can do (as far as we can tell).

In [None]:
# Hold out 20% of the ratings for testing. The fixed random_state makes the
# shuffle, and therefore every downstream result, reproducible across runs.
train, test = train_test_split(ratings_df, test_size=0.2, random_state=42)
# Build the user x book rating matrix from the training ratings only;
# user/book pairs with no training rating become NaN.
pivot_table = train.pivot_table(index='user_id', columns='book_id', values='rating')
matrix = pivot_table.to_numpy()

## Naive Baseline

Fills each missing rating with the median rating of the corresponding book (the column median of the user-book matrix).

In [None]:
def naive(matrix):
    """Fill every missing entry with the median of its column.

    Args:
        matrix: 2-D float numpy array in which NaN marks a missing value.
            It is modified in place.

    Returns:
        The same ``matrix`` object, with each NaN replaced by the NaN-ignoring
        median of its column. A column that contains only NaN stays NaN.
    """
    # One median per column, ignoring missing values.
    medians = np.nanmedian(matrix, axis=0)
    missing = np.isnan(matrix)
    # For each missing cell, look up the median of the column it belongs to.
    matrix[missing] = np.take(medians, np.nonzero(missing)[1])
    return matrix

## Testing

Placeholder evaluation function; it will change depending on what the new data looks like.

In [None]:
def evaluate(matrix):
    """Placeholder evaluation: ignore ``matrix`` and return a constant score.

    Args:
        matrix: The filled rating matrix (currently unused).

    Returns:
        The fixed value 10.
    """
    placeholder_score = 10
    return placeholder_score

In [None]:
# Run the baseline on a copy so the original matrix keeps its NaN gaps.
evaluate(naive(matrix.copy()))
