# Goodreads books recommender system

This notebook creates a clean data from the existing data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gc
from ast import literal_eval
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

## Books

In [2]:
books_df = pd.read_csv('https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/books_enriched.csv', index_col=[0], converters={"genres": literal_eval})

In [3]:
# use goodreads book id instead of the arbitrary id
books_df["book_id"] = books_df.goodreads_book_id

In [4]:
# Dropping unneeded columns
books_df = books_df.drop(columns=["description", "image_url", "small_image_url", "index", "authors_2", "best_book_id", "goodreads_book_id"], errors = 'ignore')

In [5]:
# Keep only english books.
books_df = books_df[books_df.language_code == 'eng']

In [6]:
# drop books with duplicated title
books_df = books_df.drop_duplicates(subset = ['title'])

In [7]:
# transform list columns to list
books_df['authors'] = books_df['authors'].apply(lambda x: x.strip('[]').replace("'","").split(", "))

## Ratings

In [8]:
ratings_df = pd.read_csv('reviews_updated2.csv')

In [9]:
# remove index column
ratings_df = ratings_df.drop(columns=["Unnamed: 0"])

In [10]:
# convert date to timestamp, and keep only the date
ratings_df["date_added"] = pd.to_datetime(ratings_df.date_added, format='%Y/%m/%d', utc = True)
ratings_df["date_added"] = ratings_df.date_added.dt.date

In [11]:
# filter out reviews preceding goodreads site
ratings_df = ratings_df[ratings_df.date_added >= pd.Timestamp(year=2000, month=1, day=1)]

  result = libops.scalar_compare(x.ravel(), y, op)


In [12]:
i = 0
while True:
    i += 1
    l = len(ratings_df)
    
    # remove users with low number of reviews (<20)
    user_review_counts = ratings_df['user_id'].value_counts()
    ratings_df = ratings_df[ratings_df['user_id'].isin(user_review_counts[user_review_counts >= 20].index)]
    
    # filter out books with low number of reviews (<20)
    book_review_counts = ratings_df.groupby('book_id').count()
    popular_books = book_review_counts[book_review_counts['user_id'] >= 20].index.tolist()
    ratings_df = ratings_df[ratings_df['book_id'].isin(popular_books)]
    
    # remove the removed books from ratings 
    ratings_df = ratings_df[ratings_df.book_id.isin(books_df.book_id)]
    if len(ratings_df) == l:
        break
    
print('number of iters:', i)

number of iters: 4


### Other tables
Don't know if these will be needed for our purpose:
- **tags** - Shelf name (genre) and count of the books in that shelf
- **book_tags** - Shelves with the count for each book
- **to_read** - books to read for user

In [13]:
tags_df = pd.read_csv('https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/tags.csv')

In [14]:
book_tags_df = pd.read_csv('https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/book_tags.csv')

In [15]:
to_read_df = pd.read_csv('https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/to_read.csv')

## Splits

In [16]:
%%time

tmp = ratings_df.sort_values(by='date_added')

rows_by_user_id = dict()
for index, row in tmp.iterrows():
    if row.user_id not in rows_by_user_id:
        rows_by_user_id[row.user_id] = []
    rows_by_user_id[row.user_id].append(row)

train, test = [], []
for users_ratings in rows_by_user_id.values():
    indx = int(len(users_ratings) * 0.8)
    train.extend(users_ratings[:indx])
    test.extend(users_ratings[indx:])

train, test = pd.DataFrame(train), pd.DataFrame(test)

CPU times: user 3min 5s, sys: 2.63 s, total: 3min 8s
Wall time: 3min 8s


In [17]:
train

Unnamed: 0,user_id,book_id,rating,date_added
3496706,12707977a55df3ec3deaf86e874fe3ad,438492,5,2001-02-02
3496705,12707977a55df3ec3deaf86e874fe3ad,3398625,4,2011-09-20
3496701,12707977a55df3ec3deaf86e874fe3ad,7445,5,2012-01-01
3496702,12707977a55df3ec3deaf86e874fe3ad,5556595,4,2012-01-01
3496703,12707977a55df3ec3deaf86e874fe3ad,7937843,4,2012-01-01
...,...,...,...,...
4460857,4c4574a066856ce7fb70e7a009f5184f,7260188,5,2017-11-02
4460838,4c4574a066856ce7fb70e7a009f5184f,8755785,4,2017-11-02
4460839,4c4574a066856ce7fb70e7a009f5184f,8755776,3,2017-11-02
4460840,4c4574a066856ce7fb70e7a009f5184f,6752378,3,2017-11-02


## Saving the data

In [23]:
ratings_df.to_csv('ratings.csv', index=False)

In [24]:
books_df.to_csv('books.csv', index=False)

In [25]:
train.to_csv('train.csv', index=False)

In [26]:
test.to_csv('test.csv', index=False)