# Goodreads books recommender system

This notebook creates a clean data from the existing data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gc
from ast import literal_eval
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

## Books

In [2]:
books_df = pd.read_csv('https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/books_enriched.csv', index_col=[0], converters={"genres": literal_eval})

In [3]:
len(books_df)

10000

In [4]:
# use goodreads book id instead of the arbitrary id
books_df["book_id"] = books_df.goodreads_book_id

In [5]:
# Dropping unneeded columns
books_df = books_df.drop(columns=["description", "image_url", "small_image_url", "index", "authors_2", "best_book_id", "goodreads_book_id"], errors = 'ignore')

In [6]:
# Keep only english books.
books_df = books_df[books_df.language_code == 'eng']

In [7]:
# drop books with duplicated title
books_df = books_df.drop_duplicates(subset = ['title'])

In [8]:
# transform list columns to list
books_df['authors'] = books_df['authors'].apply(lambda x: x.strip('[]').replace("'","").split(", "))

## Ratings

In [9]:
ratings_df = pd.read_csv('reviews_updated2.csv')

In [10]:
# remove index column
ratings_df = ratings_df.drop(columns=["Unnamed: 0"])

In [11]:
# convert date to timestamp, and keep only the date
ratings_df["date_added"] = pd.to_datetime(ratings_df.date_added, format='%Y/%m/%d', utc = True)
ratings_df["date_added"] = ratings_df.date_added.dt.date

In [13]:
# filter out reviews preceding goodreads site
ratings_df = ratings_df[ratings_df.date_added >= pd.Timestamp(year=2000, month=1, day=1)]

  ratings_df = ratings_df[ratings_df.date_added >= pd.Timestamp(year=2000, month=1, day=1)]


In [17]:
# filter out 0 reviews
ratings_df = ratings_df.loc[ratings_df['rating'] != 0]

In [None]:
i = 0
while True:
    i += 1
    l = len(ratings_df)
    
    # remove users with low number of reviews (<75)
    user_review_counts = ratings_df['user_id'].value_counts()
    ratings_df = ratings_df[ratings_df['user_id'].isin(user_review_counts[user_review_counts >= 75].index)]
    
    # filter out books with low number of reviews (<75)
    book_review_counts = ratings_df.groupby('book_id').count()
    popular_books = book_review_counts[book_review_counts['user_id'] >= 75].index.tolist()
    ratings_df = ratings_df[ratings_df['book_id'].isin(popular_books)]
    
    # remove the removed books from ratings 
    ratings_df = ratings_df[ratings_df.book_id.isin(books_df.book_id)]
    if len(ratings_df) == l:
        break
    
print('number of iters:', i)

In [None]:
# filtering books which are no longer in the dataset
books_df = books_df[books_df['book_id'].isin(ratings_df['book_id'])]

In [None]:
len(ratings_df["user_id"].unique()), len(ratings_df), len(books_df)

### Other tables
Don't know if these will be needed for our purpose:
- **tags** - Shelf name (genre) and count of the books in that shelf
- **book_tags** - Shelves with the count for each book
- **to_read** - books to read for user

In [16]:
tags_df = pd.read_csv('https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/tags.csv')

In [17]:
book_tags_df = pd.read_csv('https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/book_tags.csv')

In [18]:
to_read_df = pd.read_csv('https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/to_read.csv')

## Splits

In [19]:
%%time

tmp = ratings_df.sort_values(by='date_added')

rows_by_user_id = dict()
for index, row in tmp.iterrows():
    if row.user_id not in rows_by_user_id:
        rows_by_user_id[row.user_id] = []
    rows_by_user_id[row.user_id].append(row)

train, test = [], []
for users_ratings in rows_by_user_id.values():
    indx = int(len(users_ratings) * 0.8)
    train.extend(users_ratings[:indx])
    test.extend(users_ratings[indx:])

train, test = pd.DataFrame(train), pd.DataFrame(test)

CPU times: user 1min 16s, sys: 1.07 s, total: 1min 17s
Wall time: 1min 18s


## Saving the data

In [20]:
ratings_df.to_csv('ratings.csv', index=False)

In [21]:
books_df.to_csv('books.csv', index=False)

In [22]:
train.to_csv('train.csv', index=False)

In [23]:
test.to_csv('test.csv', index=False)