# Goodreads books recommender system

## Preprocessing

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
from ast import literal_eval

### Books

#### Cleaning

In [8]:
# Load the enriched books table. The first CSV column becomes the index and
# the `genres` column is parsed with `literal_eval` while the file is read.
books_df = pd.read_csv(
    'https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/books_enriched.csv',
    index_col=[0],
    converters={"genres": literal_eval},
)

In [16]:
# Discard the columns this notebook does not need.
books_df = books_df.drop(
    ["description", "image_url", "small_image_url", "index", "authors_2"],
    axis="columns",
)

In [49]:
# Distinct genre labels: flatten the per-book genre entries, then de-duplicate.
set(books_df["genres"].explode().unique())

{'art',
 'biography',
 'books',
 'business',
 'chick-lit',
 'christian',
 'classics',
 'comics',
 'contemporary',
 'cookbooks',
 'crime',
 'fantasy',
 'fiction',
 'gay-and-lesbian',
 'graphic-novels',
 'historical-fiction',
 'history',
 'horror',
 'humor-and-comedy',
 'manga',
 'memoir',
 'music',
 'mystery',
 'nonfiction',
 'paranormal',
 'philosophy',
 'poetry',
 'psychology',
 'religion',
 'romance',
 'science',
 'science-fiction',
 'self-help',
 'spirituality',
 'sports',
 'suspense',
 'thriller',
 'travel',
 'young-adult'}

In [52]:
# Restrict the catalogue to rows whose language code is exactly 'eng'.
books_df = books_df.loc[books_df["language_code"].eq('eng')]

In [54]:
# Keep the first row for every title and discard later rows repeating it.
books_df = books_df[~books_df.duplicated(subset='title', keep='first')]

In [None]:
# Make sure the list-like columns hold real Python lists.
#
# `genres` is already parsed by the `literal_eval` converter given to
# `read_csv`, so calling `.strip()` on its values raised AttributeError.
# `authors` is parsed with `literal_eval` rather than by deleting quote
# characters, so apostrophes inside a name are preserved. Values that are
# not strings are left untouched, which keeps this cell safe to re-run.
def _as_list(value):
    """Return `value` as a list, parsing it first when it is list-like text.

    Parameters
    ----------
    value : object
        A cell value from a list-like column.

    Returns
    -------
    object
        * non-string input: returned unchanged (e.g. an already-parsed list);
        * string that `literal_eval` parses to a list/tuple: that sequence
          as a list;
        * any other string: split with the legacy rule
          ``strip('[]').replace("'", "").split(", ")``.
    """
    if not isinstance(value, str):
        return value
    try:
        parsed = literal_eval(value)
    except (ValueError, SyntaxError, TypeError):
        # Not a valid Python literal; fall through to the legacy split below.
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return value.strip('[]').replace("'", "").split(", ")


# `assign` builds a new frame instead of writing into a filtered one.
books_df = books_df.assign(
    authors=books_df['authors'].map(_as_list),
    genres=books_df['genres'].map(_as_list),
)

#### EDA

In [68]:
# Preview the first five rows of the books table.
books_df.head(n=5)

Unnamed: 0,authors,average_rating,best_book_id,book_id,books_count,genres,goodreads_book_id,isbn,isbn13,language_code,...,ratings_3,ratings_4,ratings_5,ratings_count,small_image_url,title,work_id,work_ratings_count,work_text_reviews_count,authors_2
0,[Suzanne Collins],4.34,2767052,1,272,"[young-adult, fiction, fantasy, science-fictio...",2767052,439023483,9780439000000.0,eng,...,560092,1481305,2706317,4780653,https://images.gr-assets.com/books/1447303603s...,"The Hunger Games (The Hunger Games, #1)",2792775,4942365,155254,['Suzanne Collins']
1,"[J.K. Rowling, Mary GrandPré]",4.44,3,2,491,"[fantasy, fiction, young-adult, classics]",3,439554934,9780440000000.0,eng,...,455024,1156318,3011543,4602479,https://images.gr-assets.com/books/1474154022s...,Harry Potter and the Sorcerer's Stone (Harry P...,4640799,4800065,75867,"['J.K. Rowling', 'Mary GrandPré']"
2,[Stephenie Meyer],3.57,41865,3,226,"[young-adult, fantasy, romance, fiction, paran...",41865,316015849,9780316000000.0,eng,...,793319,875073,1355439,3866839,https://images.gr-assets.com/books/1361039443s...,"Twilight (Twilight, #1)",3212258,3916824,95009,['Stephenie Meyer']
3,[Harper Lee],4.25,2657,4,487,"[classics, fiction, historical-fiction, young-...",2657,61120081,9780061000000.0,eng,...,446835,1001952,1714267,3198671,https://images.gr-assets.com/books/1361975680s...,To Kill a Mockingbird,3275794,3340896,72586,['Harper Lee']
4,[F. Scott Fitzgerald],3.89,4671,5,1356,"[classics, fiction, historical-fiction, romance]",4671,743273567,9780743000000.0,eng,...,606158,936012,947718,2683664,https://images.gr-assets.com/books/1490528560s...,The Great Gatsby,245494,2773745,51992,['F. Scott Fitzgerald']


In [69]:
# Print a structural summary of the books table: index range, column names,
# non-null counts and dtypes.
books_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9645 entries, 0 to 9999
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   authors                    9645 non-null   object 
 1   average_rating             9645 non-null   float64
 2   best_book_id               9645 non-null   int64  
 3   book_id                    9645 non-null   int64  
 4   books_count                9645 non-null   int64  
 5   genres                     9645 non-null   object 
 6   goodreads_book_id          9645 non-null   int64  
 7   isbn                       9026 non-null   object 
 8   isbn13                     9110 non-null   float64
 9   language_code              9645 non-null   object 
 10  original_publication_year  9625 non-null   float64
 11  original_title             9085 non-null   object 
 12  pages                      9576 non-null   float64
 13  publishDate                9637 non-null   objec

In [66]:
# The 20 books with the highest number of ratings, with their authors and
# average rating.
(
    books_df[["title", "authors", "average_rating", "ratings_count"]]
    .sort_values(by='ratings_count', ascending=False)
    .head(20)
)

Unnamed: 0,title,authors,average_rating,ratings_count
0,"The Hunger Games (The Hunger Games, #1)",['Suzanne Collins'],4.34,4780653
1,Harry Potter and the Sorcerer's Stone (Harry P...,"['J.K. Rowling', 'Mary GrandPré']",4.44,4602479
2,"Twilight (Twilight, #1)",['Stephenie Meyer'],3.57,3866839
3,To Kill a Mockingbird,['Harper Lee'],4.25,3198671
4,The Great Gatsby,['F. Scott Fitzgerald'],3.89,2683664
5,The Fault in Our Stars,['John Green'],4.26,2346404
6,The Hobbit,['J.R.R. Tolkien'],4.25,2071616
7,The Catcher in the Rye,['J.D. Salinger'],3.79,2044241
9,Pride and Prejudice,['Jane Austen'],4.24,2035490
8,"Angels & Demons (Robert Langdon, #1)",['Dan Brown'],3.85,2001311


In [67]:
# Most popular authors

# The 100 books with the highest average rating.
top_100 = books_df.sort_values(by='average_rating', ascending=False).head(100)

# Walk through every book's `authors` entry, count how often each item
# occurs across the top 100, and keep the 15 most frequent.
authors_top_100 = (
    pd.Series([name for names in top_100['authors'] for name in names])
    .value_counts()
    .to_frame('counts')
    .reset_index()
    .head(15)
)

### Ratings

In [18]:
# Load the user/book ratings table.
ratings_df = pd.read_csv(
    'https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/ratings.csv'
)

In [23]:
# Preview the first five ratings.
ratings_df.head(n=5)

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [24]:
# Print a structural summary of the ratings table: index range, column
# dtypes and memory usage.
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5976479 entries, 0 to 5976478
Data columns (total 3 columns):
 #   Column   Dtype
---  ------   -----
 0   user_id  int64
 1   book_id  int64
 2   rating   int64
dtypes: int64(3)
memory usage: 136.8 MB


In [29]:
# Number of distinct users who left at least one rating.
ratings_df["user_id"].nunique(dropna=False)

53424

The ratings come from roughly 53k distinct users (53,424).

### Other tables
It is not yet clear whether these tables will be needed for our purpose:
- **tags** - shelf name (genre) and the count of books on that shelf
- **book_tags** - the shelves, with counts, for each book
- **to_read** - the books each user has marked as to-read

In [19]:
# Load the tags table.
tags_df = pd.read_csv(
    'https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/tags.csv'
)

In [20]:
# Load the book_tags table.
book_tags_df = pd.read_csv(
    'https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/book_tags.csv'
)

In [21]:
# Load the to_read table.
to_read_df = pd.read_csv(
    'https://raw.githubusercontent.com/malcolmosh/goodbooks-10k/master/to_read.csv'
)