In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Use seaborn's "whitegrid" style as the global plotting theme for the notebook.
sns.set_theme(style="whitegrid")

# Notes

If we use tags as predictors, we will end up with a lot of predictors.
Hence, we need some sort of variable selection method.
We can use lasso, stepwise, etc.

In [3]:
# Folder holding the goodbooks-10k CSV files, relative to the notebook's
# working directory. Keeping it in one constant gives every read the same
# prefix and makes relocating the data a one-line change.
DATA_DIR = "./goodbooks-10k"

books = pd.read_csv(f"{DATA_DIR}/books.csv")          # per-book metadata
book_tags = pd.read_csv(f"{DATA_DIR}/book_tags.csv")  # (goodreads_book_id, tag_id, count)
ratings = pd.read_csv(f"{DATA_DIR}/ratings.csv")      # (user_id, book_id, rating)
tags = pd.read_csv(f"{DATA_DIR}/tags.csv")            # (tag_id, tag_name)
to_read = pd.read_csv(f"{DATA_DIR}/to_read.csv")

In [4]:
# Number of distinct users that appear in the ratings table.
# dropna=False keeps a missing user_id counted as its own value.
print(ratings["user_id"].nunique(dropna=False))

53424


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of ratings.
ratings.describe()

Unnamed: 0,user_id,book_id,rating
count,5976479.0,5976479.0,5976479.0
mean,26224.46,2006.477,3.919866
std,15413.23,2468.499,0.9910868
min,1.0,1.0,1.0
25%,12813.0,198.0,3.0
50%,25938.0,885.0,4.0
75%,39509.0,2973.0,5.0
max,53424.0,10000.0,5.0


In [6]:
# Count of distinct user ids in ratings (a missing id would count as one value).
ratings["user_id"].nunique(dropna=False)

53424

In [7]:
# Summary statistics for the numeric columns of books; a count below 10000
# in the output marks a column with missing values.
books.describe()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn13,original_publication_year,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5
count,10000.0,10000.0,10000.0,10000.0,10000.0,9415.0,9979.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,5264697.0,5471214.0,8646183.0,75.7127,9755044000000.0,1981.987674,4.002191,54001.24,59687.32,2919.9553,1345.0406,3110.885,11475.8938,19965.7,23789.81
std,2886.89568,7575462.0,7827330.0,11751060.0,170.470728,442861900000.0,152.576665,0.254427,157370.0,167803.8,6124.378132,6635.626263,9717.123578,28546.449183,51447.36,79768.89
min,1.0,1.0,1.0,87.0,1.0,195170300.0,-1750.0,2.47,2716.0,5510.0,3.0,11.0,30.0,323.0,750.0,754.0
25%,2500.75,46275.75,47911.75,1008841.0,23.0,9780316000000.0,1990.0,3.85,13568.75,15438.75,694.0,196.0,656.0,3112.0,5405.75,5334.0
50%,5000.5,394965.5,425123.5,2719524.0,40.0,9780452000000.0,2004.0,4.02,21155.5,23832.5,1402.0,391.0,1163.0,4894.0,8269.5,8836.0
75%,7500.25,9382225.0,9636112.0,14517750.0,67.0,9780831000000.0,2011.0,4.18,41053.5,45915.0,2744.25,885.0,2353.25,9287.0,16023.5,17304.5
max,10000.0,33288640.0,35534230.0,56399600.0,3455.0,9790008000000.0,2017.0,4.82,4780653.0,4942365.0,155254.0,456191.0,436802.0,793319.0,1481305.0,3011543.0


In [8]:
# Distinct values per column of books. dropna=False makes NaN count as a
# value of its own, matching a plain len(column.unique()).
books.nunique(dropna=False)

book_id                      10000
goodreads_book_id            10000
best_book_id                 10000
work_id                      10000
books_count                    597
isbn                          9301
isbn13                        9154
authors                       4664
original_publication_year      294
original_title                9275
title                         9964
language_code                   26
average_rating                 184
ratings_count                 9003
work_ratings_count            9053
work_text_reviews_count       4581
ratings_1                     2630
ratings_2                     4117
ratings_3                     6972
ratings_4                     7762
ratings_5                     8103
image_url                     6669
small_image_url               6669
dtype: int64

In [9]:
# (rows, columns) of the ratings table.
ratings.shape

(5976479, 3)

In [10]:
# (rows, columns) of the tags table.
tags.shape

(34252, 2)

In [11]:
# Peek at the first five rows of book_tags.
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [12]:
# Most heavily used (book, tag) pairs: attach tag names, leave out tag 30574
# ("to-read"), keep pairs whose count exceeds 5000, largest count first.
(
    pd.merge(book_tags, tags, on="tag_id")
    .query("tag_id != 30574 and count > 5000")
    .sort_values(by="count", ascending=False)
)
     

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
30182,2548866,8717,77785,currently-reading
25759,24280,8717,64311,currently-reading
20359,2767052,11557,50755,favorites
14244,3,11557,48220,favorites
26313,38447,8717,47919,currently-reading
...,...,...,...,...
30974,7090447,8717,5013,currently-reading
38065,7126,11743,5012,fiction
12413,887877,11305,5011,fantasy
38866,18512,11743,5010,fiction


In [21]:
# Distinct work_id values in books (NaN, if present, counts as one value).
books["work_id"].nunique(dropna=False)

10000

In [23]:
# Distinct goodreads_book_id values in books (NaN, if present, counts as one value).
books["goodreads_book_id"].nunique(dropna=False)

10000

## Tags

- There are 34,252 tags in `tags.csv` (one row per tag; see `tags.shape`).
- Some tags should be ignored
    - ex: currently-reading, to-read
- Some tags are equivalent but spelled differently
    - ex: favorites, favourites, F a v o r i t e s

- Choose p tags as predictors

In [13]:
# Tags left out of the analysis, keyed by tag_id (value is the tag name).
TAGS_TO_IGNORE = {
    30574: "to-read",
    8717: "currently-reading",
}

# Names of the tags that were applied more than 2000 times to at least one
# book, skipping the ignored tag ids. Names keep their first-appearance order.
# Note: the column is read as df["count"] because df.count is a DataFrame method.
popular_tags = (
    pd.merge(book_tags, tags, on="tag_id")
    .loc[lambda df: ~df["tag_id"].isin(list(TAGS_TO_IGNORE)) & (df["count"] > 2000)]
    ["tag_name"]
    .unique()
)

# Top raters

In [14]:
# Display the ratings table; the rendered output is truncated to its first and last rows.
ratings

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3
...,...,...,...
5976474,49925,510,5
5976475,49925,528,4
5976476,49925,722,4
5976477,49925,949,5


In [15]:
# Ratings per user, most active users first. After count() the "book_id" and
# "rating" columns hold the number of non-null entries for each user_id.
ratings.groupby("user_id", as_index=False).count().sort_values(
    by="rating", ascending=False
)

Unnamed: 0,user_id,book_id,rating
30943,30944,200,200
12873,12874,200,200
52035,52036,199,199
12380,12381,199,199
28157,28158,199,199
...,...,...,...
32127,32128,21,21
40752,40753,21,21
51724,51725,21,21
43674,43675,20,20


In [16]:
# Hard-coded from the per-user counts shown above: user 30944 has 200 ratings,
# the largest count in that table (user 12874 is tied at 200).
top_user = 30944

## Tags per book

In [17]:
# Number of tag rows attached to each book, largest first. After count() the
# "tag_id", "count" and "tag_name" columns all hold per-book row counts.
(
    pd.merge(book_tags, tags, on="tag_id")
    .groupby("goodreads_book_id", as_index=False)
    .count()
    .sort_values(by="count", ascending=False)
)

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name
0,1,100,100,100
6668,6344097,100,100,100
6661,6338619,100,100,100
6662,6339304,100,100,100
6663,6339664,100,100,100
...,...,...,...,...
3335,91571,100,100,100
9999,33288638,100,100,100
4650,280958,94,94,94
9220,18607805,62,62,62


# Top tags

In [18]:
# Ten most-used tags across all books, ranked by the total of "count".
#
# Only numeric columns are carried into the aggregation. The earlier selection
# also kept "authors" and "original_title"; older pandas silently dropped such
# string columns inside sum(), but pandas >= 2.0 defaults to
# numeric_only=False and attempts to sum them. Selecting numeric columns and
# passing numeric_only=True yields the same four result columns on any version.
#
# NOTE: the summed "goodreads_book_id" and "tag_id" columns are kept only so
# the result has the same columns as before; sums of identifiers carry no
# meaning. Bookkeeping tags such as "to-read" and "currently-reading" are NOT
# filtered out here.
top_tags = (
    book_tags
    .merge(books, on="goodreads_book_id")
    .filter(items=["count", "goodreads_book_id", "tag_id", "ratings_count"])
    .merge(tags, on="tag_id")
    .groupby("tag_name", as_index=False)
    .sum(numeric_only=True)
    .sort_values("count", ascending=False)
    .head(10)
)

In [19]:
# Display top_tags: ten rows, ordered by total tag count (descending).
top_tags

Unnamed: 0,tag_name,count,goodreads_book_id,tag_id,ratings_count
30574,to-read,140718761,52428278826,305220242,539430823
8717,currently-reading,7507958,50864864557,85217392,514150222
11557,favorites,4503173,52111252697,114194717,524652206
11743,fiction,3688819,48136824753,106826071,512622984
11305,fantasy,3548157,19754086947,48147995,273906403
33114,young-adult,1848306,21650946753,120203820,290167626
7457,classics,1756920,1611584464,20767745,229013859
5207,books-i-own,1317235,51362896962,51023393,519363456
26138,romance,1231926,28792484719,111112638,248335972
22743,owned,1224279,51887174731,224200494,527289733
