Data from [UCSD](https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home)
 + contains 2.3 million books from 2017 and their shelves
 + Mengting Wan, Julian McAuley, "Item Recommendation on Monotonic Behavior Chains", in RecSys'18.  [bibtex]
 + "Books" [Dataset](https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home#h.p_GBrT1fh_1XTD) 

In [44]:
import pandas as pd
import numpy as np

pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [None]:
df = pd.read_json("goodreads_books.json", lines=True)

In [22]:
# This is a fairly large DF, want to get a sense of the memory footprint before we go further
df.memory_usage()

Index                        128
isbn                    18885240
text_reviews_count      18885240
series                  18885240
country_code            18885240
language_code           18885240
popular_shelves         18885240
asin                    18885240
is_ebook                18885240
average_rating          18885240
kindle_asin             18885240
similar_books           18885240
description             18885240
format                  18885240
link                    18885240
authors                 18885240
publisher               18885240
num_pages               18885240
publication_day         18885240
isbn13                  18885240
publication_month       18885240
edition_information     18885240
publication_year        18885240
url                     18885240
image_url               18885240
book_id                 18885240
ratings_count           18885240
work_id                 18885240
title                   18885240
title_without_series    18885240
dtype: int

In [5]:
# usage: 522.3+ MB
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2360655 entries, 0 to 2360654
Data columns (total 29 columns):
 #   Column                Dtype 
---  ------                ----- 
 0   isbn                  object
 1   text_reviews_count    object
 2   series                object
 3   country_code          object
 4   language_code         object
 5   popular_shelves       object
 6   asin                  object
 7   is_ebook              object
 8   average_rating        object
 9   kindle_asin           object
 10  similar_books         object
 11  description           object
 12  format                object
 13  link                  object
 14  authors               object
 15  publisher             object
 16  num_pages             object
 17  publication_day       object
 18  isbn13                object
 19  publication_month     object
 20  edition_information   object
 21  publication_year      object
 22  url                   object
 23  image_url             object
 24

In [8]:
df.columns

Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series'],
      dtype='object')

In [11]:
df.shape
# 2.3 million rows, 29 columns

(2360655, 29)

In [10]:
df.head()

Unnamed: 0,isbn,text_reviews_count,series,country_code,language_code,popular_shelves,asin,is_ebook,average_rating,kindle_asin,...,publication_month,edition_information,publication_year,url,image_url,book_id,ratings_count,work_id,title,title_without_series
0,312853122.0,1,[],US,,"[{'count': '3', 'name': 'to-read'}, {'count': ...",,False,4.0,,...,9.0,,1984.0,https://www.goodreads.com/book/show/5333265-w-...,https://images.gr-assets.com/books/1310220028m...,5333265,3,5400751,W.C. Fields: A Life on Film,W.C. Fields: A Life on Film
1,743509986.0,6,[],US,,"[{'count': '2634', 'name': 'to-read'}, {'count...",,False,3.23,B000FC0PBC,...,10.0,Abridged,2001.0,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,1333909,10,1323437,Good Harbor,Good Harbor
2,,7,[189911],US,eng,"[{'count': '58', 'name': 'to-read'}, {'count':...",B00071IKUY,False,4.03,,...,,Book Club Edition,1987.0,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,7327624,140,8948723,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","The Unschooled Wizard (Sun Wolf and Starhawk, ..."
3,743294297.0,3282,[],US,eng,"[{'count': '7615', 'name': 'to-read'}, {'count...",,False,3.49,B002ENBLOK,...,7.0,,2009.0,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,6066819,51184,6243154,Best Friends Forever,Best Friends Forever
4,850308712.0,5,[],US,,"[{'count': '32', 'name': 'to-read'}, {'count':...",,False,3.4,,...,,,,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,287140,15,278577,Runic Astrology: Starcraft and Timekeeping in ...,Runic Astrology: Starcraft and Timekeeping in ...


In [41]:
# checking counts/null values and replacing nulls to be calculated
df.replace(r'^\s*$', np.nan, regex=True, inplace = True)

In [45]:
# checking non-numerical fields
df.groupby(['text_reviews_count']).size().reset_index(name='count')

Unnamed: 0,text_reviews_count,count
0,0,727
1,1,585463
2,10,46371
3,100,856
4,1000,9
5,1001,5
6,10018,1
7,1002,9
8,1003,8
9,10031,1


In [50]:
# Convert string columns to ints and floats for calculation
df['text_reviews_count'] = pd.to_numeric(df['text_reviews_count']).astype('float')
df['average_rating'] = df['average_rating'].astype('float')
df['publication_month'] = df['publication_month'].astype('float')
df['ratings_count'] = df['ratings_count'].astype('float')

In [52]:
# Percent null values for each column
df.isnull().sum()* 100 / len(df)

isbn                    41.656786
text_reviews_count       0.022197
series                   0.000000
country_code             0.020757
language_code           44.909273
popular_shelves          0.000000
asin                    80.110732
is_ebook                 0.020757
average_rating           0.022197
kindle_asin             57.006424
similar_books            0.000000
description             17.462653
format                  27.397227
link                     0.022197
authors                  0.000000
publisher               27.719510
num_pages               32.369533
publication_day         43.395964
isbn13                  33.052818
publication_month       37.402543
edition_information     90.764724
publication_year        25.400789
url                      0.022197
image_url                0.020757
book_id                  0.000000
ratings_count            0.022197
work_id                  0.022197
title                    0.000297
title_without_series     0.000297
dtype: float64