In [224]:
import json
import os
from itertools import islice

import pandas as pd

### Extract a subset of Data from goodreads_books.json dataset. 
<p>Dataset link: https://mengtingwan.github.io/data/goodreads#datasets</p>
Create a subset of this dataset to use for the book recommendation system.

In [225]:
def extract_data(file_path, output_file_path, chunk_size, max_chunks=10, rows_per_chunk=200):
    """Write a small url/title sample of a JSON-lines file to a new JSON-lines file.

    The input is read lazily in chunks of ``chunk_size`` rows. From each of the
    first ``max_chunks`` chunks, the first ``rows_per_chunk`` rows are kept, and
    only their ``url`` and ``title`` fields are written out.

    Args:
        file_path: Path to the JSON-lines input; every record needs ``url`` and ``title``.
        output_file_path: Destination JSON-lines file. Missing parent directories are created.
        chunk_size: Number of input rows per chunk.
        max_chunks: Maximum number of chunks to sample (default 10).
        rows_per_chunk: Rows kept from the start of each chunk (default 200).

    Raises:
        KeyError: If a chunk has no ``url`` or ``title`` column.
    """
    records = []
    # The context manager closes the underlying file handle even if a chunk fails to parse.
    with pd.read_json(file_path, lines=True, chunksize=chunk_size) as reader:
        # islice stops after max_chunks without parsing one more chunk just to discard it.
        for chunk in islice(reader, max_chunks):
            sample = chunk.head(rows_per_chunk)
            records.extend(sample[['url', 'title']].to_dict(orient='records'))

    result_df = pd.DataFrame(records)
    # dirname is "" for a bare filename, and os.makedirs("") raises FileNotFoundError.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    result_df.to_json(output_file_path, orient='records', lines=True)
    print("Data saved successfully. ")
    
    

In [226]:
# Sample the raw Goodreads dump into a small url/title file.
extract_data(
    file_path="../data/goodreads_books_raw.json",
    output_file_path="../data/goodreads_books_processed.json",
    chunk_size=2000,
)

Data saved successfully. 


In [227]:
# Reload the url/title sample written by extract_data (one JSON record per line).
data = pd.read_json(
    path_or_buf="../data/goodreads_books_processed.json",
    lines=True,
)

In [228]:
# Spot-check one of the extracted URLs.
data["url"].iloc[522]

'https://www.goodreads.com/book/show/13586884-star-wars'

In [229]:
# for i in data.index:
#     print(data.iloc[i].url)

### Use the extracted book URLs to scrape data from the goodreads.com website.
The dataset is missing a few important fields that are crucial for the ML model.

In [230]:
## Look in data_scrapper.py file. 

### Data Transformation and processing
- Add book_id
- Fix rating count and reviews columns
- check for null values
- more....

In [231]:
# Load the scraped book records.
# NOTE(review): this path has no "data/" directory, unlike the other files in this notebook; confirm it is intended.
data_df = pd.read_json(path_or_buf="../goodreads_books_extracted.json")

In [232]:
# Preview the first rows of the frame loaded from goodreads_books_extracted.json.
data_df.head()

Unnamed: 0,url,title,author,rating_stars,rating_count,reviews,description,genres,pagecount,published_date,language
0,https://www.goodreads.com/book/show/5333265-w-...,W. C. Fields: A Life on Film,Ronald J. Fields,3.86,7ratings,1review,"Provides plot summaries, cast, credits, and st...",[],"256 pages, Paperback","First published August 1, 1984",English
1,https://www.goodreads.com/book/show/1333909.Go...,Good Harbor,Anita Diamant,3.3,"8,414ratings",960reviews,"From their first meeting, a friendship grows b...","[Fiction, Contemporary, Chick Lit, Womens, Nov...",Audio CD,"First published January 1, 2001",English
2,https://www.goodreads.com/book/show/7327624-th...,The Unschooled Wizard,Barbara Hambly,3.99,158ratings,9reviews,Omnibus book club edition containing the Ladie...,"[Fantasy, Fiction, Own, Audiobook]","600 pages, Hardcover","First published January 1, 1987",English
3,https://www.goodreads.com/book/show/6066819-be...,Best Friends Forever,Jennifer Weiner,3.51,"65,109ratings","4,052reviews",Addie Downs and Valerie Adler were eight when ...,"[Chick Lit, Fiction, Romance, Contemporary, Ad...","368 pages, Hardcover","First published July 14, 2009",English
4,https://www.goodreads.com/book/show/287140.Run...,Runic Astrology: Starcraft and Timekeeping in ...,Nigel Pennick,3.33,21ratings,6reviews,"Book by Pennick, Nigel",[],"272 pages, Paperback","First published January 1, 1990",English


In [233]:
# (rows, columns) of the loaded frame.
data_df.shape

(1949, 11)

In [234]:
# Sanity-check id parsing on a single URL. The id is the run of digits right after
# "/show/"; the slug separator that follows is "-" for some books and "." for others
# (e.g. ".../show/1333909.Good_Harbor"), so both are normalised before splitting.
sample_slug = data_df.url.iloc[0].split("/show/")[1]
int(sample_slug.replace(".", "-").split("-")[0])


5333265

In [235]:
# Using regular expression: the lazy prefix skips ahead to the first run of digits in each URL.
data_df["url"].str.extract(r".*?(\d+)")

Unnamed: 0,0
0,5333265
1,1333909
2,7327624
3,6066819
4,287140
...,...
1944,9481747
1945,25626179
1946,13561038
1947,60751


In [236]:
# Adding book_id column: the first run of digits in the URL, kept as a string.
data_df["book_id"] = data_df["url"].str.extract(r".*?(\d+)", expand=False)

In [237]:
# Raw values look like "8,414ratings" / "960reviews": keep the number, drop the
# thousands separators, and cast to int.
for count_column in ('rating_count', 'reviews'):
    digits = data_df[count_column].str.extract(r'(\d+(?:,\d+)*)', expand=False)
    data_df[count_column] = digits.str.replace(',', '').astype(int)


In [238]:
# Make sure the average rating is stored as a float column.
data_df["rating_stars"] = data_df["rating_stars"].astype("float64")

In [239]:
# pagecount column
# Raw values look like "256 pages, Paperback", or just a format such as "Audio CD".
raw_pagecount = data_df['pagecount']

# The format is whatever follows "pages,"; rows without that marker get 'Not Specified'.
# NOTE(review): format-only values such as "Audio CD" therefore end up as
# 'Not Specified' -- confirm whether the format should be kept in that case.
data_df["format_type"] = (
    raw_pagecount.str.extract(r'pages?,\s*(.*)', expand=False).fillna('Not Specified')
)

# Only digits directly before "page(s)" count as a page count, so a digit inside a
# format name is not mistaken for one. Thousands separators ("1,234") are stripped
# before the cast, which would otherwise fail on them.
page_digits = (
    raw_pagecount.str.extract(r'(\d+(?:,\d+)*)\s*pages?\b', expand=False)
    .str.replace(',', '')
)
# Nullable Int64 keeps rows without a page count as <NA> instead of forcing floats.
data_df['pagecount'] = pd.to_numeric(page_digits).astype('Int64')


In [240]:
# Inspect the derived format_type column.
data_df["format_type"]

0            Paperback
1        Not Specified
2            Hardcover
3            Hardcover
4            Paperback
             ...      
1944             ebook
1945    Kindle Edition
1946     Not Specified
1947         Paperback
1948     Not Specified
Name: format_type, Length: 1949, dtype: object

In [241]:
# Inspect the raw published_date strings before parsing them.
data_df["published_date"]



0        First published August 1, 1984
1       First published January 1, 2001
2       First published January 1, 1987
3         First published July 14, 2009
4       First published January 1, 1990
                     ...               
1944         Published November 6, 2007
1945       First published June 1, 2015
1946     First published March 13, 2012
1947    First published January 1, 1972
1948    First published January 1, 1998
Name: published_date, Length: 1949, dtype: object

In [242]:
# Keep only the "Month D, YYYY" part, dropping prefixes such as "First published";
# rows without a matching date get the 'Not Specified' placeholder.
published = data_df['published_date'].str.extract(r'(\b\w+ \d{1,2}, \d{4}\b)', expand=False)
data_df['published_date'] = published.fillna('Not Specified')

In [243]:
# Preview the frame after the cleaning steps above.
data_df.head()

Unnamed: 0,url,title,author,rating_stars,rating_count,reviews,description,genres,pagecount,published_date,language,book_id,format_type
0,https://www.goodreads.com/book/show/5333265-w-...,W. C. Fields: A Life on Film,Ronald J. Fields,3.86,7,1,"Provides plot summaries, cast, credits, and st...",[],256.0,"August 1, 1984",English,5333265,Paperback
1,https://www.goodreads.com/book/show/1333909.Go...,Good Harbor,Anita Diamant,3.3,8414,960,"From their first meeting, a friendship grows b...","[Fiction, Contemporary, Chick Lit, Womens, Nov...",,"January 1, 2001",English,1333909,Not Specified
2,https://www.goodreads.com/book/show/7327624-th...,The Unschooled Wizard,Barbara Hambly,3.99,158,9,Omnibus book club edition containing the Ladie...,"[Fantasy, Fiction, Own, Audiobook]",600.0,"January 1, 1987",English,7327624,Hardcover
3,https://www.goodreads.com/book/show/6066819-be...,Best Friends Forever,Jennifer Weiner,3.51,65109,4052,Addie Downs and Valerie Adler were eight when ...,"[Chick Lit, Fiction, Romance, Contemporary, Ad...",368.0,"July 14, 2009",English,6066819,Hardcover
4,https://www.goodreads.com/book/show/287140.Run...,Runic Astrology: Starcraft and Timekeeping in ...,Nigel Pennick,3.33,21,6,"Book by Pennick, Nigel",[],272.0,"January 1, 1990",English,287140,Paperback


### Load data to the database

In [244]:
# Saving clean data in a file.
# Round-trip through DataFrame.to_json so missing values (for example <NA> in the
# nullable Int64 pagecount column, or NaN) become JSON null. Records from to_dict
# keep pd.NA / NaN, which json.dump either rejects (TypeError) or writes as the
# invalid token NaN.
data_list = json.loads(data_df.to_json(orient="records"))

with open("../data/goodreads_books_clean.json", "w") as file:
    json.dump(data_list, file, indent=4)