# Importing my own Books from Goodreads export tool
Goodreads export using: https://www.goodreads.com/review/import


In [2]:
# Import packages
import pandas as pd
import pyarrow
import json 
import numpy as np

# Import functions from apps folder
from apps.collect_data import *

pd.set_option('max_colwidth', 50)
pd.set_option('display.max_columns', 80)

In [27]:
# Load the Goodreads export and swap spaces for underscores in the column
# names so they can be used directly inside DataFrame.query expressions.
mybooksgr = (
    pd.read_csv("assets/goodreads_library_export.csv")
    .rename(columns=lambda name: name.replace(' ', '_'))
)


In [31]:
# Spot-check the import: list every book whose Author contains "Freida".
# NOTE(review): the `==True` comparison presumably turns missing Author values into False — confirm.
mybooksgr.query('Author.str.contains("Freida")==True')

Unnamed: 0,Book_Id,Title,Author,Author_l-f,Additional_Authors,ISBN,ISBN13,My_Rating,Average_Rating,Publisher,Binding,Number_of_Pages,Year_Published,Original_Publication_Year,Date_Read,Date_Added,Bookshelves,Bookshelves_with_positions,Exclusive_Shelf,My_Review,Spoiler,Private_Notes,Read_Count,Owned_Copies
12,123423832,The Coworker,Freida McFadden,"McFadden, Freida",,"=""172829620X""","=""9781728296203""",0,3.95,Poisoned Pen Press,Paperback,362.0,2023,2023.0,,2023/08/25,to-read,to-read (#179),to-read,,,,0,0
20,62080187,Never Lie,Freida McFadden,"McFadden, Freida",,"=""""","=""""",0,4.17,Hollywood Upstairs Press,Kindle Edition,286.0,2022,2022.0,,2023/08/21,to-read,to-read (#172),to-read,,,,0,0
29,60556912,"The Housemaid (The Housemaid, #1)",Freida McFadden,"McFadden, Freida",,"=""1803144378""","=""""",5,4.35,Bookouture,Kindle Edition,329.0,2022,2022.0,,2023/06/17,,,read,,,,1,0
69,61149872,The Inmate,Freida McFadden,"McFadden, Freida",,"=""""","=""""",0,4.12,Hollywood Upstairs Press,Kindle Edition,388.0,2022,2022.0,,2023/07/01,to-read,to-read (#139),to-read,,,,0,0
74,52636419,The Wife Upstairs,Freida McFadden,"McFadden, Freida",,"=""""","=""""",4,4.21,Hollywood Upstairs Press,Kindle Edition,419.0,2020,2020.0,,2023/06/24,,,read,,,,1,0
79,62848145,"The Housemaid's Secret (The Housemaid, #2)",Freida McFadden,"McFadden, Freida",,"=""1837901317""","=""9781837901319""",4,4.3,Bookouture,Kindle Edition,318.0,2023,,,2023/06/17,,,read,,,,1,0


In [32]:
# Collect additional book data from GBApi and OLApi for every book in the export.
# NOTE(review): book_info_add is not defined in this notebook; it presumably comes
# from the star import of apps.collect_data and calls external APIs — confirm before re-running.
apimydf = book_info_add(mybooksgr)

In [33]:
# Left-join the API results onto the Goodreads export by title, so every
# Goodreads row is kept; overlapping column names get a source suffix.
mybooks = mybooksgr.merge(
    apimydf,
    how='left',
    on='Title',
    suffixes=('_Goodreads', '_GoogleBooks'),
)

# Data cleaning books df
- page count categories
- filter on whether the book has been read or not

In [35]:
# Page count category variable

def categorize_pages(number_of_pages):
    """Map a page count to its page-range label.

    Parameters
    ----------
    number_of_pages : int or float
        Page count of a single book. May be missing (NaN/None).

    Returns
    -------
    str or float
        One of '100-249', '250-349', '350-449', '450-599', '600-749',
        '750-999' or '1000+'. Returns NaN when the page count is missing or
        below 100, because no label exists for those books; previously they
        were silently counted as '1000+'.
    """
    # Missing or too-short books cannot be placed in any of the known buckets.
    if pd.isna(number_of_pages) or number_of_pages < 100:
        return np.nan

    # Exclusive upper bounds keep the buckets contiguous, so fractional
    # values such as 249.5 no longer fall through to '1000+'.
    upper_bounds = (
        (250, '100-249'),
        (350, '250-349'),
        (450, '350-449'),
        (600, '450-599'),
        (750, '600-749'),
        (1000, '750-999'),
    )
    for upper_bound, label in upper_bounds:
        if number_of_pages < upper_bound:
            return label
    return '1000+'

# Page-range labels listed from the shortest to the longest books
category_order = ['100-249', '250-349', '350-449', '450-599', '600-749', '750-999', '1000+']

# Label every book by its page count, then store the labels in 'Page_Cat'
# as an ordered categorical so sorting and plotting follow category_order
page_labels = mybooks['Number_of_Pages'].apply(categorize_pages)
mybooks['Page_Cat'] = pd.Categorical(page_labels, categories=category_order, ordered=True)


In [36]:
# Drop duplicates: keep only the first row for each (Title, Author) pair.
# NOTE(review): duplicates presumably arise from the left merge on 'Title' alone — confirm.
mybooks = mybooks.drop_duplicates(subset=['Title', 'Author'])

In [38]:
# Create year and quarter read variables

# Impute Date_Added where Date_Read is missing for books that were read once.
# The comparison must be parenthesised: `&` binds tighter than `==`, so
# `a.isnull() & b == 1` is evaluated as `(a.isnull() & b) == 1`, i.e. a bitwise
# AND between the mask and the integer Read_Count column.
read_once_without_date = mybooks['Date_Read'].isnull() & (mybooks['Read_Count'] == 1)
mybooks['Date_Read'] = np.where(read_once_without_date, mybooks['Date_Added'], mybooks['Date_Read'])

# Convert 'Date_Read' column to datetime type
mybooks['Date_Read'] = pd.to_datetime(mybooks['Date_Read'], format='mixed')

# Extract year and quarter from 'Date_Read' column
mybooks['Year'] = mybooks['Date_Read'].dt.year
mybooks['Quarter'] = mybooks['Date_Read'].dt.quarter

# Create a new column combining year and quarter (missing for books without a read date)
mybooks['Year_Quarter'] = np.where(mybooks['Date_Read'].notnull(), mybooks['Year'].astype(str) + '-Q' + mybooks['Quarter'].astype(str), np.nan)
# Year and Quarter are floats when some dates are missing, which yields strings
# like '2023.0-Q3.0'. Strip the literal '.0'; regex=False keeps '.' from being
# treated as a wildcard (which would also eat the '20' in '2020').
mybooks['Year_Quarter'] = mybooks['Year_Quarter'].fillna('').str.replace('.0', '', regex=False)

# Convert Year_Quarter to an ordered categorical variable
mybooks['Year_Quarter'] = pd.Categorical(mybooks['Year_Quarter'], ordered=True)

In [41]:
# Missing publication years are set to 0 so the whole column can be stored as integers
mybooks['Original_Publication_Year'] = (
    mybooks['Original_Publication_Year']
    .fillna(0)
    .astype(int)
)

In [43]:
# Normalise missing values: turn the literal strings 'nan' and 'NaN' into real
# np.nan (at least one variable contained the string instead of a true missing value)
import numpy as np
mybooks = mybooks.replace(['nan', 'NaN'], np.nan)

In [44]:
# Save the cleaned books dataframe to disk as a pickle file.
# NOTE(review): DataFrame.to_pickle is a pandas method, so the `pickle` import below looks unused here.
import pickle
mybooks.to_pickle("assets/my_books.pkl")

# Collect book topics 

In [21]:
# Collect topics for my own books from OLapi.
# NOTE(review): get_book_topics is not defined in this notebook; it presumably comes
# from the star import of apps.collect_data — confirm its return type is JSON-serialisable.
my_topics = get_book_topics(mybooks)

In [22]:
# Write the collected topics to a JSON file in the assets folder
with open("assets/my_topics.json", "w") as outfile:
    json.dump(my_topics, outfile)