# Importing my own Books from Goodreads export tool
Goodreads export using: https://www.goodreads.com/review/import


In [8]:
# Improt packages
import pandas as pd
import pyarrow
import json 
import numpy as np

# Import functions from apps folder
from apps.collect_data import *

pd.set_option('max_colwidth', 50)
pd.set_option('display.max_columns', 80)

In [2]:
mybooksgr = pd.read_csv("assets/goodreads_library_export.csv")
mybooksgr = mybooksgr.rename(columns=lambda x: x.replace(' ', '_'))


In [3]:
# Collecting data from GBApi and OLApi
apimydf = book_info_add(mybooksgr)

In [4]:
# Merge dataframes 
mybooks = pd.merge(mybooksgr,
                     apimydf,
                     on='Title', 
                     suffixes = ('_Goodreads', '_GoogleBooks'), 
                     how='left')

# Data cleaning books df
- page count categories
- filter if book is read of not 

In [5]:
# Page count category variable

def categorize_pages(number_of_pages):
    if number_of_pages >= 100 and number_of_pages <= 249:
        return '100-249'
    elif number_of_pages >= 250 and number_of_pages <= 349:
        return '250-349'
    elif number_of_pages >= 350 and number_of_pages <= 449:
        return '350-449'
    elif number_of_pages >= 450 and number_of_pages <= 599:
        return '450-599'
    elif number_of_pages >= 600 and number_of_pages <= 749:
        return '600-749'
    elif number_of_pages >= 750 and number_of_pages <= 999:
        return '750-999'
    else:
        return '1000+'

# Apply the categorize_pages function to create the 'Page_Cat' column
mybooks['Page_Cat'] = mybooks['Number_of_Pages'].apply(categorize_pages)

# Define the desired order of categories
category_order = ['100-249', '250-349', '350-449', '450-599', '600-749', '750-999', '1000+']

# Convert the 'Page_Cat' column to a categorical variable with the specified order
mybooks['Page_Cat'] = pd.Categorical(mybooks['Page_Cat'], categories=category_order, ordered=True)


In [6]:
# drop duplicates
mybooks = mybooks.drop_duplicates(subset=['Title', 'Author'])

In [9]:
# Create year and quarter read variable 

#  Impute data_added where date_read  is na
mybooks['Date_Read'] = np.where(mybooks['Date_Read'].isnull() & mybooks['Read_Count']==1, mybooks['Date_Added'], mybooks['Date_Read'])

# Convert 'Date_Read' column to datetime type
mybooks['Date_Read'] = pd.to_datetime(mybooks['Date_Read'], format='mixed')

# Extract year and quarter from 'Date_Read' column
mybooks['Year'] = mybooks['Date_Read'].dt.year
mybooks['Quarter'] = mybooks['Date_Read'].dt.quarter

# Create a new column combining year and quarter
mybooks['Year_Quarter'] = np.where(mybooks['Date_Read'].notnull(), mybooks['Year'].astype(str) + '-Q' + mybooks['Quarter'].astype(str), np.nan)
# Replace '.0' in the Year_Quarter column with an empty string
mybooks['Year_Quarter'] = mybooks['Year_Quarter'].fillna('').str.replace('.0', '')

# Convert Year_Quarter to categorical variable
mybooks['Year_Quarter'] = pd.Categorical(mybooks['Year_Quarter'], ordered=True)

In [10]:
# filter na in publication year and make column publication year integer 
mybooks = mybooks.dropna(subset=['Original_Publication_Year'])
mybooks['Original_Publication_Year'] = mybooks['Original_Publication_Year'].astype(int)

In [None]:
# Making sure all na is set as np.nan and not as a string variable (had this issue with one variable)
import numpy as np
mybooks = mybooks.replace('nan', np.nan)
mybooks = mybooks.replace('NaN', np.nan)

In [15]:
dtypes_info = mybooks.dtypes.to_dict()
dtypes_info

{'Book_Id': dtype('int64'),
 'Title': dtype('O'),
 'Author': dtype('O'),
 'Author_l-f': dtype('O'),
 'Additional_Authors': dtype('O'),
 'ISBN_Goodreads': dtype('O'),
 'ISBN13': dtype('O'),
 'My_Rating': dtype('int64'),
 'Average_Rating_Goodreads': dtype('float64'),
 'Publisher': dtype('O'),
 'Binding': dtype('O'),
 'Number_of_Pages': dtype('float64'),
 'Year_Published': dtype('int64'),
 'Original_Publication_Year': dtype('int64'),
 'Date_Read': dtype('<M8[ns]'),
 'Date_Added': dtype('O'),
 'Bookshelves': dtype('O'),
 'Bookshelves_with_positions': dtype('O'),
 'Exclusive_Shelf': dtype('O'),
 'My_Review': dtype('float64'),
 'Spoiler': dtype('float64'),
 'Private_Notes': dtype('float64'),
 'Read_Count': dtype('int64'),
 'Owned_Copies': dtype('int64'),
 'Author(s)': dtype('O'),
 'Publish_Date': dtype('O'),
 'Description': dtype('O'),
 'ISBN_GoogleBooks': dtype('O'),
 'Page_Count': dtype('float64'),
 'Categories': dtype('O'),
 'Average_Rating_GoogleBooks': dtype('float64'),
 'Rating_Count':

In [16]:
dtypes_info = mybooks.dtypes.to_dict()

# Write the dictionary to the Python file
with open('assets/dtypes.py', 'w') as py_file:
    py_file.write(f"column_dtypes = {dtypes_info}")

In [23]:
import assets.dtypes.column_dtypes as meta

NameError: name 'dtype' is not defined

In [18]:
mybooks.to_parquet('assets/my_books.parquet')

In [None]:
df = pd.read_csv('assets/my_books.parquet', dtypes=)

# Collect book topics 

In [None]:
# Collect topics for my own books from OLapi
my_topics = get_book_topics(mybooks)

In [None]:
# Writes the topics as 
with open("assets/my_topics.json", "w") as outfile:
    json.dump(my_topics, outfile)