# Importing and cleaning the data

In [66]:
import pandas as pd 
import json
import numpy as np


In [67]:
pd.set_option('max_colwidth', 50)
pd.set_option('display.max_columns', 80)

In [68]:
import json

# Load JSON data from a file
with open('assets/my_topics.json') as file:
    json_data = json.load(file)

# Convert JSON data to a dictionary
my_topics = dict(json_data)

# Load JSON data from a file
with open('assets/wpf_topics.json') as file:
    json_data = json.load(file)

# Convert JSON data to a dictionary
topics_wfp = dict(json_data)

In [69]:
for title, topics_list in topics_wfp.items():
    adjusted_topics = []
    for topic in topics_list:
        adjusted_topics.extend(topic.split(','))
    topics_wfp[title] = adjusted_topics


# Data cleaning books df
- page count categories
- filter if book is read of not 

In [70]:
mybooks = pd.read_parquet("assets/my_book_df.parquet")
# mybooks = mybooks.rename(columns=lambda x: x.replace(' ', '_')) 


In [71]:
def categorize_pages(number_of_pages):
    if number_of_pages >= 100 and number_of_pages <= 249:
        return '100-249'
    elif number_of_pages >= 250 and number_of_pages <= 349:
        return '250-349'
    elif number_of_pages >= 350 and number_of_pages <= 449:
        return '350-449'
    elif number_of_pages >= 450 and number_of_pages <= 599:
        return '450-599'
    elif number_of_pages >= 600 and number_of_pages <= 749:
        return '600-749'
    elif number_of_pages >= 750 and number_of_pages <= 999:
        return '750-999'
    else:
        return '1000+'

# Apply the categorize_pages function to create the 'Page_Cat' column
mybooks['Page_Cat'] = mybooks['Number_of_Pages'].apply(categorize_pages)

# Define the desired order of categories
category_order = ['100-249', '250-349', '350-449', '450-599', '600-749', '750-999', '1000+']

# Convert the 'Page_Cat' column to a categorical variable with the specified order
mybooks['Page_Cat'] = pd.Categorical(mybooks['Page_Cat'], categories=category_order, ordered=True)


In [72]:
mybooks.head()

Unnamed: 0,Book_Id,Title,Author,Author_l-f,Additional_Authors,ISBN_Goodreads,ISBN13,My_Rating,Average_Rating_Goodreads,Publisher,Binding,Number_of_Pages,Year_Published,Original_Publication_Year,Date_Read,Date_Added,Bookshelves,Bookshelves_with_positions,Exclusive_Shelf,My_Review,Spoiler,Private_Notes,Read_Count,Owned_Copies,Author(s),Publish_Date,Description,ISBN_GoogleBooks,Page_Count,Categories,Average_Rating_GoogleBooks,Rating_Count,Language,Page_Cat
0,16299,And Then There Were None,Agatha Christie,"Christie, Agatha",,"=""0312330871""","=""9780312330873""",3,4.28,St. Martin's Griffin,Paperback,264.0,2004,1939.0,,2023/07/07,,,read,,,,1,0,Agatha Christie,2017-11-25,"Ten strangers, apparently with little in commo...",9789352770250,304.0,Fiction,,,en,250-349
1,36315374,Jar of Hearts,Jennifer Hillier,"Hillier, Jennifer",,"=""1250154197""","=""9781250154194""",4,4.12,Minotaur Books,Hardcover,311.0,2018,2018.0,,2023/07/01,,,read,,,,1,0,Jennifer Hillier,2018-06-12,Nationally Bestselling Author! * Winner - Best...,9781250154217,318.0,Fiction,4.0,40.0,en,250-349
2,58724923,Hidden Pictures,Jason Rekulak,"Rekulak, Jason",,"=""1250819342""","=""9781250819345""",4,4.15,Flatiron Books,Hardcover,372.0,2022,2022.0,,2023/06/30,,,read,,,,1,0,Jason Rekulak,2022-05-10,NATIONAL BESTSELLER · OPTIONED FOR NETFLIX BY ...,9781250819369,365.0,Fiction,,,en,350-449
3,43822820,"The Family Upstairs (The Family Upstairs, #1)",Lisa Jewell,"Jewell, Lisa",,"=""1501190105""","=""9781501190100""",0,3.97,Atria Books,Hardcover,340.0,2019,2019.0,,2023/07/01,to-read,to-read (#154),to-read,,,,0,0,Lisa Jewell,2020-06-02,INSTANT NEW YORK TIMES BESTSELLER A GOOD MORNI...,9781501190117,384.0,Fiction,,,en,250-349
4,59316367,Look Closer,David Ellis,"Ellis, David",,"=""0399170928""","=""9780399170928""",0,4.29,G.P. Putnam's Sons,Hardcover,448.0,2022,2022.0,,2023/07/01,to-read,to-read (#153),to-read,,,,0,0,David Ellis,2022-07-05,"“Suspenseful, sexy, involving, twisty and twis...",9780698161993,464.0,Fiction,4.0,9.0,en,350-449


In [103]:
# Create year and quarter read variable 

#  Impute data_added where date_read  is na
mybooks['Date_Read'] = np.where(mybooks['Date_Read'].isnull() & mybooks['Read_Count']==1, mybooks['Date_Added'], mybooks['Date_Read'])

# Convert 'Date_Read' column to datetime type
mybooks['Date_Read'] = pd.to_datetime(mybooks['Date_Read'], format='mixed')

# Extract year and quarter from 'Date_Read' column
mybooks['Year'] = mybooks['Date_Read'].dt.year
mybooks['Quarter'] = mybooks['Date_Read'].dt.quarter

# Create a new column combining year and quarter
mybooks['Year_Quarter'] = np.where(mybooks['Date_Read'].notnull(), mybooks['Year'].astype(str) + '-Q' + mybooks['Quarter'].astype(str), np.nan)
# Replace '.0' in the Year_Quarter column with an empty string
mybooks['Year_Quarter'] = mybooks['Year_Quarter'].fillna('').str.replace('.0', '')

# Convert Year_Quarter to categorical variable
mybooks['Year_Quarter'] = pd.Categorical(mybooks['Year_Quarter'], ordered=True)

In [106]:
# filter na in publication year and make column publication year integer 
mybooks = mybooks.dropna(subset=['Original_Publication_Year'])
mybooks['Original_Publication_Year'] = mybooks['Original_Publication_Year'].astype(int)

In [107]:
# Making sure all na is set as np.nan and not as a string variable (had this issue with one variable)
import numpy as np
mybooks = mybooks.replace('nan', np.nan)
mybooks = mybooks.replace('NaN', np.nan)

In [108]:
# # filer only books I have read
myreads = mybooks.query("Read_Count == 1")
to_read = mybooks.query("Exclusive_Shelf == 'to-read'")

In [109]:
myreads['Date_Read'] = myreads['Date_Read'].fillna(myreads['Date_Added']).copy()
myreads = myreads.sort_values(by='Date_Read')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [78]:
# creating dictionaries for read and want-to-read books
my_read_topics = {k: v for k, v in my_topics.items() if k in myreads.Title.to_list()}
my_want_titles = list(set(list(my_topics.keys())) - set(myreads.Title.to_list()))
my_want_topics = {k: v for k, v in my_topics.items() if k in my_want_titles}

# Topics visualization 

In [79]:
from apps.viz import tree_topics

In [80]:
fig = tree_topics(my_read_topics)
fig.show()

In [81]:
fig = tree_topics(my_want_topics)
fig.show()

# vizualising publication year

In [82]:
from apps.viz import viz_pub_year
fig = viz_pub_year(myreads)
fig.show()

# Vizualising timeline of books read. 

In [110]:
from apps.viz import viz_year_read
viz_year_read(myreads).show()

# Number of pages and read count

Visualising the most read page count of books

In [111]:
from apps.viz import visualize_page_categories

# Assuming you have a DataFrame 'myreads' with 'Page_Cat' column

visualize_page_categories(myreads, 'Page_Cat')



# Vizualise top categories and languages
I am using a pie chart here because I excpect few values in each variable and a large discrepancy.

In [85]:
from apps.viz import viz_top_values

In [86]:
viz_top_values(mybooks['Language'], top_n=5)

In [87]:
viz_top_values(mybooks['Categories'], top_n=5)

## Rating visualised 

In [147]:
def plot_book_ratings(data):
    # Filter the data where My_Rating > 0 since this would include non rated books

    # Sort the filtered data by your own rating in descending order
    sorted_data = (
        data.query('My_Rating > 0')
        .drop_duplicates(subset=['Title', 'Author'])
        .sort_values(['My_Rating', 'Year'], ascending=False)
    )
    # Select the top 10 and bottom 10 books based on your own rating
    top_books = sorted_data.head(10)
    bottom_books = sorted_data.tail(10)

    # Create the figure object with subplots
    fig = make_subplots(rows=2, cols=1, subplot_titles=("My highest Rated Books", "My lowest Rated Books"))

    # Define the Pastel1 color scheme
    pastel_colors = plotly.colors.qualitative.Pastel1

    # Add traces for top rated books
    fig.add_trace(go.Bar(
        y=top_books['Title'],
        x=top_books['My_Rating'],
        name='My Rating',
        orientation='h',
        marker=dict(color=pastel_colors[3]),
        legendgroup='My Rating'
    ), row=1, col=1)

    fig.add_trace(go.Bar(
        y=top_books['Title'],
        x=top_books['Average_Rating_GoogleBooks'],
        name='Average Rating (Google Books)',
        orientation='h',
        marker=dict(color=pastel_colors[4]),
        legendgroup='Average Rating (Google Books)'
    ), row=1, col=1)

    fig.add_trace(go.Bar(
        y=top_books['Title'],
        x=top_books['Average_Rating_Goodreads'],
        name='Average Rating (Goodreads)',
        orientation='h',
        marker=dict(color=pastel_colors[5]),
        legendgroup='Average Rating (Goodreads)'
    ), row=1, col=1)

    # Add traces for bottom rated books
    fig.add_trace(go.Bar(
        y=bottom_books['Title'],
        x=bottom_books['My_Rating'],
        name='My Rating',
        orientation='h',
        marker=dict(color=pastel_colors[3]),
        legendgroup='My Rating',
        showlegend=False,
    ), row=2, col=1)

    fig.add_trace(go.Bar(
        y=bottom_books['Title'],
        x=bottom_books['Average_Rating_GoogleBooks'],
        name=' ',
        orientation='h',
        marker=dict(color=pastel_colors[4]),
        legendgroup='Average Rating (Google Books)',
        showlegend=False,
    ), row=2, col=1)

    fig.add_trace(go.Bar(
        y=bottom_books['Title'],
        x=bottom_books['Average_Rating_Goodreads'],
        name='Average Rating (Goodreads)',
        orientation='h',
        marker=dict(color=pastel_colors[5]),
        legendgroup='Average Rating (Goodreads)',
        showlegend=False,
    ), row=2, col=1)

    # Update the layout
    fig.update_layout(
        title='Book Ratings<span style="font-size: 10px;"><br>Showing the latest read books</span>',
        showlegend=True,
        height=900,
        width=800,
        plot_bgcolor='rgba(255, 255, 255, 1)',
        paper_bgcolor='rgba(255, 255, 255, 1)',
        yaxis=dict(title='Title', side='top', showticklabels=True),
        xaxis=dict(title='Rating'),
        barmode='group',
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.03,
            xanchor="right",
            x=1.3
            )
    )
    fig.show()
    print(top_books)

In [148]:
# from apps.viz import plot_book_ratings

plot_book_ratings(myreads)


      Book_Id                                              Title   
90   53799686                                     Anxious People  \
64   38357895                            Convenience Store Woman   
54    6443834                             1Q84 Book 1 (1Q84, #1)   
53    6449422                                    1Q84 (1Q84, #2)   
147   7005479                             Island Beneath the Sea   
237  40961230                      The Travelling Cat Chronicles   
210  50224049                            How to Kill Your Family   
205  22351151  The Accidental Alchemist (An Accidental Alchem...   
202  36315977                   Shadow and Bone (The Grisha, #1)   
158      9328                           The House of the Spirits   

              Author        Author_l-f     Additional_Authors ISBN_Goodreads   
90   Fredrik Backman  Backman, Fredrik           Neil   Smith  ="1982121602"  \
64     Sayaka Murata    Murata, Sayaka  Ginny Tapley Takemori            =""   
54   Haruki

In [90]:


from apps.viz import create_rating_table

# Call the function to create the rating table
rating_table = create_rating_table(myreads)

# Display the table
rating_table.show()


In [91]:
from apps.viz import create_author_table
# Call the function to create the author table
author_table = create_author_table(myreads)

# Display the table
author_table.show()

# This year in books

In [92]:
top_tbl = 

SyntaxError: invalid syntax (1222766511.py, line 1)

In [None]:
myreads.head(1)

Unnamed: 0,Book_Id,Title,Author,Author_l-f,Additional_Authors,ISBN_Goodreads,ISBN13,My_Rating,Average_Rating_Goodreads,Publisher,Binding,Number_of_Pages,Year_Published,Original_Publication_Year,Date_Read,Date_Added,Bookshelves,Bookshelves_with_positions,Exclusive_Shelf,My_Review,Spoiler,Private_Notes,Read_Count,Owned_Copies,Author(s),Publish_Date,Description,ISBN_GoogleBooks,Page_Count,Categories,Average_Rating_GoogleBooks,Rating_Count,Language,Page_Cat,Year,Quarter,Year_Quarter
0,6449422,"1Q84 (1Q84, #2)",Haruki Murakami,"Murakami, Haruki",,"=""4103534230""","=""9784103534235""",5,4.1,Shinchosha/Tsai Fong Books,Hardcover,397.0,2009,2009.0,NaT,2023/04/28,,,read,,,,1,0,"Haruki Murakami, Lica Hashimoto",,"Nesse segundo volume, duas histórias em parale...",8579622050,376.0,Japanese fiction,,,pt-BR,350-449,,,nan-Qnan


In [None]:
myreads.query()