In [2]:
import pandas as pd
import altair as alt

# 1. Loading the Data
Load the dataset from its public GitHub URL into a pandas DataFrame and preview the first rows.

In [3]:
# Location of the top-500 novels CSV (raw file served from GitHub)
url = "https://raw.githubusercontent.com/melaniewalsh/responsible-datasets-in-context/main/datasets/top-500-novels/library_top_500.csv"

# Download and parse the CSV into the raw DataFrame used by the rest of the notebook
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows as the cell output
df.head(n=5)

Unnamed: 0,top_500_rank,title,author,pub_year,orig_lang,genre,author_birth,author_death,author_gender,author_primary_lang,...,gr_num_ratings,gr_num_reviews,gr_avg_rating_rank,gr_num_ratings_rank,oclc_owi,author_viaf,gr_url,wiki_url,pg_eng_url,pg_orig_url
0,1,Don Quixote,Miguel de Cervantes,1605,Spanish,action,1547,1616,male,spa,...,269435,12053,318,211,1810748000.0,17220427,https://www.goodreads.com/book/show/3836.Don_Q...,https://en.wikipedia.org/wiki/Don_Quixote,https://www.gutenberg.org/cache/epub/996/pg996...,https://www.gutenberg.org/cache/epub/2000/pg20...
1,2,Alice's Adventures in Wonderland,Lewis Carroll,1865,English,fantasy,1832,1898,male,eng,...,561016,15380,172,133,11561320000.0,66462036,https://www.goodreads.com/book/show/24213.Alic...,https://en.wikipedia.org/wiki/Alice%27s_Advent...,https://www.gutenberg.org/cache/epub/11/pg11.txt,
2,3,The Adventures of Huckleberry Finn,Mark Twain,1884,English,action,1835,1910,male,eng,...,1262480,19440,373,68,3373178000.0,50566653,https://www.goodreads.com/book/show/2956.The_A...,https://en.wikipedia.org/wiki/Adventures_of_Hu...,https://www.gutenberg.org/cache/epub/76/pg76.txt,
3,4,The Adventures of Tom Sawyer,Mark Twain,1876,English,action,1835,1910,male,eng,...,931898,13603,301,88,3373178000.0,50566653,https://www.goodreads.com/book/show/24583.The_...,https://en.wikipedia.org/wiki/The_Adventures_o...,https://www.gutenberg.org/cache/epub/74/pg74.txt,
4,5,Treasure Island,Robert Louis Stevenson,1883,English,action,1850,1894,male,eng,...,486155,16307,368,145,3434.0,95207986,https://www.goodreads.com/book/show/295.Treasu...,https://en.wikipedia.org/wiki/Treasure_Island,https://www.gutenberg.org/cache/epub/120/pg120...,


# 2. Data Cleaning and Preprocessing
Handle missing values (drop columns with high missing rates) and correct data types where needed.

In [7]:
# Drop columns with high missing rates and irrelevant columns.
# `drop` returns a new frame, so the raw `df` stays untouched and this cell
# can be re-run safely.
df_cleaned = df.drop(columns=['author_field_of_activity', 'pg_orig_url'])

# Convert the Goodreads count columns to floats.
# The values may arrive as text with thousands separators (e.g. "1,262,480").
# Casting to `str` first keeps the cell working even when pandas has already
# parsed a column as numeric (the `.str` accessor raises on non-string columns),
# `regex=False` makes the comma a literal match, and `errors='coerce'` turns
# unparseable entries into NaN -- the same policy used for `pub_year` below.
for count_col in ['gr_num_ratings', 'gr_num_reviews']:
    df_cleaned[count_col] = pd.to_numeric(
        df_cleaned[count_col].astype(str).str.replace(',', '', regex=False),
        errors='coerce',
    ).astype(float)

# Convert `pub_year` to numeric type; values that cannot be parsed become NaN
df_cleaned['pub_year'] = pd.to_numeric(df_cleaned['pub_year'], errors='coerce')

# 3. Distribution of Novels Over Time

In [8]:
# Count how many novels share each publication year
# (groupby drops rows whose `pub_year` is NaN by default)
yearly_counts = df_cleaned.groupby('pub_year').size().reset_index(name='Count')

# Altair line chart for number of novels published by year.
# The year is encoded as quantitative (:Q) rather than ordinal (:O): an ordinal
# axis places every year that happens to be present at equal spacing, so a gap
# of two centuries looks the same as a gap of one year and the trend is distorted.
chart_pub_year = alt.Chart(yearly_counts).mark_line().encode(
    x=alt.X(
        'pub_year:Q',
        title='Year Published',
        axis=alt.Axis(format='d'),    # plain integer ticks (1865, not 1,865)
        scale=alt.Scale(zero=False),  # do not stretch the axis back to year 0
    ),
    y=alt.Y('Count:Q', title='Number of Novels Published')
).properties(
    title='Number of Novels Published Over Time'
)
chart_pub_year.display()


# 4. Top Authors by Novel Count

In [9]:
# Tally novels per author and keep the ten most frequent,
# producing a two-column frame: author, Count
top_authors = (
    df_cleaned['author']
    .value_counts()
    .head(10)
    .rename_axis('author')
    .reset_index(name='Count')
)

# Horizontal bar chart; authors are ordered by descending bar length
chart_top_authors = (
    alt.Chart(top_authors)
    .mark_bar()
    .encode(
        alt.X('Count:Q', title='Number of Novels'),
        alt.Y('author:N', sort='-x', title='Author'),
    )
    .properties(title='Top 10 Authors by Novel Count')
)
chart_top_authors.display()


# 5. Language Distribution

In [10]:
# Number of novels per original language, as a two-column frame: orig_lang, Count
language_counts = (
    df_cleaned['orig_lang']
    .value_counts()
    .rename_axis('orig_lang')
    .reset_index(name='Count')
)

# Vertical bar chart with one bar per original language
chart_language = (
    alt.Chart(language_counts)
    .mark_bar()
    .encode(
        alt.X('orig_lang:N', title='Original Language'),
        alt.Y('Count:Q', title='Number of Novels'),
    )
    .properties(title='Distribution of Novels by Language')
)
chart_language.display()


# 6. Genre Distribution

In [12]:
# Number of novels per genre, skipping rows whose genre is the literal string 'na'
genre_counts = (
    df_cleaned.loc[df_cleaned['genre'].ne('na'), 'genre']
    .value_counts()
    .rename_axis('genre')
    .reset_index(name='Count')
)

# Vertical bar chart; genres are ordered by descending count
chart_genre = (
    alt.Chart(genre_counts)
    .mark_bar()
    .encode(
        alt.X('genre:N', sort='-y', title='Genre'),
        alt.Y('Count:Q', title='Number of Novels'),
    )
    .properties(title='Distribution of Novels by Genre')
)
chart_genre.display()


# 7. Average Ratings Analysis

In [13]:
# Histogram of `gr_avg_rating`: Altair bins the values (at most 20 bins)
# and counts the rows falling into each bin
chart_avg_rating = (
    alt.Chart(df_cleaned)
    .mark_bar()
    .encode(
        alt.X('gr_avg_rating:Q', title='Average Rating', bin=alt.Bin(maxbins=20)),
        alt.Y('count()', title='Number of Novels'),
    )
    .properties(title='Distribution of Average Ratings')
)
chart_avg_rating.display()


# 8. Genre Popularity Over Time

In [16]:
# Novels per (publication year, genre) pair, skipping rows whose genre is the
# literal string 'na'
genre_year_counts = (
    df_cleaned[df_cleaned['genre'] != 'na']
    .groupby(['pub_year', 'genre'])
    .size()
    .reset_index(name='Count')
)

# Area chart of the per-year counts, one colour per genre
chart_genre_trend = (
    alt.Chart(genre_year_counts)
    .mark_area()
    .encode(
        alt.X('pub_year:O', title='Year Published'),
        alt.Y('Count:Q', title='Number of Novels'),
        alt.Color('genre:N', title='Genre'),
    )
    .properties(title='Trends in Genre Popularity Over Time')
)
chart_genre_trend.display()


# 9. Relationship Between Ratings and Number of Ratings

In [15]:
# Scatter plot: one circle per novel, number of ratings (x) against average rating (y).
# Hovering a point shows the fields listed in the tooltip.
chart_rating_correlation = (
    alt.Chart(df_cleaned)
    .mark_circle(size=60)
    .encode(
        alt.X('gr_num_ratings:Q', title='Number of Ratings'),
        alt.Y('gr_avg_rating:Q', title='Average Rating'),
        tooltip=['title', 'author', 'gr_avg_rating', 'gr_num_ratings'],
    )
    .properties(title='Relationship Between Average Rating and Number of Ratings')
    .interactive()  # Add interactivity for deeper exploration
)
chart_rating_correlation.display()
