In [143]:
"""
Name: Jose Juan Gonzalez
Library: Altair
URL: https://altair-viz.github.io/
Description:
This library is a declarative visualization library for Python that allows the user to 
create clean, interactive, and reproducible visualizations with minimal boilerplate.
"""

'\nName: Jose Juan Gonzalez\nLibrary: Altair\nURL: https://altair-viz.github.io/\nDescription:\nThis library is a declarative visualization library for Python that allows the user to \ncreate clean, interactive, and reproducible visualizations with minimal boilerplate.\n'

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import altair as alt
from vega_datasets import data

In [88]:
# Load data
def load_data(path):
    """Read a CSV file into a DataFrame with normalized column names.

    Column labels are stripped of surrounding whitespace, lower-cased, have
    spaces replaced by underscores, and have literal dots removed. The
    ``date`` column is then parsed with ``pd.to_datetime``.

    Parameters
    ----------
    path
        Anything ``pd.read_csv`` accepts (file path or file-like object).

    Returns
    -------
    pandas.DataFrame
        The loaded data with cleaned column names and a parsed ``date`` column.

    Raises
    ------
    KeyError
        If no column is named ``date`` after normalization.
    """
    frame = pd.read_csv(path)

    # Normalize the header one step at a time.
    tidy_names = frame.columns.str.strip().str.lower()
    tidy_names = tidy_names.str.replace(' ', '_')
    tidy_names = tidy_names.str.replace('.', '', regex=False)  # literal dot, not a regex wildcard
    frame.columns = tidy_names

    frame['date'] = pd.to_datetime(frame['date'])
    return frame

# load_data lower-cases the column names, strips leading/trailing spaces,
# replaces inner spaces with underscores, and removes dots.
# Consistent names prevent errors later when filtering, grouping, etc.

# Try it out: the CSV is referenced by a relative path, so it must sit in the
# same folder the notebook runs from.
file_path = "daily_aqi_by_county_2025.csv"
df = load_data(file_path)
# A bare expression on the last line of the cell renders the DataFrame as output.
df

Unnamed: 0,state_name,county_name,state_code,county_code,date,aqi,category,defining_parameter,defining_site,number_of_sites_reporting
0,Alabama,Baldwin,1,3,2025-01-01,20,Good,PM2.5,01-003-0010,1
1,Alabama,Baldwin,1,3,2025-01-02,37,Good,PM2.5,01-003-0010,1
2,Alabama,Baldwin,1,3,2025-01-03,52,Moderate,PM2.5,01-003-0010,1
3,Alabama,Baldwin,1,3,2025-01-04,31,Good,PM2.5,01-003-0010,1
4,Alabama,Baldwin,1,3,2025-01-05,31,Good,PM2.5,01-003-0010,1
...,...,...,...,...,...,...,...,...,...,...
105864,Wyoming,Washakie,56,43,2025-06-26,25,Good,PM2.5,56-043-0002,1
105865,Wyoming,Washakie,56,43,2025-06-27,18,Good,PM2.5,56-043-0002,1
105866,Wyoming,Washakie,56,43,2025-06-28,20,Good,PM2.5,56-043-0002,1
105867,Wyoming,Washakie,56,43,2025-06-29,19,Good,PM2.5,56-043-0002,1


In [19]:
# Show the sorted list of distinct states in the dataframe, then prompt for one.
print("Available states:", sorted(df['state_name'].unique()))
# NOTE(review): input() blocks "Run All" until answered, and selected_state is not
# referenced by any later cell visible in this notebook -- confirm it is still needed.
selected_state = input("Select a state (or type 'All'): ")

Available states: ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Country Of Mexico', 'Delaware', 'District Of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']


Select a state (or type 'All'):  


In [25]:
# Summary metrics: describe() reports count, mean, min, quartiles, max and std per column.
df.describe()

Unnamed: 0,state_code,county_code,date,aqi,number_of_sites_reporting
count,105869.0,105869.0,105869,105869.0,105869.0
mean,29.594792,83.125362,2025-03-11 12:07:30.896862720,40.863756,1.889958
min,1.0,1.0,2025-01-01 00:00:00,0.0,1.0
25%,17.0,23.0,2025-02-03 00:00:00,31.0,1.0
50%,29.0,59.0,2025-03-07 00:00:00,40.0,1.0
75%,42.0,111.0,2025-04-10 00:00:00,49.0,2.0
max,80.0,810.0,2025-07-02 00:00:00,2122.0,32.0
std,15.909772,99.621208,,19.641668,2.351933


In [134]:
# Average AQI by state
state_aqi = df.groupby('state_name', as_index=False)['aqi'].mean()

# Encoding shorthand: the suffix after the colon tells Altair the data type --
#   :Q quantitative (numbers), :N nominal (unordered names/categories),
#   :O ordinal (ordered categories), :T temporal (dates).
bar_chart = alt.Chart(state_aqi).mark_bar().encode(  # Chart(...) binds the data, mark_bar() picks the chart type
    # encode() maps data columns to visual properties
    x=alt.X('aqi:Q', title='Average AQI'),
    y=alt.Y('state_name:N', sort='-x', title='State'),  # sort='-x': order bars by descending x value
    tooltip=['state_name', 'aqi'],  # fields shown when hovering over a bar
).properties(
    height=500,  # chart size; width can be set the same way
)

bar_chart

In [124]:
# Top 15 most polluted counties (highest mean AQI)

county_aqi = df.groupby(['state_name', 'county_name'], as_index=False)['aqi'].mean()

# nlargest keeps the 15 highest averages; swapping in nsmallest would show the
# cleanest counties instead.
top15 = county_aqi.nlargest(15, 'aqi')

# County names repeat across states (the averages are per state+county pair).
# Encoding the bare county name would merge same-named counties into a single
# stacked bar, so label each bar as "County, State" to keep the rows distinct.
top15 = top15.assign(county_label=top15['county_name'] + ', ' + top15['state_name'])

top_chart = (
    alt.Chart(top15)
    .mark_bar(color='red')
    .encode(
        x=alt.X('aqi:Q', title='Average AQI'),
        y=alt.Y('county_label:N', sort='-x', title='County'),  # bars ordered by descending AQI
        tooltip=['state_name', 'county_name', 'aqi']
    )
    .properties(height=400)
)

top_chart

In [98]:
# AQI distribution histogram
# Drop AQI values above 200 so outliers do not stretch the bins.
filtered200_df = df[df['aqi'] <= 200]

# Altair struggles with more than 5000 rows, so draw a random sample of 4999
# rows (fixed seed for reproducibility) whenever the filtered frame is larger.
filtered200_sample = (
    filtered200_df.sample(4999, random_state=42)
    if len(filtered200_df) > 4999
    else filtered200_df
)

# Horizontal histogram: AQI is binned on the y axis, record counts run along x.
hist = (
    alt.Chart(filtered200_sample)
    .mark_bar()
    .encode(
        y=alt.Y('aqi:Q', bin=alt.Bin(maxbins=30), title='AQI'),
        x=alt.X('count()', title='Number of Records'),
        tooltip=['count()'],
    )
    .properties(
        title='AQI Distribution (Filtered AQI ≤ 200, Sampled 4999 rows)',
        width=700,
        height=500,
    )
)

hist

In [136]:
# Scatterplot with regression line
# Filter extreme AQI values for readability
filtered_df = df[df['aqi'] <= 800]

# Sample 4999 rows if dataset is larger (Altair row limit)
if len(filtered_df) > 4999:
    scatter_df = filtered_df.sample(4999, random_state=42)
else:
    scatter_df = filtered_df.copy()

# Shared x/y encodings and chart properties for both layers.
# The regression layer must NOT inherit the color/tooltip encodings: the
# regression transform only outputs the two regressed fields, so an inherited
# 'category' color encoding would point at a missing field, override the
# line's color='red', and add a null entry to the legend.
scatter_base = alt.Chart(scatter_df).encode(
    x=alt.X('number_of_sites_reporting:Q', title='Number of Sites Reporting'),
    y=alt.Y('aqi:Q', title='Air Quality Index', scale=alt.Scale(domain=[0, 300])),  # fixed scale keeps the chart readable
).properties(
    title='Correlation Between Number of Sites Reporting and AQI',
    width=700,
    height=500
)

# Scatter layer. clip=True hides points whose AQI lies above the 300 axis limit
# (the filter above keeps values up to 800), instead of drawing them outside
# the plot area.
scatter = scatter_base.mark_circle(size=60, opacity=0.5, clip=True).encode(
    color=alt.Color('category:N', title='AQI Category'),  # one color per AQI category value
    tooltip=['state_name', 'county_name', 'aqi', 'number_of_sites_reporting']
)

# Linear regression of AQI on number of sites, fitted on the sampled rows
regression = scatter_base.transform_regression(
    'number_of_sites_reporting', 'aqi', method='linear'
).mark_line(color='red')

# Display the chart (scatter + regression)
scatter + regression

In [112]:
# Boxplot by Category
# Sample 4999 rows (fixed seed) when the frame is larger, to stay under
# Altair's row limit.
if len(df) > 4999:
    box_df = df.sample(4999, random_state=42)
else:
    box_df = df

# Order from worst to best AQI. All six EPA categories are listed: values
# missing from a sort list are placed after the listed ones, which would put
# 'Very Unhealthy' / 'Hazardous' to the right of 'Good'.
category_order = [
    'Hazardous',
    'Very Unhealthy',
    'Unhealthy',
    'Unhealthy for Sensitive Groups',
    'Moderate',
    'Good'
]

# The chart background might be a little too tall for the box plot
box = (
    alt.Chart(box_df)
    .mark_boxplot(clip=True)  # clip marks that fall above the fixed 0-300 axis
    .encode(
        x=alt.X('category:N', sort=category_order, title='AQI Category'),
        y=alt.Y('aqi:Q', title='AQI', scale=alt.Scale(domain=[0, 300])),
        color='category:N'
    ).properties(width=700)
)
box

In [69]:
# Setting up Choropleth Maps
# This uses the Vega Datasets US TopoJSON, as shown in the Altair docs for this type of map.

# FIPS is the code convention for states and counties: 2-digit state code + 3-digit county code.
df['fips'] = df['state_code'].astype(str).str.zfill(2) + df['county_code'].astype(str).str.zfill(3)

# Mean AQI per county, keyed by 'id' for the map lookup.
# The county ids in the us_10m TopoJSON are numbers (e.g. 1003), so the
# zero-padded string '01003' would never match and every county in a state
# with FIPS 01-09 would be left without a value. Cast the key to int.
county_avg = (
    df.groupby('fips', as_index=False)['aqi'].mean()
    .rename(columns={'fips': 'id'})
    .assign(id=lambda frame: frame['id'].astype(int))
)

# TopoJSON gives the geometry for states and counties; the id in the dataset must match the id in TopoJSON.
states = alt.topo_feature(data.us_10m.url, 'states')
counties = alt.topo_feature(data.us_10m.url, 'counties')

In [71]:
# State-level averages: mean AQI per (state name, state code) pair, with the
# columns renamed to State / id / aqi for use in the state map lookup.
state_avg = (
    df.groupby(['state_name', 'state_code'], as_index=False)['aqi']
    .mean()
    .rename(columns={'state_name': 'State', 'state_code': 'id'})
)

In [120]:
# Average AQI by state (choropleth)
# One of the coolest features from Altair
state_map = (
    alt.Chart(states)
    .mark_geoshape()
    # transform_lookup() merges the AQI data into the geographic features by matching 'id'
    .transform_lookup(
        lookup='id',
        from_=alt.LookupData(state_avg, key='id', fields=['State', 'aqi']),
    )
    .encode(
        # the color scale maps the AQI value to color intensity
        color=alt.Color('aqi:Q', title='Average AQI', scale=alt.Scale(scheme='yellowgreenblue')),
        tooltip=[alt.Tooltip('State:N'), alt.Tooltip('aqi:Q')],
    )
    .project('albersUsa')
    .properties(
        title='Average AQI by State (2025)',
        width=800,
        height=500,
    )
)

state_map

In [79]:
# Average AQI by county (choropleth)
county_map = (
    alt.Chart(counties)
    .mark_geoshape()
    # attach each county's average AQI to its shape by matching 'id'
    .transform_lookup(
        lookup='id',
        from_=alt.LookupData(county_avg, key='id', fields=['aqi']),
    )
    .encode(
        color=alt.Color('aqi:Q', title='Average AQI', scale=alt.Scale(scheme='yellowgreenblue')),
        tooltip=[alt.Tooltip('id:N', title='FIPS Code'), alt.Tooltip('aqi:Q')],
    )
    .project('albersUsa')
    .properties(
        title='Average AQI by County (2025)',
        width=800,
        height=500,
    )
)

county_map

In [145]:
"""
Built using Streamlit and Altair.
Data Source: [EPA Air Quality Data](https://www.epa.gov/outdoor-air-quality-data).
"""

'\nBuilt using Streamlit and Altair.\nData Source: [EPA Air Quality Data](https://www.epa.gov/outdoor-air-quality-data).\n'