In [9]:
import pandas as pd
import altair as alt
from altair import datum

# Load the dataset from a CSV file expected in the current working directory.
# If the file is missing, report it and stop with a failing exit status.
file_path = 'Air_Quality_Data_2012_2024.csv'
try:
    # Keep only the call that can raise FileNotFoundError inside the try body.
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please ensure it's in the correct directory.")
    # The bare `exit()` helper is injected by the `site` module for
    # interactive use and, called without arguments, exits with status 0.
    # Raising SystemExit directly is always available and signals failure.
    raise SystemExit(1) from None
else:
    print("Dataset loaded successfully.")
    print(f"Initial number of rows: {len(df)}")

# Data pre-processing

# Parse the 'Date' column into datetime values so the .dt accessor is usable.
df['Date'] = pd.to_datetime(df['Date'])
print("Converted 'Date' column to datetime.")

# Temporal features derived from 'Date', added in this column order.
date_accessor = df['Date'].dt
temporal_features = {
    'Year': date_accessor.year,
    'Month': date_accessor.month,
    'DayOfWeek': date_accessor.dayofweek,
    'DayName': date_accessor.day_name(),
}
for feature_name, feature_values in temporal_features.items():
    df[feature_name] = feature_values

# Map numerical AQI values to the standard EPA category names.
def get_aqi_category(aqi_value):
    """Return the AQI category name for ``aqi_value``.

    Categories are selected by inclusive upper bound (50, 100, 150, 200,
    300), so fractional readings such as 50.5 fall into the next category
    instead of slipping through the gaps between integer breakpoints.
    Integer inputs map exactly as before.

    Args:
        aqi_value: Numeric AQI reading.

    Returns:
        'Good', 'Moderate', 'Unhealthy for Sensitive Groups', 'Unhealthy',
        'Very Unhealthy' or 'Hazardous'; 'Invalid' for negative values and
        NaN.
    """
    # NaN fails every comparison, so this guard rejects it together with
    # negative readings.
    if not aqi_value >= 0:
        return 'Invalid'

    upper_bounds = (
        (50, 'Good'),
        (100, 'Moderate'),
        (150, 'Unhealthy for Sensitive Groups'),
        (200, 'Unhealthy'),
        (300, 'Very Unhealthy'),
    )
    for upper_bound, category in upper_bounds:
        if aqi_value <= upper_bound:
            return category
    return 'Hazardous'
# Classify every AQI reading and store the labels in a new column.
aqi_category_labels = df['Aqi'].apply(get_aqi_category)
df['AQI_Category'] = aqi_category_labels
print("Created 'AQI_Category' column.")

# AQI categories paired with their display colors, listed in the order
# they should appear when plotted.
_aqi_category_palette = {
    'Good': '#00E400',
    'Moderate': '#FFFF00',
    'Unhealthy for Sensitive Groups': '#FF7E00',
    'Unhealthy': '#FF0000',
    'Very Unhealthy': '#8F3F97',
    'Hazardous': '#7E0023',
}

# Category order for consistent plotting.
aqi_category_order = list(_aqi_category_palette)

# Color scale whose domain follows the category order above.
aqi_category_colors = alt.Scale(
    domain=aqi_category_order,
    range=list(_aqi_category_palette.values()),
)


# Filter the dataset to a manageable size for Altair:
# Los Angeles, New York, and Chicago for the selected month (December 2024).
selected_cities = ['Los Angeles', 'New York', 'Chicago']
selected_year = 2024
selected_month = 12

city_mask = df['City'].isin(selected_cities)
period_mask = (df['Year'] == selected_year) & (df['Month'] == selected_month)

# Keep only matching rows that have an 'Aqi' value; copy so later changes
# to df_filtered never touch df.
df_filtered = df[city_mask & period_mask].dropna(subset=['Aqi']).copy()

# Derive the month label from selected_month so the message stays correct
# when a different month is selected.
selected_month_name = pd.Timestamp(
    year=selected_year, month=selected_month, day=1
).month_name()

print(f"Filtered dataset to cities: {selected_cities} for {selected_month_name} {selected_year}")
print(f"Number of rows in the filtered dataset: {len(df_filtered)}")
print(df_filtered.head())
# DataFrame.info() prints its own summary and returns None, so wrapping it
# in print() only appended a stray "None" line.
df_filtered.info()


# 3. Altair visualization: compare overall air quality (AQI) across cities over time

# Interval selection limited to the x encoding and named 'dateBrush'.
# It is attached to the context chart and used to filter the main chart.
date_selection_brush = alt.selection_interval(name='dateBrush', encodings=['x'])


# Shared base for the main plots: date on the x axis plus a tooltip with
# date, city, AQI value and AQI category.
# The brush selection is deliberately not attached here; it is added to the
# context chart and applied to the main chart as a filter.

# Derive the month label from selected_month so the title stays correct
# when a different month is selected.
chart_month_name = pd.Timestamp(
    year=selected_year, month=selected_month, day=1
).month_name()

base_main = alt.Chart(df_filtered).encode(
    x=alt.X('Date:T', title='Date'),
    tooltip=[
        alt.Tooltip('Date:T', title='Date'),
        alt.Tooltip('City:N', title='City'),
        alt.Tooltip('Aqi:Q', title='AQI'),
        alt.Tooltip('AQI_Category:N', title='AQI Category'),
    ]
).properties(
    title=f'Daily Air Quality Index (AQI) Trends by City ({chart_month_name} {selected_year})',
)

# Main chart: one AQI trend line per city, with each city in its own row
# for readability.
aqi_y_encoding = alt.Y(
    'Aqi:Q',
    title='Air Quality Index (AQI)',
    scale=alt.Scale(zero=False),
)
city_color_encoding = alt.Color('City:N', title='City')
city_row_encoding = alt.Row(
    'City:N',
    title='City',
    header=alt.Header(titleOrient="bottom", labelOrient="bottom"),
)

line_chart = (
    base_main
    .mark_line(point=False)
    .encode(
        y=aqi_y_encoding,
        color=city_color_encoding,
        row=city_row_encoding,
    )
    .properties(width=600, height=200)
)

# Context chart: a short overview strip that carries the date brush.
# Dragging across it selects the date range used to filter the main chart.
# NOTE(review): the y title 'Average AQI' is presumably never rendered because
# the axis is disabled, and the data is not aggregated here — confirm intent.
# NOTE(review): resolve_scale(y='independent') on this single chart may have no
# visible effect — confirm before removing.
context_x_encoding = alt.X(
    'Date:T',
    title='Drag to select a date range (Daily)',
    axis=alt.Axis(format="%d"),
)
context_y_encoding = alt.Y('Aqi:Q', title='Average AQI', axis=None)
context_color_encoding = alt.Color('City:N', title='City')

context_chart = (
    alt.Chart(df_filtered)
    .mark_line()
    .encode(
        x=context_x_encoding,
        y=context_y_encoding,
        color=context_color_encoding,
    )
    .add_params(date_selection_brush)
    .properties(height=80, width=600)
    .resolve_scale(y='independent')
)

# Stack the main chart above the context chart.
# The main chart shows only the dates covered by the 'dateBrush' selection
# made on the context chart.
# NOTE(review): .interactive() adds pan/zoom to the combined chart; confirm it
# does not compete with the drag-to-brush gesture on the context chart.
brushed_line_chart = line_chart.transform_filter(date_selection_brush)
stacked_charts = alt.vconcat(brushed_line_chart, context_chart)
final_chart = stacked_charts.resolve_scale(y='shared').interactive()


# Display the chart (bare expression so the notebook renders it)
final_chart


Dataset loaded successfully.
Initial number of rows: 23745
Converted 'Date' column to datetime.
Created 'AQI_Category' column.
Filtered dataset to cities: ['Los Angeles', 'New York', 'Chicago'] for December 2024
Number of rows in the filtered dataset: 93
       Pm2.5    Pm10    No2    So2    Co  Aqi       Date      City  Year  \
4718   70.15  198.75  33.99  40.27  5.70  124 2024-12-01  New York  2024   
4719  146.72   38.11  87.58  32.76  2.42  282 2024-12-02  New York  2024   
4720  142.48  221.12  13.54  25.87  8.75  382 2024-12-03  New York  2024   
4721   67.10  228.92  12.94   2.79  7.12  220 2024-12-04  New York  2024   
4722   62.33  248.32  60.08   2.90  7.93   22 2024-12-05  New York  2024   

      Month  DayOfWeek    DayName                    AQI_Category  
4718     12          6     Sunday  Unhealthy for Sensitive Groups  
4719     12          0     Monday                  Very Unhealthy  
4720     12          1    Tuesday                       Hazardous  
4721     12     