# üåç Air Quality ‚Äî Visualizations
This notebook generates static and interactive visualizations from the pipeline outputs (`data_lake/feature_sets/features.parquet`, `analysis_outputs/trends/*`, `analysis_outputs/forecasts/*`).

Outputs produced by this notebook:
- Static PNGs -> `analysis_outputs/figures/`
- Interactive HTML maps -> `analysis_outputs/maps/`


In [None]:
# Robust file-read helpers for the notebook
from pathlib import Path
import pandas as pd
import traceback

def try_read_parquet(path):
    """
    Return a DataFrame if parquet exists and is readable, otherwise None.
    """
    p = Path(path)
    try:
        if p.exists():
            df = pd.read_parquet(p)
            return df
        return None
    except Exception as e:
        print(f"‚ö†Ô∏è Failed to read parquet {p}: {e}")
        traceback.print_exc(limit=1)
        return None

def try_read_csv(path, parse_dates=None, nrows_preview=0):
    """
    Read CSV if present. If parse_dates is given, inspect file columns first
    and only pass the parse_dates entries that actually exist (avoids ValueError).
    Returns DataFrame or None on missing file / failure.
    - path: Path or str
    - parse_dates: None or list/tuple/str of column(s) to parse as dates
    """
    p = Path(path)
    try:
        if not p.exists():
            return None

        # If parse_dates is provided, check which of those columns exist in the file
        cols_to_parse = None
        if parse_dates:
            # read only header to inspect column names (fast)
            preview = pd.read_csv(p, nrows=nrows_preview)
            available_cols = set(preview.columns.tolist())
            if isinstance(parse_dates, (list, tuple)):
                cols_to_parse = [c for c in parse_dates if c in available_cols]
            else:
                cols_to_parse = [parse_dates] if parse_dates in available_cols else []
        # actually read with guarded parse_dates
        if cols_to_parse:
            return pd.read_csv(p, parse_dates=cols_to_parse)
        else:
            # read without parse_dates to avoid ValueError
            return pd.read_csv(p)
    except Exception as e:
        print(f"‚ö†Ô∏è Failed to read CSV {p}: {e}")
        traceback.print_exc(limit=1)
        return None


In [None]:
# Auto-detected paths and safe loading using the helpers above
from pathlib import Path
cwd = Path.cwd()
if (cwd / 'data_lake').exists():
    project_root = cwd
elif (cwd.parent / 'data_lake').exists():
    project_root = cwd.parent
else:
    # fallback: if PROJECT_ROOT variable already set in notebook use it
    try:
        project_root
    except NameError:
        project_root = cwd  # last resort: assume current working dir
    # we don't raise here to keep notebook non-fatal; prints follow

print('Detected project root:', project_root)

# default paths (adjust if your pipeline writes elsewhere)
features_path = project_root / 'data_lake/feature_sets/features.parquet'
trend_summary_path = project_root / 'analysis_outputs/trends/trend_summary.csv'
forecast_summary_path = project_root / 'analysis_outputs/forecasts/forecast_summary.csv'

# Use the robust helpers
df_features = try_read_parquet(features_path)
trend_summary = try_read_csv(trend_summary_path, parse_dates=['date'])
forecast_summary = try_read_csv(forecast_summary_path, parse_dates=['date'])

print('features:', 'found' if df_features is not None else 'missing')
print('trend_summary:', 'found' if trend_summary is not None else 'missing')
print('forecast_summary:', 'found' if forecast_summary is not None else 'missing')

if df_features is not None:
    print('‚úÖ features shape:', df_features.shape)
    print('Columns:', df_features.columns.tolist())

# If a CSV was read without parse_dates but you want to parse later safely:
if trend_summary is not None and 'date' in trend_summary.columns and not pd.api.types.is_datetime64_any_dtype(trend_summary['date']):
    trend_summary['date'] = pd.to_datetime(trend_summary['date'], errors='coerce')


In [3]:
# Run only if you need interactive libs (uncomment to install)
! pip install plotly folium ipywidgets python-pptx

import os
from pathlib import Path
import json
import warnings
warnings.filterwarnings("ignore")

# Create output folders
FIG_DIR = Path("analysis_outputs/figures")
MAP_DIR = Path("analysis_outputs/maps")
FIG_DIR.mkdir(parents=True, exist_ok=True)
MAP_DIR.mkdir(parents=True, exist_ok=True)

print("Figures to:", FIG_DIR.resolve())
print("Maps to:", MAP_DIR.resolve())


Collecting plotly
  Downloading plotly-6.4.0-py3-none-any.whl.metadata (8.5 kB)
Collecting ipywidgets
  Downloading ipywidgets-8.1.8-py3-none-any.whl.metadata (2.4 kB)
Collecting python-pptx
  Using cached python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting narwhals>=1.15.1 (from plotly)
  Downloading narwhals-2.11.0-py3-none-any.whl.metadata (11 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.15-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.16-py3-none-any.whl.metadata (20 kB)
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Using cached xlsxwriter-3.2.9-py3-none-any.whl.metadata (2.7 kB)
Collecting lxml>=3.1.0 (from python-pptx)
  Using cached lxml-6.0.2-cp311-cp311-win_amd64.whl.metadata (3.7 kB)
Downloading plotly-6.4.0-py3-none-any.whl (9.9 MB)
   ---------------------------------------- 0.0/9.9 MB ? eta -:--:--
   ------------------------

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import folium
from folium.plugins import HeatMap
from datetime import datetime

sns.set(style="whitegrid", context="talk")

def try_read_parquet(path):
    p = Path(path)
    if p.exists():
        return pd.read_parquet(p)
    return None

def try_read_csv(path, parse_dates=None):
    p = Path(path)
    if p.exists():
        return pd.read_csv(p, parse_dates=parse_dates)
    return None

def save_fig(fig, filename, dpi=150):
    out = FIG_DIR / filename
    fig.savefig(out, bbox_inches="tight", dpi=dpi)
    print("Saved:", out)


In [14]:
import os
print("CWD:", os.getcwd())

CWD: c:\Users\HP\Desktop\Capstone Project\analysis


In [15]:
from pathlib import Path

# Detect project root automatically
root = Path.cwd()
if (root / "data_lake").exists():
    project_root = root
elif (root.parent / "data_lake").exists():
    project_root = root.parent
else:
    raise FileNotFoundError("Cannot find project root (no data_lake/ folder found)")

print("Detected project root:", project_root)

# Build paths relative to project root
features_path = project_root / "data_lake/feature_sets/features.parquet"
trend_summary_path = project_root / "analysis_outputs/trends/trend_summary.csv"
forecast_summary_path = project_root / "analysis_outputs/forecasts/forecast_summary.csv"

# Load safely
df_features = try_read_parquet(features_path)
trend_summary = try_read_csv(trend_summary_path, parse_dates=['date'])
forecast_summary = try_read_csv(forecast_summary_path)

print("features:", "found" if df_features is not None else "missing")
print("trend_summary:", "found" if trend_summary is not None else "missing")
print("forecast_summary:", "found" if forecast_summary is not None else "missing")

if df_features is not None:
    print("‚úÖ features shape:", df_features.shape)
    print("Columns:", df_features.columns.tolist())
else:
    print("‚ö†Ô∏è features.parquet not found even under project root:", features_path)


Detected project root: c:\Users\HP\Desktop\Capstone Project


ValueError: Missing column provided to 'parse_dates': 'date'

In [16]:
# Robust file-read helpers for the notebook
from pathlib import Path
import pandas as pd
import traceback

def try_read_parquet(path):
    """
    Return a DataFrame if parquet exists and is readable, otherwise None.
    """
    p = Path(path)
    try:
        if p.exists():
            df = pd.read_parquet(p)
            return df
        return None
    except Exception as e:
        print(f"‚ö†Ô∏è Failed to read parquet {p}:", e)
        traceback.print_exc(limit=1)
        return None

def try_read_csv(path, parse_dates=None, nrows_preview=0):
    """
    Read CSV if present. If parse_dates is given, inspect file columns first
    and only pass the parse_dates columns that actually exist (avoids ValueError).
    Returns DataFrame or None on missing file / failure.
      - path: Path or str
      - parse_dates: None or list/tuple/str of column(s) to parse as dates
    """
    p = Path(path)
    try:
        if not p.exists():
            return None

        # If parse_dates is provided, check which of those columns exist in the file
        cols_to_parse = None
        if parse_dates:
            # read only header to inspect column names (fast)
            preview = pd.read_csv(p, nrows=nrows_preview)  # nrows=0 returns only columns
            available_cols = set(preview.columns.tolist())
            if isinstance(parse_dates, (list, tuple)):
                cols_to_parse = [c for c in parse_dates if c in available_cols]
            else:
                cols_to_parse = [parse_dates] if parse_dates in available_cols else []
        # actually read with guarded parse_dates
        if cols_to_parse:
            return pd.read_csv(p, parse_dates=cols_to_parse)
        else:
            # read without parse_dates to avoid ValueError
            return pd.read_csv(p)
    except Exception as e:
        print(f"‚ö†Ô∏è Failed to read CSV {p}: {e}")
        traceback.print_exc(limit=1)
        return None


In [17]:
# auto-detect project root (works if notebook runs from project root or analysis/ subfolder)
from pathlib import Path

cwd = Path.cwd()
if (cwd / "data_lake").exists():
    project_root = cwd
elif (cwd.parent / "data_lake").exists():
    project_root = cwd.parent
else:
    # fallback: allow user to set manually (set PROJECT_ROOT variable), else raise
    try:
        project_root  # if previously set by user
    except NameError:
        raise FileNotFoundError(
            "Cannot detect project root (no data_lake/ folder found). "
            "Either run notebook from project root or set project_root manually."
        )

print("Detected project root:", project_root)

# paths relative to project root
features_path = project_root / "data_lake/feature_sets/features.parquet"
trend_summary_path = project_root / "analysis_outputs/trends/trend_summary.csv"
forecast_summary_path = project_root / "analysis_outputs/forecasts/forecast_summary.csv"

# Use the robust helpers
df_features = try_read_parquet(features_path)
trend_summary = try_read_csv(trend_summary_path, parse_dates=['date'])
forecast_summary = try_read_csv(forecast_summary_path, parse_dates=['date'])

print("features:", "found" if df_features is not None else "missing")
print("trend_summary:", "found" if trend_summary is not None else "missing")
print("forecast_summary:", "found" if forecast_summary is not None else "missing")

if df_features is not None:
    print("‚úÖ features shape:", df_features.shape)
    print("Columns:", df_features.columns.tolist())

# If a CSV was read without parse_dates but you want to parse later:
if trend_summary is not None and 'date' in trend_summary.columns and not pd.api.types.is_datetime64_any_dtype(trend_summary['date']):
    trend_summary['date'] = pd.to_datetime(trend_summary['date'], errors='coerce')


Detected project root: c:\Users\HP\Desktop\Capstone Project
features: found
trend_summary: found
forecast_summary: found
‚úÖ features shape: (120, 7)
Columns: ['Date', 'location_inferred', 'O3_ug_m3', 'year', 'month', 'year_month', 'O3_ug_m3_rolling_60m']


In [18]:
# Paths (adjust if your pipeline writes elsewhere)
features_path = Path("data_lake/feature_sets/features.parquet")
trend_summary_path = Path("analysis_outputs/trends/trend_summary.csv")
forecast_summary_path = Path("analysis_outputs/forecasts/forecast_summary.csv")
# There may be per-location forecast CSVs: analysis_outputs/forecasts/forecast_<loc>.csv

# Load
df_features = try_read_parquet(features_path)
trend_summary = try_read_csv(trend_summary_path, parse_dates=['date'])  # may be None
forecast_summary = try_read_csv(forecast_summary_path)

print("features:", "found" if df_features is not None else "missing")
print("trend_summary:", "found" if trend_summary is not None else "missing")
print("forecast_summary:", "found" if forecast_summary is not None else "missing")

# If df_features exists, print a quick preview
if df_features is not None:
    print("features shape:", df_features.shape)
    print(df_features.columns.tolist())
    # Try to find a location column (common names used in pipeline)
    loc_candidates = [c for c in df_features.columns if c.lower().startswith("loc") or c.lower().startswith("region")]
    if loc_candidates:
        location_col = loc_candidates[0]
    else:
        # fallback columns
        location_col = 'location' if 'location' in df_features.columns else 'location_inferred' if 'location_inferred' in df_features.columns else None
    print("Using location col:", location_col)
else:
    location_col = None


features: missing
trend_summary: missing
forecast_summary: missing


In [19]:
# Basic time conversion
if df_features is not None:
    date_col = next((c for c in df_features.columns if c.lower() == 'date' or c.lower() == 'date_time' or c.lower()=='date'), 'date')
    df_features[date_col] = pd.to_datetime(df_features[date_col])
    display(df_features.head())

    # If value col not auto-known, try to detect a numeric pollutant column
    pollutant_candidates = [c for c in df_features.columns if c not in [date_col, location_col, 'year','month','year_month'] and pd.api.types.is_numeric_dtype(df_features[c])]
    print("Numeric candidates:", pollutant_candidates[:6])
    value_col = pollutant_candidates[0] if pollutant_candidates else None
    print("Using value_col:", value_col)
else:
    value_col = None


In [20]:
if df_features is None or value_col is None:
    print("‚ö†Ô∏è No feature dataset or value column found. Skipping static trend plot.")
else:
    fig, ax = plt.subplots(figsize=(12,6))
    if location_col:
        # plot each location lightly; if many locations, plot the aggregated trend
        nloc = df_features[location_col].nunique()
        if nloc <= 8:
            sns.lineplot(data=df_features, x=date_col, y=value_col, hue=location_col, ax=ax, legend='brief')
            ax.set_title(f"{value_col} by location (time)")
        else:
            # aggregate monthly global mean
            agg = df_features.groupby(date_col)[value_col].mean().reset_index()
            sns.lineplot(data=agg, x=date_col, y=value_col, ax=ax)
            ax.set_title(f"Average {value_col} (all locations)")
    else:
        sns.lineplot(data=df_features, x=date_col, y=value_col, ax=ax)
        ax.set_title(f"{value_col} (time)")

    ax.set_ylabel(f"{value_col} (units)")
    ax.set_xlabel("Date")
    plt.tight_layout()
    save_fig(fig, "timeseries_trend.png")
    plt.show()


‚ö†Ô∏è No feature dataset or value column found. Skipping static trend plot.


In [8]:
# Auto-detect per-location forecast CSVs
forecast_dir = Path("analysis_outputs/forecasts")
forecast_files = sorted(list(forecast_dir.glob("forecast_*.csv")))
print("Found forecast files:", [f.name for f in forecast_files])

if forecast_files:
    for fpath in forecast_files:
        df_f = pd.read_csv(fpath, parse_dates=['date'])
        loc = fpath.stem.replace("forecast_","")
        fig, ax = plt.subplots(figsize=(12,6))
        # plot historical + forecast if historical in features
        # Here we use forecast CSV that commonly includes 'y', 'yhat', 'yhat_lower','yhat_upper' or 'yhat'
        if 'y' in df_f.columns:
            sns.lineplot(data=df_f, x='date', y='y', ax=ax, label='hist (y)')
        if 'yhat' in df_f.columns:
            sns.lineplot(data=df_f, x='date', y='yhat', ax=ax, label='forecast (yhat)')
            if 'yhat_lower' in df_f.columns and 'yhat_upper' in df_f.columns:
                ax.fill_between(df_f['date'], df_f['yhat_lower'], df_f['yhat_upper'], alpha=0.25, label='conf interval')
        ax.set_title(f"Forecast ‚Äî {loc}")
        ax.set_xlabel("Date")
        ax.set_ylabel(value_col if value_col else 'value')
        plt.legend()
        save_fig(fig, f"forecast_{loc}.png")
        plt.show()
else:
    print("No forecast files found; skip forecast plotting.")


Found forecast files: []
No forecast files found; skip forecast plotting.


In [9]:
# Create an interactive plotly time-series. Exports as HTML and shows in notebook.
if df_features is None or value_col is None:
    print("‚ö†Ô∏è Missing data for interactive plot.")
else:
    if location_col and df_features[location_col].nunique() <= 8:
        fig = px.line(df_features, x=date_col, y=value_col, color=location_col, title=f"Interactive: {value_col} by location")
    else:
        agg = df_features.groupby(date_col)[value_col].mean().reset_index()
        fig = px.line(agg, x=date_col, y=value_col, title=f"Interactive: Mean {value_col} across locations")

    html_out = FIG_DIR / "interactive_trend.html"
    fig.write_html(str(html_out), include_plotlyjs='cdn')
    print("Saved interactive plot to:", html_out)
    fig.show()


‚ö†Ô∏è Missing data for interactive plot.


In [10]:
if forecast_files:
    for fpath in forecast_files:
        df_f = pd.read_csv(fpath, parse_dates=['date'])
        loc = fpath.stem.replace("forecast_","")
        # prefer y and yhat
        if 'yhat' in df_f.columns:
            fig = go.Figure()
            if 'y' in df_f.columns:
                fig.add_trace(go.Scatter(x=df_f['date'], y=df_f['y'], name='historical'))
            fig.add_trace(go.Scatter(x=df_f['date'], y=df_f['yhat'], name='forecast'))
            if 'yhat_lower' in df_f.columns and 'yhat_upper' in df_f.columns:
                fig.add_trace(go.Scatter(
                    x=pd.concat([df_f['date'], df_f['date'][::-1]]),
                    y=pd.concat([df_f['yhat_upper'], df_f['yhat_lower'][::-1]]),
                    fill='toself', fillcolor='rgba(0,100,80,0.2)', line=dict(color='rgba(255,255,255,0)'),
                    hoverinfo="skip", showlegend=True, name='conf interval'))
            fig.update_layout(title=f"Forecast (interactive) - {loc}", xaxis_title='date', yaxis_title=value_col)
            out_html = FIG_DIR / f"interactive_forecast_{loc}.html"
            fig.write_html(out_html, include_plotlyjs='cdn')
            print("Saved:", out_html)
        else:
            print(f"Skipping interactive forecast for {fpath.name} (no yhat column)")
else:
    print("No forecast CSVs to create interactive plots.")


No forecast CSVs to create interactive plots.


In [11]:
# We need lat/lon columns. Try common names.
lat_candidates = [c for c in (df_features.columns if df_features is not None else []) if 'lat' in c.lower()]
lon_candidates = [c for c in (df_features.columns if df_features is not None else []) if 'lon' in c.lower() or 'lng' in c.lower()]

if df_features is not None and lat_candidates and lon_candidates:
    lat_col = lat_candidates[0]
    lon_col = lon_candidates[0]
    print("Using coords:", lat_col, lon_col)

    # pick a centre
    center = [df_features[lat_col].mean(), df_features[lon_col].mean()]
    m = folium.Map(location=center, zoom_start=6)
    # Option 1: markers for last measurement per location
    latest = df_features.sort_values(date_col).groupby(location_col).tail(1)
    for _, r in latest.iterrows():
        folium.CircleMarker([r[lat_col], r[lon_col]],
                            radius=4, popup=f"{location_col}:{r[location_col]} {value_col}:{r[value_col]}",
                            fill=True).add_to(m)
    # Option 2: heatmap using all points
    heat_data = df_features[[lat_col, lon_col, value_col]].dropna().values.tolist()
    HeatMap(heat_data, radius=10).add_to(m)

    map_out = MAP_DIR / "aq_heatmap.html"
    m.save(map_out)
    print("Saved map:", map_out)
else:
    print("No lat/lon columns found in features dataset. If you have lat/lon, add columns named e.g. 'lat'/'lon'.")


No lat/lon columns found in features dataset. If you have lat/lon, add columns named e.g. 'lat'/'lon'.


In [12]:
# Save small summary files for report
summary = {
    "generated_at": datetime.utcnow().isoformat()+"Z",
    "has_features": df_features is not None,
    "n_rows_features": int(df_features.shape[0]) if df_features is not None else 0,
    "n_locations": int(df_features[location_col].nunique()) if df_features is not None and location_col else 0,
    "figures_dir": str(FIG_DIR),
    "maps_dir": str(MAP_DIR),
}
with open("analysis_outputs/visualization_manifest.json", "w") as f:
    json.dump(summary, f, indent=2)
print("Wrote visualization manifest: analysis_outputs/visualization_manifest.json")
summary


Wrote visualization manifest: analysis_outputs/visualization_manifest.json


{'generated_at': '2025-11-11T16:40:07.177027Z',
 'has_features': False,
 'n_rows_features': 0,
 'n_locations': 0,
 'figures_dir': 'analysis_outputs\\figures',
 'maps_dir': 'analysis_outputs\\maps'}

In [13]:
# Optional: create a simple PPTX containing the PNGs produced above.
try:
    from pptx import Presentation
    from pptx.util import Inches
    prs = Presentation()
    prs.slide_height = Inches(7)
    for png in sorted(FIG_DIR.glob("*.png")):
        slide = prs.slides.add_slide(prs.slide_layouts[6])  # blank
        left = top = Inches(0.5)
        pic = slide.shapes.add_picture(str(png), left, top, width=Inches(9))
    ppt_out = Path("analysis_outputs/visualizations_summary.pptx")
    prs.save(ppt_out)
    print("Saved PPTX:", ppt_out)
except Exception as e:
    print("Skipping PPTX export (python-pptx not installed or error):", e)


Saved PPTX: analysis_outputs\visualizations_summary.pptx
