# 04 Visualizations - Interactive Diagnostics

This notebook consumes the CSV/SHAP exports from `03_models_v2.ipynb` and surfaces interactive diagnostics with Plotly plus ipywidgets. Use the controls beside each section to drill into wards vs meshes, specific models, individual regions, and SHAP explanations for feature impact.

In [1]:
# imports plotting + widget stack, locates data, configures defaults
from __future__ import annotations

from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import ipywidgets as widgets
from IPython.display import display
from pandas.errors import EmptyDataError

# Locate the exports written by notebook 03: use the working directory when it
# holds model_results.csv, otherwise fall back to ./test_notebooks if that does.
NOTEBOOK_DIR = Path.cwd()
candidate = NOTEBOOK_DIR / 'test_notebooks'
here_has_exports = (NOTEBOOK_DIR / 'model_results.csv').exists()
use_candidate = (not here_has_exports) and (candidate / 'model_results.csv').exists()
DATA_DIR = candidate if use_candidate else NOTEBOOK_DIR

# Logical export name -> CSV path inside DATA_DIR.
EXPORTS: Dict[str, Path] = {
    export_key: DATA_DIR / file_name
    for export_key, file_name in (
        ('model_results', 'model_results.csv'),
        ('ward_predictions', 'ward_predictions_detailed.csv'),
        ('mesh_predictions', 'mesh_predictions_detailed.csv'),
        ('viz_predictions', 'model_predictions_viz.csv'),
    )
}
# Per-model SHAP CSVs are looked up in this sub-folder.
SHAP_DIR = DATA_DIR / 'shap_outputs'

# Presentation defaults shared by every table and figure in this notebook.
pd.options.display.float_format = '{:,.2f}'.format
px.defaults.height = 480
px.defaults.width = 940
px.defaults.template = 'plotly_white'

In [2]:
# helper loader with graceful handling for empty/missing CSVs

def load_csv(name: str, path: Path) -> pd.DataFrame:
    """Read one export into a DataFrame, degrading to an empty frame on failure.

    Args:
        name: Label used in the printed status messages.
        path: Location of the CSV file to read.

    Returns:
        The parsed frame, or an empty ``pd.DataFrame`` when the file is missing
        or contains no data, so later cells can simply test ``.empty``.
    """
    if not path.exists():
        print(f"Warning: {path} missing; {name} visuals will be skipped")
        return pd.DataFrame()
    try:
        frame = pd.read_csv(path)
    except EmptyDataError:
        print(f"Warning: {path} is empty; skipping {name} visuals")
        return pd.DataFrame()
    # Announce success only after the read has actually succeeded, so an empty
    # file is no longer reported as both "Loaded" and "empty".
    print(f"Loaded {name} from {path}")
    return frame

# Load the four exports in a fixed order; each name doubles as its EXPORTS key.
(
    model_results,
    ward_predictions,
    mesh_predictions,
    viz_predictions,
) = [
    load_csv(export_key, EXPORTS[export_key])
    for export_key in ('model_results', 'ward_predictions', 'mesh_predictions', 'viz_predictions')
]

Loaded model_results from c:\Users\ignit\OneDrive\Desktop\Study\GeorgiaTech\CSE6242 - Fall 2025\Project\test_notebooks\model_results.csv
Loaded ward_predictions from c:\Users\ignit\OneDrive\Desktop\Study\GeorgiaTech\CSE6242 - Fall 2025\Project\test_notebooks\ward_predictions_detailed.csv
Loaded mesh_predictions from c:\Users\ignit\OneDrive\Desktop\Study\GeorgiaTech\CSE6242 - Fall 2025\Project\test_notebooks\mesh_predictions_detailed.csv
Loaded viz_predictions from c:\Users\ignit\OneDrive\Desktop\Study\GeorgiaTech\CSE6242 - Fall 2025\Project\test_notebooks\model_predictions_viz.csv


## Data snapshot

In [3]:
# Row counts per (level, model): total rows and rows in the test split.
# Levels whose prediction frame is empty contribute nothing.
summary_rows = [
    {
        'Level': level_name,
        'Model': model_name,
        'Rows': len(model_rows),
        'Test rows': (model_rows['Split'] == 'test').sum(),
    }
    for level_frame, level_name in ((ward_predictions, 'Ward'), (mesh_predictions, 'Mesh'))
    if not level_frame.empty
    for model_name in sorted(level_frame['Model'].unique())
    # single-item loop binds the per-model slice once for both counts
    for model_rows in (level_frame[level_frame['Model'] == model_name],)
]
summary_df = pd.DataFrame(summary_rows)
summary_df

Unnamed: 0,Level,Model,Rows,Test rows
0,Ward,LightGBM,2095,364
1,Ward,LinearRegression,2095,364
2,Ward,RandomForest,2095,364
3,Ward,TorchLSTM,1955,364


## Helper utilities

In [4]:
# Shared lookup tables and derived columns used by the widget sections.

# Unit label -> divisor applied to raw values before plotting.
VALUE_UNITS = {
    'JPY/m^2': 1,
    'Thousand JPY/m^2': 10 ** 3,
    'Million JPY/m^2': 10 ** 6,
}
# Level name -> column that identifies a region at that level.
LEVEL_ID = dict(Ward='Ward', Mesh='Mesh250m')
# Level name -> prediction frame (same objects as the loaded frames, not copies).
LEVEL_DATA = dict(Ward=ward_predictions, Mesh=mesh_predictions)

# Add a timestamp column derived from the quarterly PeriodKey to every loaded frame.
for frame in (ward_predictions, mesh_predictions, viz_predictions):
    if frame.empty:
        continue
    frame['QuarterDate'] = pd.PeriodIndex(frame['PeriodKey'], freq='Q').to_timestamp()
if not viz_predictions.empty:
    viz_predictions['Residual'] = viz_predictions['Actual'] - viz_predictions['Predicted']

# Dropdown choices per level; an empty frame yields empty option lists.
MODEL_OPTIONS = {}
REGION_OPTIONS = {}
for level_name, level_frame in LEVEL_DATA.items():
    if level_frame.empty:
        MODEL_OPTIONS[level_name] = []
        REGION_OPTIONS[level_name] = []
        continue
    MODEL_OPTIONS[level_name] = sorted(level_frame['Model'].unique())
    REGION_OPTIONS[level_name] = sorted(level_frame[LEVEL_ID[level_name]].dropna().unique())

# Lower-cased model slug -> spelling used in the prediction exports.
CANONICAL_MODEL = dict(
    lightgbm='LightGBM',
    randomforest='RandomForest',
    linearregression='LinearRegression',
)

def canonical_model(name: str) -> str:
    """Return the canonical spelling of a model name (case-insensitive lookup).

    Names without an entry in ``CANONICAL_MODEL`` are returned unchanged.
    """
    slug = name.lower()
    if slug in CANONICAL_MODEL:
        return CANONICAL_MODEL[slug]
    return name

def _index_shap_exports(shap_dir: Path, suffix: str) -> Dict[Tuple[str, str], Path]:
    """Map ``(Level, Model)`` to each ``<level>_<model><suffix>.csv`` in ``shap_dir``.

    Args:
        shap_dir: Folder that may contain SHAP CSV exports.
        suffix: Trailing part of the file stem, e.g. ``'_shap_summary'``.

    Returns:
        A dict keyed by (title-cased level, canonical model name). Files whose
        stem lacks either a level or a model part are skipped. The dict is
        empty when ``shap_dir`` does not exist.
    """
    index: Dict[Tuple[str, str], Path] = {}
    if not shap_dir.exists():
        return index
    # Sorted so that the winner of any key collision does not depend on
    # directory listing order.
    for path in sorted(shap_dir.glob(f'*{suffix}.csv')):
        # The glob guarantees the stem ends with `suffix`; strip it by length so
        # the suffix itself is never mistaken for the level or model name.
        prefix = path.stem[:-len(suffix)]
        # Split on the first underscore only, so a model slug that itself
        # contains "_" is kept whole.
        level, separator, model = prefix.partition('_')
        if not (level and separator and model):
            continue
        index[(level.title(), canonical_model(model))] = path
    return index


shap_summary_map: Dict[Tuple[str, str], Path] = _index_shap_exports(SHAP_DIR, '_shap_summary')
shap_local_map: Dict[Tuple[str, str], Path] = _index_shap_exports(SHAP_DIR, '_shap_local')

## 1. Metric leaderboard

In [5]:
# interactive bar chart comparing test metrics per level
if model_results.empty:
    print('Model results missing; rerun 03_models_v2.ipynb.')
else:
    # Every column whose name starts with "test_" is offered as a metric.
    test_metrics = [c for c in model_results.columns if c.startswith('test_')]
    level_toggle = widgets.ToggleButtons(options=['Ward', 'Mesh'], description='Level')
    metric_dropdown = widgets.Dropdown(options=test_metrics, description='Metric')
    metric_output = widgets.Output()

    def render_metric(_=None):
        """Redraw the leaderboard bar chart for the selected level and metric.

        The single positional argument is the change payload passed by
        ``observe``; it is unused, so the function can also be called directly.
        """
        with metric_output:
            metric_output.clear_output()
            level = level_toggle.value
            metric_col = metric_dropdown.value
            if metric_col is None:
                # No test_* columns means the dropdown has nothing selected.
                print('No test_* metric columns found in model results')
                return
            subset = model_results[model_results['Level'] == level]
            if subset.empty:
                print(f'No rows for {level}')
                return
            # A whole-number label rounds small-magnitude metrics to 0 or 1, so
            # keep three decimals unless the values are large.
            # NOTE(review): the 100 cut-off is a heuristic; tune if needed.
            peak = pd.to_numeric(subset[metric_col], errors='coerce').abs().max()
            text_format = ',.0f' if pd.notna(peak) and peak >= 100 else ',.3f'
            fig = px.bar(
                subset,
                x='Model',
                y=metric_col,
                color='Model',
                text=metric_col,
                title=f'{level} models | {metric_col.replace("test_", "").upper()} (test split)'
            )
            fig.update_traces(texttemplate='%{text:' + text_format + '}', selector=dict(type='bar'))
            fig.update_layout(yaxis_title=metric_col)
            fig.show()

    level_toggle.observe(render_metric, names='value')
    metric_dropdown.observe(render_metric, names='value')
    render_metric()
    display(widgets.HBox([level_toggle, metric_dropdown]))
    display(metric_output)

HBox(children=(ToggleButtons(description='Level', options=('Ward', 'Mesh'), value='Ward'), Dropdown(descriptio…

Output()

## 2. Time-series explorer

In [6]:
# Time-series explorer: plot actual vs predicted values over quarters for one
# region, with selectable level, model, split, and display unit.
if ward_predictions.empty and mesh_predictions.empty:
    print('Predictions missing; rerun 03_models_v2.ipynb.')
else:
    ts_level = widgets.ToggleButtons(options=['Ward', 'Mesh'], description='Level')
    # Model and region dropdowns start without options; they are filled from
    # MODEL_OPTIONS / REGION_OPTIONS by the update_* callbacks below.
    ts_model = widgets.Dropdown(description='Model')
    ts_region = widgets.Dropdown(description='Region')
    ts_split = widgets.Dropdown(options=['train', 'val', 'test'], value='test', description='Split')
    ts_unit = widgets.Dropdown(options=list(VALUE_UNITS.keys()), value='Thousand JPY/m^2', description='Unit')
    ts_output = widgets.Output()

    def update_model_options(*args):
        """Set the model dropdown's options (and selection) for the current level."""
        level = ts_level.value
        ts_model.options = MODEL_OPTIONS.get(level, [])
        if ts_model.options:
            ts_model.value = ts_model.options[0]
        else:
            ts_model.value = None

    def update_region_options(*args):
        """Set the region dropdown's options (and selection) for the current level."""
        level = ts_level.value
        opts = REGION_OPTIONS.get(level, [])
        ts_region.options = opts
        if opts:
            ts_region.value = opts[0]
        else:
            ts_region.value = None

    # Registered before render_timeseries is attached to ts_level further down,
    # and called once here so both dropdowns are populated before the first render.
    ts_level.observe(update_model_options, names='value')
    ts_level.observe(update_region_options, names='value')
    update_model_options()
    update_region_options()

    def render_timeseries(_=None):
        """Draw actual and predicted series for the current widget selection.

        Prints a short message into ``ts_output`` instead of plotting when the
        level has no data, no model/region is selected, or the filter is empty.
        The positional argument (the ``observe`` change payload) is unused.
        """
        with ts_output:
            ts_output.clear_output()
            level = ts_level.value
            df = LEVEL_DATA.get(level)
            if df is None or df.empty:
                print(f'No predictions for {level}')
                return
            model = ts_model.value
            region = ts_region.value
            if model is None or region is None:
                print('Select a model/region to plot')
                return
            split = ts_split.value
            unit = ts_unit.value
            level_col = LEVEL_ID[level]
            # .copy() so the scaled columns added below do not touch the shared frame
            focus = df[(df['Model'] == model) & (df[level_col] == region) & (df['Split'] == split)].copy()
            if focus.empty:
                print('No rows for this combination. Try another region/model.')
                return
            focus = focus.sort_values('QuarterDate')
            # Divide by the unit's factor so the y axis matches the chosen unit label
            scale = VALUE_UNITS[unit]
            focus['ActualScaled'] = focus['Actual'] / scale
            focus['PredScaled'] = focus['Predicted'] / scale
            fig = go.Figure()
            fig.add_trace(go.Scatter(x=focus['QuarterDate'], y=focus['ActualScaled'], mode='lines+markers', name='Actual'))
            fig.add_trace(go.Scatter(x=focus['QuarterDate'], y=focus['PredScaled'], mode='lines+markers', name='Predicted'))
            fig.update_layout(
                title=f'{region} | {model} | {split} split',
                xaxis_title='Quarter',
                yaxis_title=unit,
                legend_orientation='h'
            )
            fig.show()

    # Re-render whenever any control's value changes.
    for widget_control in [ts_model, ts_region, ts_split, ts_unit]:
        widget_control.observe(render_timeseries, names='value')
    ts_level.observe(render_timeseries, names='value')
    render_timeseries()
    controls = widgets.HBox([ts_level, ts_model, ts_region, ts_split, ts_unit])
    display(controls)
    display(ts_output)

HBox(children=(ToggleButtons(description='Level', options=('Ward', 'Mesh'), value='Ward'), Dropdown(descriptio…

Output()

## 3. Actual vs predicted scatter

In [7]:
# compare actual vs predicted values with drilldown controls
if ward_predictions.empty and mesh_predictions.empty:
    print('Predictions missing; rerun 03_models_v2.ipynb.')
else:
    scatter_level = widgets.ToggleButtons(options=['Ward', 'Mesh'], description='Level')
    scatter_model = widgets.Dropdown(description='Model')
    scatter_split = widgets.Dropdown(options=['train', 'val', 'test'], value='test', description='Split')
    scatter_color = widgets.ToggleButtons(options=['Region', 'Residual'], description='Color by', value='Residual')
    scatter_sample = widgets.IntSlider(min=500, max=5000, step=500, value=2000, description='Sample N')
    scatter_output = widgets.Output()
    scatter_stats = widgets.Output()

    def update_scatter_model(*args):
        """Set the model dropdown's options (and selection) for the current level."""
        level = scatter_level.value
        scatter_model.options = MODEL_OPTIONS.get(level, [])
        if scatter_model.options:
            scatter_model.value = scatter_model.options[0]
        else:
            scatter_model.value = None

    scatter_level.observe(update_scatter_model, names='value')
    update_scatter_model()

    def render_scatter(_=None):
        """Draw the actual-vs-predicted scatter and its error metrics.

        The figure goes to ``scatter_output`` and a one-row metrics table to
        ``scatter_stats``. Both are computed on the same (possibly sampled)
        rows. The positional argument (the ``observe`` change payload) is unused.
        """
        # Clear the metrics panel up front: every early exit below returns
        # before the stats section, which used to leave the previous
        # selection's numbers on screen next to a "no rows" message.
        scatter_stats.clear_output()
        with scatter_output:
            scatter_output.clear_output()
            level = scatter_level.value
            model = scatter_model.value
            if model is None:
                print('Select a model to continue')
                return
            split = scatter_split.value
            df = LEVEL_DATA.get(level)
            if df is None or df.empty:
                print(f'No predictions for {level}')
                return
            focus = df[(df['Model'] == model) & (df['Split'] == split)].copy()
            if focus.empty:
                print('No rows for this selection')
                return
            # Cap the number of plotted points; the fixed seed keeps the sample stable
            if len(focus) > scatter_sample.value:
                focus = focus.sample(scatter_sample.value, random_state=42)
            focus['Residual'] = focus['Actual'] - focus['Predicted']
            color_arg = LEVEL_ID[level] if scatter_color.value == 'Region' else 'Residual'
            fig = px.scatter(
                focus,
                x='Actual',
                y='Predicted',
                color=color_arg,
                hover_data=['PeriodKey', LEVEL_ID[level]],
                title=f'{level} {model} | {split} split',
                labels={'Actual': 'Actual (JPY/m^2)', 'Predicted': 'Predicted (JPY/m^2)'}
            )
            # Two endpoints are enough to draw the y = x reference line; there is
            # no need to route it through every (unsorted) sample point.
            lo, hi = focus['Actual'].min(), focus['Actual'].max()
            fig.add_trace(go.Scatter(x=[lo, hi], y=[lo, hi], mode='lines', name='Ideal', line=dict(color='black', dash='dash')))
            fig.show()
        with scatter_stats:
            # Only reached when a figure was drawn, so `focus` is non-empty here.
            diffs = focus['Residual']
            metrics = pd.DataFrame({
                'MAE (JPY/m^2)': [np.abs(diffs).mean()],
                'RMSE': [np.sqrt(np.mean(diffs ** 2))],
                'Bias (Actual - Predicted)': [diffs.mean()],
                'Sample size': [len(focus)]
            })
            display(metrics)

    for widget_control in [scatter_model, scatter_split, scatter_color, scatter_sample]:
        widget_control.observe(render_scatter, names='value')
    scatter_level.observe(render_scatter, names='value')
    render_scatter()
    display(widgets.VBox([widgets.HBox([scatter_level, scatter_model, scatter_split]), widgets.HBox([scatter_color, scatter_sample])]))
    display(scatter_output)
    display(scatter_stats)



HBox(children=(ToggleButtons(description='Level', options=('Ward', 'Mesh'), value='Ward'), Dropdown(descriptio…

Output()

## 4. Residual explorer

Residual = Actual - Predicted. Positive residuals indicate the model under-predicted (actual was higher), while negative values show over-prediction. Use the controls below to switch between box plots and histograms for different levels/models.

In [8]:
# residual distributions for different views
if viz_predictions.empty:
    print('Visualization dataset missing; rerun exports in 03_models_v2.')
else:
    res_level = widgets.ToggleButtons(options=['Ward', 'Mesh'], description='Level')
    res_model = widgets.Dropdown(description='Model')
    res_plot = widgets.ToggleButtons(options=['box', 'hist'], description='View')
    res_split = widgets.Dropdown(options=['train', 'val', 'test'], value='test', description='Split')
    res_group = widgets.Dropdown(options=['None', 'Quarter', 'Year', 'Region'], value='None', description='Group by')
    res_output = widgets.Output()
    res_stats = widgets.Output()

    def update_res_model(*args):
        """Offer only the models that have rows for the selected level."""
        level = res_level.value
        opts = sorted(viz_predictions[viz_predictions['Level'] == level]['Model'].unique()) if not viz_predictions.empty else []
        res_model.options = opts
        if opts:
            res_model.value = opts[0]
        else:
            res_model.value = None

    res_level.observe(update_res_model, names='value')
    update_res_model()

    def _group_values(df, level, group_choice):
        """Resolve a "Group by" choice to ``(column label, grouping values)``.

        Returns ``(None, None)`` when no grouping is requested, the choice is
        unknown, or the level's region column is absent from ``df``.
        """
        if group_choice == 'None':
            return None, None
        if group_choice == 'Quarter':
            return 'PeriodKey', df['PeriodKey']
        if group_choice == 'Year':
            if 'QuarterDate' in df.columns:
                return 'Year', df['QuarterDate'].dt.year
            # Fallback: take the first four characters of PeriodKey as the year
            return 'Year', df['PeriodKey'].str.slice(0, 4).astype(int)
        if group_choice == 'Region':
            region_col = LEVEL_ID[level]
            # Guard against a KeyError when this export lacks the level's id column
            if region_col not in df.columns:
                return None, None
            return region_col, df[region_col]
        return None, None

    def render_residuals(_=None):
        """Draw the residual distribution and summary tables for the selection.

        The figure goes to ``res_output``; the overall summary and the optional
        grouped table (ten largest groups by row count) go to ``res_stats``.
        The positional argument (the ``observe`` change payload) is unused.
        """
        # Clear the stats panel first so an early exit below cannot leave the
        # previous selection's tables on screen.
        res_stats.clear_output()
        with res_output:
            res_output.clear_output()
            level = res_level.value
            model = res_model.value
            if model is None:
                print('Select a model to continue')
                return
            plot_type = res_plot.value
            split = res_split.value
            focus = viz_predictions[(viz_predictions['Level'] == level) & (viz_predictions['Model'] == model) & (viz_predictions['Split'] == split)].copy()
            if focus.empty:
                print('No residuals for this selection')
                return
            title = f'{level} {model} residuals ({split} split)'
            if plot_type == 'box':
                fig = px.box(focus, y='Residual', points='all', title=title)
                # Residuals lie on the y axis in the box view, so the zero
                # reference must be a horizontal line.
                fig.add_hline(y=0, line_dash='dash', line_color='black')
            else:
                fig = px.histogram(focus, x='Residual', nbins=60, title=title)
                fig.add_vline(x=0, line_dash='dash', line_color='black')
            fig.show()
        with res_stats:
            diffs = focus['Residual']
            summary = pd.DataFrame({
                'MAE (JPY/m^2)': [np.abs(diffs).mean()],
                'RMSE': [np.sqrt(np.mean(diffs ** 2))],
                'Bias (Actual - Predicted)': [diffs.mean()],
                'Std dev': [diffs.std()],
                'Count': [len(focus)]
            })
            display(summary)
            group_choice = res_group.value
            col_name, values = _group_values(focus, level, group_choice)
            if col_name is not None:
                focus['_Group'] = values
                grouped = (
                    focus.groupby('_Group')['Residual']
                    .agg(MeanResidual='mean', MAE=lambda x: np.mean(np.abs(x)), Count='size')
                    .reset_index()
                    .sort_values('Count', ascending=False)
                    .head(10)
                )
                grouped = grouped.rename(columns={'_Group': col_name})
                display(grouped)
            elif group_choice != 'None':
                print(f'Cannot group by {group_choice}: column not available in this dataset')

    for widget_control in [res_model, res_plot, res_split, res_group]:
        widget_control.observe(render_residuals, names='value')
    res_level.observe(render_residuals, names='value')
    render_residuals()
    display(widgets.VBox([widgets.HBox([res_level, res_model, res_plot]), widgets.HBox([res_split, res_group])]))
    display(res_output)
    display(res_stats)




HBox(children=(ToggleButtons(description='Level', options=('Ward', 'Mesh'), value='Ward'), Dropdown(descriptio…

Output()

## 5. Geospatial snapshot

In [9]:
# scatter map with color-coded residuals and controllable point size
if viz_predictions.empty or not {'Latitude', 'Longitude'}.issubset(viz_predictions.columns):
    print('Latitude/Longitude missing; ensure model_predictions_viz.csv includes coordinates.')
else:
    geo_split = widgets.Dropdown(options=['train', 'val', 'test'], value='test', description='Split')
    geo_size = widgets.Dropdown(options=['Actual', 'Predicted'], value='Actual', description='Size by')
    geo_output = widgets.Output()

    def render_geo(_=None):
        """Draw a geo scatter of residuals for the chosen split.

        Marker colour encodes the residual and marker size the absolute value of
        the chosen column. At most 5000 points are drawn (fixed-seed sample).
        The positional argument (the ``observe`` change payload) is unused.
        """
        with geo_output:
            geo_output.clear_output()
            split = geo_split.value
            size_col = geo_size.value
            # Rows without coordinates cannot be placed. Rows without a size
            # value are dropped too, because NaN marker sizes are rejected
            # when the figure is built.
            sample = (
                viz_predictions[viz_predictions['Split'] == split]
                .dropna(subset=['Latitude', 'Longitude', size_col])
                .copy()
            )
            if sample.empty:
                print('No geocoded points for this split')
                return
            sample = sample.sample(n=min(5000, len(sample)), random_state=42)
            # Sizes must be positive; floor at 1.0 so tiny values stay visible
            sample['MarkerSize'] = sample[size_col].abs().clip(lower=1.0)
            fig = px.scatter_geo(
                sample,
                lat='Latitude',
                lon='Longitude',
                color='Residual',
                # Only use the ward name as hover title when the column exists
                hover_name='Ward' if 'Ward' in sample.columns else None,
                size='MarkerSize',
                color_continuous_scale='RdBu',
                title=f'Spatial residuals ({split} split, size = {size_col})'
            )
            fig.update_layout(geo_scope='asia')
            fig.show()

    for widget_control in [geo_split, geo_size]:
        widget_control.observe(render_geo, names='value')
    render_geo()
    display(widgets.HBox([geo_split, geo_size]))
    display(geo_output)

HBox(children=(Dropdown(description='Split', index=2, options=('train', 'val', 'test'), value='test'), Dropdow…

Output()

## 6. SHAP explorer

In [10]:
# choose a model + level to view global and local SHAP explanations
if not shap_summary_map:
    print('SHAP outputs missing; rerun 03_models_v2 with SHAP section enabled.')
else:
    shap_level = widgets.Dropdown(options=sorted({lvl for lvl, _ in shap_summary_map.keys()}), description='Level')
    shap_model = widgets.Dropdown(description='Model')
    shap_topn = widgets.IntSlider(min=5, max=25, step=5, value=15, description='Top features')
    shap_obs = widgets.Dropdown(description='Obs index')
    shap_output = widgets.Output()

    def update_shap_models(*args):
        """Offer the models that have a SHAP summary file for the selected level."""
        level = shap_level.value
        opts = sorted({model for (lvl, model) in shap_summary_map.keys() if lvl == level})
        shap_model.options = opts
        if opts:
            shap_model.value = opts[0]
        else:
            shap_model.value = None

    def update_shap_observations(*args):
        """Offer up to the first 50 observation indices from the local SHAP file."""
        level = shap_level.value
        model = shap_model.value
        local_path = shap_local_map.get((level, model))
        if local_path and local_path.exists():
            local_df = pd.read_csv(local_path)
            obs_values = sorted(local_df['ObservationIndex'].unique())[:50]
            shap_obs.options = obs_values
            shap_obs.value = obs_values[0] if obs_values else None
        else:
            shap_obs.options = []
            shap_obs.value = None

    shap_level.observe(update_shap_models, names='value')
    shap_level.observe(update_shap_observations, names='value')
    shap_model.observe(update_shap_observations, names='value')
    update_shap_models()
    update_shap_observations()

    def render_shap(_=None):
        """Draw the global importance chart and, if available, one local breakdown.

        The global chart shows the ``topn`` features with the largest
        ``MeanAbsSHAP``. The local chart shows every feature's ``SHAPValue``
        for the selected observation. The positional argument (the ``observe``
        change payload) is unused.
        """
        with shap_output:
            shap_output.clear_output()
            level = shap_level.value
            model = shap_model.value
            if model is None:
                print('No SHAP model available for this level')
                return
            topn = shap_topn.value
            summary_path = shap_summary_map.get((level, model))
            if summary_path is None or not summary_path.exists():
                print('No SHAP summary for this selection')
                return
            summary_df = pd.read_csv(summary_path)
            # Rank explicitly instead of trusting the row order of the CSV;
            # for a file already sorted by importance this selects the same rows.
            top_features = summary_df.nlargest(topn, 'MeanAbsSHAP')
            fig = px.bar(
                top_features,
                x='MeanAbsSHAP',
                y='Feature',
                orientation='h',
                title=f'{level} {model} SHAP (top {topn})'
            )
            fig.update_layout(yaxis={'categoryorder': 'total ascending'})
            fig.show()

            local_path = shap_local_map.get((level, model))
            if local_path is None or not local_path.exists():
                print('Local SHAP file missing for this model.')
                return
            if shap_obs.value is None:
                # The file exists but has no selectable observation, which is a
                # different situation from the file being absent.
                print('No observation selected for local SHAP.')
                return
            local_df = pd.read_csv(local_path)
            focus = local_df[local_df['ObservationIndex'] == shap_obs.value]
            if focus.empty:
                print('Local SHAP rows missing for selected observation')
                return
            local_fig = px.bar(
                focus.sort_values('SHAPValue'),
                x='SHAPValue',
                y='Feature',
                orientation='h',
                title=f'Observation {shap_obs.value} contributions'
            )
            local_fig.add_vline(x=0, line_dash='dash', line_color='black')
            local_fig.show()

    for widget_control in [shap_model, shap_topn, shap_obs]:
        widget_control.observe(render_shap, names='value')
    shap_level.observe(render_shap, names='value')
    render_shap()
    controls = widgets.HBox([shap_level, shap_model, shap_topn, shap_obs])
    display(controls)
    display(shap_output)

HBox(children=(Dropdown(description='Level', options=('Ward',), value='Ward'), Dropdown(description='Model', o…

Output()

## Next steps and notes

- Residual explorer: look for regions/models with skewed distributions (consistently positive = under-prediction, consistently negative = over-prediction) and feed that back into feature engineering.
- TorchLSTM targets are restored to JPY/m^2 before evaluation; rerun `03_models_v2.ipynb` after the latest fix to refresh leaderboard numbers and CSV exports.
- SHAP explorer currently covers tree models exported from notebook 03; add more SHAP files (e.g., mesh LightGBM, additional layers) to extend the dropdowns here.