# Data Pipeline Orchestration

## Setup

File setup for project paths and data directories.

In [10]:
from pathlib import Path
import sys
import os

# Detect Colab
IN_COLAB = "google.colab" in sys.modules or "COLAB_GPU" in os.environ

PROJECT_ROOT = None

if IN_COLAB:
    import subprocess
    colab_root = Path("/content/temp-data-pipeline")
    if not (colab_root / "pyproject.toml").exists():
        # Clone repo if not present
        subprocess.run(
            ["git", "clone", "https://github.com/kyler505/temp-data-pipeline.git", str(colab_root)],
            check=True,
        )
    else:
        # Pull latest changes
        subprocess.run(["git", "pull"], cwd=colab_root, check=True)
    PROJECT_ROOT = colab_root
else:
    # Local: search upward for pyproject.toml
    cwd = Path.cwd().resolve()
    for parent in [cwd] + list(cwd.parents):
        if (parent / "pyproject.toml").exists():
            PROJECT_ROOT = parent
            break
    # Fallback to common dev location
    if PROJECT_ROOT is None:
        candidate = Path.home() / "Documents" / "temp-data-pipeline"
        if (candidate / "pyproject.toml").exists():
            PROJECT_ROOT = candidate

if PROJECT_ROOT is None:
    raise FileNotFoundError("Could not find project root. Set PROJECT_ROOT manually.")

# Add to Python path
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
src_path = PROJECT_ROOT / "src"
if src_path.exists() and str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

DATA_DIR = PROJECT_ROOT / "data"
DATA_DIR.mkdir(parents=True, exist_ok=True)

print(f"Project root: {PROJECT_ROOT}")
print(f"Data dir: {DATA_DIR}")

Project root: /content/temp-data-pipeline
Data dir: /content/temp-data-pipeline/data


## Install packages

Install project dependencies in editable mode if needed.

In [11]:
import sys
import subprocess
import importlib

# Always reinstall in editable mode to pick up any code changes
if (PROJECT_ROOT / "pyproject.toml").exists():
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", "-e", str(PROJECT_ROOT)],
        check=True,
    )
    # Clear cached imports so we get the latest code
    for mod_name in list(sys.modules.keys()):
        if mod_name.startswith("tempdata"):
            del sys.modules[mod_name]
    print("Installed/updated tempdata in editable mode")
else:
    raise FileNotFoundError(
        f"pyproject.toml not found in {PROJECT_ROOT}. "
        "Update PROJECT_ROOT in the setup cell."
    )

Installed/updated tempdata in editable mode


## Fetch NOAA hourly data

Configure a station and date range, then run the fetcher.

In [12]:
from tempdata.fetch.noaa_hourly import fetch_noaa_hourly

STATION_ID = "KLGA"
START_DATE = "2016-01-01"
END_DATE = "2025-08-27"  # exclusive

OUTPUT_DIR = DATA_DIR / "raw" / "noaa_hourly" / STATION_ID
CACHE_DIR = DATA_DIR / "cache" / "isd_csv" / STATION_ID

written = fetch_noaa_hourly(
    station_id=STATION_ID,
    start_date=START_DATE,
    end_date=END_DATE,
    out_dir=OUTPUT_DIR,
    cache_dir=CACHE_DIR,
)

print(f"Wrote {len(written)} parquet files:")
for path in written:
    print(f"  - {path}")

[noaa] 2016: rows=13375 coverage=2016-01-01 00:00:00+00:00 -> 2016-12-31 23:51:00+00:00
[noaa] 2017: rows=14043 coverage=2017-01-01 00:00:00+00:00 -> 2017-12-31 23:51:00+00:00
[noaa] 2018: rows=14280 coverage=2018-01-01 00:00:00+00:00 -> 2018-12-31 23:51:00+00:00
[noaa] 2019: rows=14081 coverage=2019-01-01 00:00:00+00:00 -> 2019-12-31 23:51:00+00:00
[noaa] 2020: rows=13841 coverage=2020-01-01 00:00:00+00:00 -> 2020-12-31 23:51:00+00:00
[noaa] 2021: rows=13565 coverage=2021-01-01 00:00:00+00:00 -> 2021-12-31 23:51:00+00:00
[noaa] 2022: rows=13653 coverage=2022-01-01 00:00:00+00:00 -> 2022-12-31 23:51:00+00:00
[noaa] 2023: rows=13647 coverage=2023-01-01 00:15:00+00:00 -> 2023-12-31 23:51:00+00:00
[noaa] 2024: rows=13414 coverage=2024-01-01 00:00:00+00:00 -> 2024-12-31 23:51:00+00:00
[noaa] 2025: rows=8808 coverage=2025-01-01 00:00:00+00:00 -> 2025-08-26 23:51:00+00:00
Wrote 10 parquet files:
  - /content/temp-data-pipeline/data/raw/noaa_hourly/KLGA/2016.parquet
  - /content/temp-data-pip

## Verify outputs

Load one parquet file to confirm the fetch results.

In [13]:
import pandas as pd
from tempdata.schemas import validate_hourly_obs

parquet_files = sorted(OUTPUT_DIR.glob("*.parquet"))
if not parquet_files:
    raise FileNotFoundError(f"No parquet files found in {OUTPUT_DIR}")

# Load ALL parquet files and concatenate
dfs = []
for pf in parquet_files:
    df_year = pd.read_parquet(pf)
    dfs.append(df_year)
    print(f"Loaded {len(df_year)} rows from {pf.name}")

df = pd.concat(dfs, ignore_index=True)
print(f"\nTotal: {len(df)} rows from {len(parquet_files)} files")

# Validate schema (will raise if invalid)
validate_hourly_obs(df, require_unique_keys=False)
print("Schema validation passed")

print(df.head())
print(f"Date range: {df['ts_utc'].min()} to {df['ts_utc'].max()}")

Loaded 13494 rows from 2010.parquet
Loaded 13955 rows from 2011.parquet
Loaded 13704 rows from 2012.parquet
Loaded 13659 rows from 2013.parquet
Loaded 13790 rows from 2014.parquet
Loaded 13668 rows from 2015.parquet
Loaded 13375 rows from 2016.parquet
Loaded 14043 rows from 2017.parquet
Loaded 14280 rows from 2018.parquet
Loaded 14081 rows from 2019.parquet
Loaded 13841 rows from 2020.parquet
Loaded 13565 rows from 2021.parquet
Loaded 13653 rows from 2022.parquet
Loaded 13647 rows from 2023.parquet
Loaded 13414 rows from 2024.parquet
Loaded 8808 rows from 2025.parquet

Total: 214977 rows from 16 files
Schema validation passed
                     ts_utc station_id       lat       lon  temp_c source  \
0 2010-01-01 00:00:00+00:00       KLGA  40.77944 -73.88035     1.1   noaa   
1 2010-01-01 00:51:00+00:00       KLGA  40.77944 -73.88035     1.1   noaa   
2 2010-01-01 01:36:00+00:00       KLGA  40.77944 -73.88035     1.0   noaa   
3 2010-01-01 01:51:00+00:00       KLGA  40.77944 -73.88035

## Clean hourly data

Apply the cleaning pipeline to the fetched data:
- Validate input schema (early fail on malformed data)
- Sort and deduplicate by (ts_utc, station_id)
- Flag missing temperature values
- Flag and nullify out-of-range temperatures
- Detect hour-to-hour spikes

In [14]:
from tempdata.clean import clean_hourly_obs

# Clean the fetched data
# This applies: deduplication, missing value flags, out-of-range handling, spike detection
df_clean = clean_hourly_obs(df)

print(f"\nCleaned DataFrame shape: {df_clean.shape}")
print(df_clean.head())

[clean] Cleaning summary:
  Total rows: 214977 -> 214686 (291 duplicates removed)
  Rows with QC flags: 5769
    QC_MISSING_VALUE: 5768
    QC_SPIKE_DETECTED: 1
  Temp range (valid): -17.2C to 39.4C

Cleaned DataFrame shape: (214686, 7)
                     ts_utc station_id       lat       lon  temp_c source  \
0 2010-01-01 00:00:00+00:00       KLGA  40.77944 -73.88035     1.1   noaa   
1 2010-01-01 00:51:00+00:00       KLGA  40.77944 -73.88035     1.1   noaa   
2 2010-01-01 01:36:00+00:00       KLGA  40.77944 -73.88035     1.0   noaa   
3 2010-01-01 01:51:00+00:00       KLGA  40.77944 -73.88035     1.1   noaa   
4 2010-01-01 02:01:00+00:00       KLGA  40.77944 -73.88035     1.0   noaa   

   qc_flags  
0         0  
1         0  
2         0  
3         0  
4         0  


## Aggregate to Daily Tmax

Convert cleaned hourly observations to daily maximum temperature (Tmax).

Key design principles:
- **Market-aligned**: Tmax is computed per station-local calendar day, not UTC
- **QC-aware**: Hours with `QC_OUT_OF_RANGE` are excluded from Tmax calculation
- **Spike-inclusive**: Spike-flagged values ARE included (to avoid removing real heat spikes)
- **Transparent**: Every day carries `coverage_hours` and propagated `qc_flags`

In [15]:
from tempdata.aggregate.build_daily_tmax import build_daily_tmax
from tempdata.schemas.daily_tmax import validate_daily_tmax

# Station timezone (KLGA is in Eastern time)
STATION_TZ = "America/New_York"

# Build daily Tmax from cleaned hourly data
df_daily = build_daily_tmax(df_clean, station_tz=STATION_TZ)

# Validate the output schema
validate_daily_tmax(df_daily)
print("Daily Tmax schema validation passed")

print(f"\nAggregated {len(df_clean)} hourly obs -> {len(df_daily)} daily records")
print(f"Date range: {df_daily['date_local'].min().date()} to {df_daily['date_local'].max().date()}")

print("\nDaily Tmax summary:")
print(df_daily[["date_local", "tmax_c", "tmax_f", "coverage_hours", "qc_flags"]].head(10))

Daily Tmax schema validation passed

Aggregated 214686 hourly obs -> 5718 daily records
Date range: 2009-12-31 to 2025-08-26

Daily Tmax summary:
                 date_local  tmax_c  tmax_f  coverage_hours  qc_flags
0 2009-12-31 00:00:00-05:00     1.1    34.0               5        17
1 2010-01-01 00:00:00-05:00     3.9    39.0              24         1
2 2010-01-02 00:00:00-05:00     0.0    32.0              24         1
3 2010-01-03 00:00:00-05:00    -5.0    23.0              24         1
4 2010-01-04 00:00:00-05:00    -0.6    30.9              24         1
5 2010-01-05 00:00:00-05:00    -0.6    30.9              24         1
6 2010-01-06 00:00:00-05:00     1.1    34.0              24         1
7 2010-01-07 00:00:00-05:00     3.3    37.9              24         1
8 2010-01-08 00:00:00-05:00     1.1    34.0              24         1
9 2010-01-09 00:00:00-05:00    -0.6    30.9              24         1


## Coverage and Quality Analysis

Check data quality metrics for the aggregated daily Tmax.

In [16]:
from tempdata.schemas.qc_flags import QC_LOW_COVERAGE, QC_INCOMPLETE_DAY, QC_SPIKE_DETECTED

# Coverage statistics
print("Coverage Statistics:")
print(f"  Min coverage: {df_daily['coverage_hours'].min()} hours")
print(f"  Max coverage: {df_daily['coverage_hours'].max()} hours")
print(f"  Mean coverage: {df_daily['coverage_hours'].mean():.1f} hours")
print(f"  Days with 24h coverage: {(df_daily['coverage_hours'] == 24).sum()}")

# QC flag breakdown
print("\nQC Flag Analysis:")
low_coverage_days = ((df_daily['qc_flags'] & QC_LOW_COVERAGE) != 0).sum()
incomplete_days = ((df_daily['qc_flags'] & QC_INCOMPLETE_DAY) != 0).sum()
spike_days = ((df_daily['qc_flags'] & QC_SPIKE_DETECTED) != 0).sum()

print(f"  Days with QC_LOW_COVERAGE: {low_coverage_days}")
print(f"  Days with QC_INCOMPLETE_DAY: {incomplete_days}")
print(f"  Days with QC_SPIKE_DETECTED: {spike_days}")
print(f"  Days with no QC issues: {(df_daily['qc_flags'] == 0).sum()}")

# Temperature range
print("\nTemperature Range:")
print(f"  Min Tmax: {df_daily['tmax_c'].min():.1f}°C ({df_daily['tmax_f'].min():.1f}°F)")
print(f"  Max Tmax: {df_daily['tmax_c'].max():.1f}°C ({df_daily['tmax_f'].max():.1f}°F)")
print(f"  Mean Tmax: {df_daily['tmax_c'].mean():.1f}°C ({df_daily['tmax_f'].mean():.1f}°F)")

Coverage Statistics:
  Min coverage: 5 hours
  Max coverage: 24 hours
  Mean coverage: 24.0 hours
  Days with 24h coverage: 5695

QC Flag Analysis:
  Days with QC_LOW_COVERAGE: 1
  Days with QC_INCOMPLETE_DAY: 0
  Days with QC_SPIKE_DETECTED: 1
  Days with no QC issues: 20

Temperature Range:
  Min Tmax: -10.0°C (14.0°F)
  Max Tmax: 39.4°C (102.9°F)
  Mean Tmax: 17.6°C (63.6°F)


## Save Daily Tmax

Write the daily Tmax data to parquet for downstream use (backtesting, model training, trading validation).

In [17]:
from tempdata.aggregate.build_daily_tmax import write_daily_tmax

# Output paths - partition daily Tmax by year like hourly data
DAILY_TMAX_DIR = DATA_DIR / "clean" / "daily_tmax" / STATION_ID
DAILY_TMAX_DIR.mkdir(parents=True, exist_ok=True)

# Determine year range for partitioning
years = df_daily["date_local"].dt.year.unique()
for year in years:
    year_df = df_daily[df_daily["date_local"].dt.year == year]
    year_path = DAILY_TMAX_DIR / f"{year}.parquet"
    year_df.to_parquet(year_path, index=False)
    print(f"[aggregate] Wrote {len(year_df)} rows to {year_path}")

# Also save cleaned hourly data for reference
HOURLY_CLEAN_DIR = DATA_DIR / "clean" / "hourly_obs" / STATION_ID
HOURLY_CLEAN_DIR.mkdir(parents=True, exist_ok=True)

# Determine year range for partitioning
years = df_clean["ts_utc"].dt.year.unique()
for year in years:
    year_df = df_clean[df_clean["ts_utc"].dt.year == year]
    year_path = HOURLY_CLEAN_DIR / f"{year}.parquet"
    year_df.to_parquet(year_path, index=False)
    print(f"[clean] Wrote {len(year_df)} rows to {year_path}")

print(f"\nPipeline complete!")
print(f"  Daily Tmax: {DAILY_TMAX_DIR}")
print(f"  Cleaned hourly: {HOURLY_CLEAN_DIR}")

[aggregate] Wrote 1 rows to /content/temp-data-pipeline/data/clean/daily_tmax/KLGA/2009.parquet
[aggregate] Wrote 365 rows to /content/temp-data-pipeline/data/clean/daily_tmax/KLGA/2010.parquet
[aggregate] Wrote 365 rows to /content/temp-data-pipeline/data/clean/daily_tmax/KLGA/2011.parquet
[aggregate] Wrote 366 rows to /content/temp-data-pipeline/data/clean/daily_tmax/KLGA/2012.parquet
[aggregate] Wrote 365 rows to /content/temp-data-pipeline/data/clean/daily_tmax/KLGA/2013.parquet
[aggregate] Wrote 365 rows to /content/temp-data-pipeline/data/clean/daily_tmax/KLGA/2014.parquet
[aggregate] Wrote 365 rows to /content/temp-data-pipeline/data/clean/daily_tmax/KLGA/2015.parquet
[aggregate] Wrote 366 rows to /content/temp-data-pipeline/data/clean/daily_tmax/KLGA/2016.parquet
[aggregate] Wrote 365 rows to /content/temp-data-pipeline/data/clean/daily_tmax/KLGA/2017.parquet
[aggregate] Wrote 365 rows to /content/temp-data-pipeline/data/clean/daily_tmax/KLGA/2018.parquet
[aggregate] Wrote 365 

## Verify Saved Data

Reload the saved parquet to confirm it was written correctly.

In [18]:
# Reload and verify the saved daily Tmax data (partitioned by year)
daily_tmax_files = list(DAILY_TMAX_DIR.glob("*.parquet"))
daily_tmax_dfs = []
for f in daily_tmax_files:
    df_year = pd.read_parquet(f)
    daily_tmax_dfs.append(df_year)
    print(f"Loaded {len(df_year)} rows from {f.name}")

df_verify = pd.concat(daily_tmax_dfs, ignore_index=True)
print(f"\nTotal: {len(df_verify)} daily records from {len(daily_tmax_files)} files")

# Validate schema
validate_daily_tmax(df_verify)
print("Schema validation passed")

# Show sample of dataset
print("\nDaily Tmax Data (first 10 rows):")
print(df_verify.head(10).to_string(index=False))

Loaded 365 rows from 2021.parquet
Loaded 365 rows from 2018.parquet
Loaded 365 rows from 2022.parquet
Loaded 366 rows from 2012.parquet
Loaded 365 rows from 2014.parquet
Loaded 238 rows from 2025.parquet
Loaded 366 rows from 2024.parquet
Loaded 365 rows from 2010.parquet
Loaded 1 rows from 2009.parquet
Loaded 365 rows from 2011.parquet
Loaded 366 rows from 2016.parquet
Loaded 365 rows from 2015.parquet
Loaded 365 rows from 2013.parquet
Loaded 365 rows from 2019.parquet
Loaded 366 rows from 2020.parquet
Loaded 365 rows from 2023.parquet
Loaded 365 rows from 2017.parquet

Total: 5718 daily records from 17 files
Schema validation passed

Daily Tmax Data (first 10 rows):
               date_local station_id  tmax_c  tmax_f  coverage_hours   source  qc_flags                   updated_at_utc
2021-01-01 00:00:00-05:00       KLGA     5.0    41.0              24 noaa_isd         1 2026-01-19 19:36:15.075180+00:00
2021-01-02 00:00:00-05:00       KLGA    11.1    52.0              24 noaa_isd     

## Fetch Open-Meteo Historical Forecasts

Ingest **historical** daily Tmax forecasts from Open-Meteo for the same station and date range as the truth data.

This creates the **feature-side** dataset: "What did the forecast say at issue time about a target local date?"

Key concepts:
- **Issue time**: when the forecast was issued (simulated as midnight UTC of the day before target)
- **Target date**: the station-local calendar date being forecasted
- **Lead hours**: hours from issue time to target date midnight in station timezone

Using historical forecasts allows us to join forecasts to truth data for model training and backtesting.

In [19]:
from tempdata.fetch.openmeteo_daily_forecast import fetch_openmeteo_historical_forecasts
from tempdata.schemas.daily_tmax_forecast import validate_daily_tmax_forecast

# Use the same date range as the truth data (from NOAA fetch)
FORECAST_START_DATE = START_DATE  # e.g., "2024-01-01"
FORECAST_END_DATE = END_DATE      # e.g., "2024-02-01" (exclusive for NOAA, but inclusive for Open-Meteo)

# Adjust end date: NOAA uses exclusive end, Open-Meteo uses inclusive
# Subtract 1 day from END_DATE to match the truth data range
from datetime import datetime, timedelta
end_dt = datetime.strptime(FORECAST_END_DATE, "%Y-%m-%d") - timedelta(days=1)
forecast_end_date = end_dt.strftime("%Y-%m-%d")

print(f"Fetching historical forecasts for {STATION_ID}")
print(f"Date range: {FORECAST_START_DATE} to {forecast_end_date}")

# Output directories
FORECAST_RAW_DIR = DATA_DIR / "raw" / "forecasts" / "openmeteo" / STATION_ID
FORECAST_CLEAN_DIR = DATA_DIR / "clean" / "forecasts" / "openmeteo" / STATION_ID

forecast_files, df_forecast = fetch_openmeteo_historical_forecasts(
    station_id=STATION_ID,
    start_date=FORECAST_START_DATE,
    end_date=forecast_end_date,
    out_raw_dir=FORECAST_RAW_DIR,
    out_parquet_dir=FORECAST_CLEAN_DIR,
    write_raw=True,  # Save raw JSON for debugging
)

print(f"\nWrote {len(forecast_files)} files:")
for path in forecast_files:
    print(f"  - {path}")

Fetching historical forecasts for KLGA
Date range: 2016-01-01 to 2025-08-26

Wrote 2 files:
  - /content/temp-data-pipeline/data/raw/forecasts/openmeteo/KLGA/historical_2016-01-01_to_2025-08-26.json
  - /content/temp-data-pipeline/data/clean/forecasts/openmeteo/KLGA/historical_2016-01-01_to_2025-08-26.parquet


## Verify Forecast Data

Load and validate the forecast parquet, then display a summary.

In [20]:
# df_forecast is already returned from fetch_openmeteo_historical_forecasts
print(f"Loaded {len(df_forecast)} forecast rows")

# Validate schema (already validated in fetch, but double-check)
validate_daily_tmax_forecast(df_forecast)
print("Schema validation passed")

# Display summary
print(f"\nForecast Summary:")
print(f"  Target dates: {df_forecast['target_date_local'].min().date()} to {df_forecast['target_date_local'].max().date()}")
print(f"  Lead hours range: {df_forecast['lead_hours'].min()} to {df_forecast['lead_hours'].max()}")
print(f"  Tmax (C): {df_forecast['tmax_pred_c'].min():.1f} to {df_forecast['tmax_pred_c'].max():.1f}")
print(f"  Tmax (F): {df_forecast['tmax_pred_f'].min():.1f} to {df_forecast['tmax_pred_f'].max():.1f}")

print("\nForecast Data (first 10 rows):")
print(df_forecast[["target_date_local", "tmax_pred_c", "tmax_pred_f", "lead_hours"]].head(10).to_string(index=False))

Loaded 2796 forecast rows
Schema validation passed

Forecast Summary:
  Target dates: 2017-12-31 to 2025-08-26
  Lead hours range: 28 to 29
  Tmax (C): -12.1 to 39.4
  Tmax (F): 10.2 to 102.9

Forecast Data (first 10 rows):
target_date_local  tmax_pred_c  tmax_pred_f  lead_hours
       2017-12-31        -12.1        10.22          29
       2018-01-01         -7.7        18.14          29
       2018-01-02         -3.6        25.52          29
       2018-01-03         -2.0        28.40          29
       2018-01-04         -2.8        26.96          29
       2018-01-05         -7.2        19.04          29
       2018-01-06        -10.5        13.10          29
       2018-01-07         -7.9        17.78          29
       2018-01-08         -0.5        31.10          29
       2018-01-09          5.4        41.72          29


## Feature Engineering for Daily Tmax

Transform forecasts and truth data into a **model-ready training dataset**.

This section uses:
- **Real truth data** (`df_daily`) from the NOAA aggregation step above
- **Real historical forecasts** (`df_forecast`) from the Open-Meteo fetch above

The feature engineering pipeline:
1. Joins forecasts to truth on `(station_id, target_date_local)`
2. Filters low-quality truth days by coverage
3. Adds seasonal encodings: `sin_doy`, `cos_doy`, `month`
4. Computes rolling bias/error statistics: `bias_7d`, `bias_14d`, `bias_30d`, `rmse_14d`, `rmse_30d`, `sigma_lead`

All rolling features use `.shift(1)` to ensure **no lookahead** — each row's features are computed only from prior data.

In [21]:
# Use real data from previous cells:
# - df_daily: NOAA observations aggregated to daily Tmax (truth)
# - df_forecast: Open-Meteo historical forecasts

# Prepare truth data for feature engineering
df_truth_for_features = df_daily.copy()

print(f"Using real truth data: {len(df_truth_for_features)} days")
print(f"  Date range: {df_truth_for_features['date_local'].min().date()} to {df_truth_for_features['date_local'].max().date()}")

# Use the real Open-Meteo historical forecasts
df_forecast_for_features = df_forecast.copy()

print(f"\nUsing real Open-Meteo historical forecasts:")
print(f"  Forecast rows: {len(df_forecast_for_features)}")
print(f"  Lead times: {sorted(df_forecast_for_features['lead_hours'].unique())} hours")
print(f"  Target date range: {df_forecast_for_features['target_date_local'].min().date()} to {df_forecast_for_features['target_date_local'].max().date()}")

Using real truth data: 5718 days
  Date range: 2009-12-31 to 2025-08-26

Using real Open-Meteo historical forecasts:
  Forecast rows: 2796
  Lead times: [np.int64(28), np.int64(29)] hours
  Target date range: 2017-12-31 to 2025-08-26


## Build Training Dataset

Run the feature engineering pipeline to create model-ready features.

In [22]:
from tempdata.features.build_train_daily_tmax import build_train_daily_tmax
from tempdata.schemas.train_daily_tmax import validate_train_daily_tmax, TRAIN_DAILY_TMAX_FIELDS

# Build the training dataset using real truth data + real Open-Meteo forecasts
# This performs: join, seasonal features, rolling bias/error stats, validation
df_train = build_train_daily_tmax(
    forecast_df=df_forecast_for_features,
    truth_df=df_truth_for_features,
    min_coverage_hours=18,  # Filter low-quality truth days
    drop_warmup_nulls=False,  # Keep warm-up rows (they have NaN in rolling features)
    validate=True,
)

print(f"Training dataset: {len(df_train)} rows")
print(f"Columns: {list(df_train.columns)}")
print(f"\nColumn types:")
for col in df_train.columns:
    print(f"  {col}: {df_train[col].dtype}")

Training dataset: 2796 rows
Columns: ['station_id', 'issue_time_utc', 'target_date_local', 'tmax_pred_f', 'lead_hours', 'forecast_source', 'sin_doy', 'cos_doy', 'month', 'bias_7d', 'bias_14d', 'bias_30d', 'rmse_14d', 'rmse_30d', 'sigma_lead', 'tmax_actual_f']

Column types:
  station_id: object
  issue_time_utc: datetime64[ns, UTC]
  target_date_local: datetime64[ns]
  tmax_pred_f: float64
  lead_hours: int64
  forecast_source: object
  sin_doy: float64
  cos_doy: float64
  month: int32
  bias_7d: float64
  bias_14d: float64
  bias_30d: float64
  rmse_14d: float64
  rmse_30d: float64
  sigma_lead: float64
  tmax_actual_f: float64


## Inspect Features

Examine the generated features, focusing on rolling statistics and seasonal encodings.

In [23]:
# Display core features and seasonal encodings
display_cols = [
    "target_date_local", "lead_hours", "tmax_pred_f", "tmax_actual_f",
    "sin_doy", "cos_doy", "month"
]
print("Core Features & Seasonal Encodings (first 10 rows):")
print(df_train[display_cols].head(10).to_string(index=False))

# Display rolling bias/error features
rolling_cols = [
    "target_date_local", "lead_hours", "bias_7d", "bias_14d", "bias_30d",
    "rmse_14d", "rmse_30d", "sigma_lead"
]
print("\n\nRolling Bias & Error Features (rows 10-20, after warm-up):")
print(df_train[rolling_cols].iloc[10:20].to_string(index=False))

Core Features & Seasonal Encodings (first 10 rows):
target_date_local  lead_hours  tmax_pred_f  tmax_actual_f  sin_doy  cos_doy  month
       2018-03-12          28        43.52           42.1 0.939570 0.342357      3
       2018-03-13          28        41.00           41.0 0.945320 0.326144      3
       2018-03-14          28        40.82           42.1 0.950790 0.309835      3
       2018-03-15          28        46.94           48.0 0.955979 0.293434      3
       2018-03-16          28        39.56           39.0 0.960885 0.276946      3
       2018-03-17          28        47.66           48.0 0.965507 0.260376      3
       2018-03-18          28        43.70           44.1 0.969843 0.243730      3
       2018-03-19          28        46.58           46.9 0.973892 0.227011      3
       2018-03-20          28        39.74           39.0 0.977653 0.210225      3
       2018-03-21          28        39.56           39.9 0.981125 0.193376      3


Rolling Bias & Error Features (ro

## Verify No-Lookahead Property

Confirm that rolling features are computed correctly with `.shift(1)` — each row's features should only use prior data.

In [24]:
# Verify no-lookahead: The first row for each (station, lead_hours) group should have NaN bias
# because there's no prior data to compute rolling stats from

first_rows = df_train.groupby(["station_id", "lead_hours"]).first()
print("First row per (station_id, lead_hours) group — bias_7d should be NaN:")
print(first_rows[["bias_7d", "bias_14d", "bias_30d"]].to_string())

# Compute actual residual for a specific row and verify it's NOT in its own bias
# Pick a row after warm-up period
if len(df_train) > 10:
    test_idx = 10
    test_row = df_train.iloc[test_idx]
    actual_residual = test_row["tmax_pred_f"] - test_row["tmax_actual_f"]

    print(f"\n\nVerification for row {test_idx}:")
    print(f"  Actual residual (pred - actual): {actual_residual:.2f}°F")
    print(f"  bias_7d (computed from PRIOR 7 days): {test_row['bias_7d']:.2f}°F")
    print(f"  These should be different values (bias excludes current row)")

    # The bias_7d should NOT equal the current residual (unless by coincidence)
    print(f"\n  Residual == bias_7d? {abs(actual_residual - test_row['bias_7d']) < 0.01}")

First row per (station_id, lead_hours) group — bias_7d should be NaN:
                       bias_7d  bias_14d  bias_30d
station_id lead_hours                             
KLGA       28             1.42      1.42      1.42
           29           -10.78    -10.78    -10.78


Verification for row 10:
  Actual residual (pred - actual): -5.78°F
  bias_7d (computed from PRIOR 7 days): -0.17°F
  These should be different values (bias excludes current row)

  Residual == bias_7d? False


## Analyze Forecast Bias by Lead Time

The rolling bias features capture systematic forecast errors that vary by lead time.

In [25]:
# Compute actual residuals for analysis
df_train["residual"] = df_train["tmax_pred_f"] - df_train["tmax_actual_f"]

# Analyze bias by lead time
print("Forecast Error Analysis by Lead Time:")
print("=" * 60)

bias_by_lead = df_train.groupby("lead_hours").agg({
    "residual": ["mean", "std", "count"],
    "bias_30d": "mean",  # Average of the rolling bias feature
    "sigma_lead": "mean",  # Average sigma_lead
}).round(2)

bias_by_lead.columns = ["Mean Error (°F)", "Std Dev (°F)", "Count", "Avg bias_30d", "Avg sigma_lead"]
print(bias_by_lead.to_string())

print("\n\nKey Insight:")
print("  - sigma_lead captures uncertainty and can be used for confidence intervals")
print("  - bias_30d provides a rolling estimate that adapts to recent forecast performance")

Forecast Error Analysis by Lead Time:
            Mean Error (°F)  Std Dev (°F)  Count  Avg bias_30d  Avg sigma_lead
lead_hours                                                                    
28                    -0.09          1.60   1836         -0.10            1.65
29                    -0.46          1.46    960         -0.51            1.68


Key Insight:
  - sigma_lead captures uncertainty and can be used for confidence intervals
  - bias_30d provides a rolling estimate that adapts to recent forecast performance


## Save Training Dataset

Write the feature-engineered training dataset to parquet for model training.

In [26]:
from tempdata.features.build_train_daily_tmax import write_train_daily_tmax

# Create output directory
TRAIN_DIR = DATA_DIR / "train" / "daily_tmax" / STATION_ID
TRAIN_DIR.mkdir(parents=True, exist_ok=True)

# Drop rows with NaN rolling features for clean training data
df_train_clean = df_train.dropna(subset=["bias_7d", "bias_14d", "bias_30d", "rmse_14d", "rmse_30d", "sigma_lead"])
print(f"Rows after dropping warm-up NaNs: {len(df_train_clean)} (was {len(df_train)})")

# Select only the schema columns (drop residual which was for analysis)
df_train_final = df_train_clean[TRAIN_DAILY_TMAX_FIELDS].copy()

# Write to parquet
output_path = TRAIN_DIR / "train_daily_tmax.parquet"
write_train_daily_tmax(df_train_final, output_path)

print(f"\nTraining dataset saved to: {output_path}")

Rows after dropping warm-up NaNs: 2792 (was 2796)
[features] wrote 2792 rows to /content/temp-data-pipeline/data/train/daily_tmax/KLGA/train_daily_tmax.parquet

Training dataset saved to: /content/temp-data-pipeline/data/train/daily_tmax/KLGA/train_daily_tmax.parquet


## Feature Engineering Summary

The `train_daily_tmax` dataset is now ready for model training with:

**Input Data:**
- Truth: Real NOAA observations aggregated to daily Tmax
- Forecasts: Real Open-Meteo historical forecasts

**Core Features:**
- `tmax_pred_f`: Raw forecast (the baseline to beat)
- `lead_hours`: Forecast horizon (longer = more uncertainty)
- `forecast_source`: Model identifier for multi-model ensembles

**Seasonal Encodings:**
- `sin_doy`, `cos_doy`: Capture annual temperature cycles
- `month`: Coarse seasonal regime

**Rolling Bias/Error Statistics (key value-add):**
- `bias_7d`, `bias_14d`, `bias_30d`: Recent forecast bias (forecast - observed)
- `rmse_14d`, `rmse_30d`: Recent forecast error magnitude
- `sigma_lead`: Historical uncertainty for this lead time

**Label:**
- `tmax_actual_f`: Observed maximum temperature (ground truth)

In [27]:
# Pipeline complete - summary of outputs
print("Pipeline outputs:")
print(f"  - Raw NOAA hourly data: {OUTPUT_DIR}")
print(f"  - Cleaned hourly observations: {HOURLY_CLEAN_DIR}")
print(f"  - Daily Tmax (truth): {DAILY_TMAX_DIR / f'{STATION_ID}.parquet'}")
print(f"  - Open-Meteo forecasts: {FORECAST_CLEAN_DIR}")
print(f"  - Training dataset: {output_path}")

Pipeline outputs:
  - Raw NOAA hourly data: /content/temp-data-pipeline/data/raw/noaa_hourly/KLGA
  - Cleaned hourly observations: /content/temp-data-pipeline/data/clean/hourly_obs/KLGA
  - Daily Tmax (truth): /content/temp-data-pipeline/data/clean/daily_tmax/KLGA/KLGA.parquet
  - Open-Meteo forecasts: /content/temp-data-pipeline/data/clean/forecasts/openmeteo/KLGA
  - Training dataset: /content/temp-data-pipeline/data/train/daily_tmax/KLGA/train_daily_tmax.parquet
