# Setup

Locate the project root (cloning or updating the repository when running on Colab), add it to the Python path, and create the data directory.

In [28]:
from pathlib import Path
import sys
import os

# Detect Colab: either the google.colab module is already loaded or the
# runtime exports COLAB_GPU.
IN_COLAB = "google.colab" in sys.modules or "COLAB_GPU" in os.environ

# Environment variable that pins the project root explicitly. Without it
# there is no way to "set PROJECT_ROOT manually": this cell resets the
# variable on every run before auto-detecting.
PROJECT_ROOT_ENV = "TEMPDATA_PROJECT_ROOT"

PROJECT_ROOT = None

project_root_override = os.environ.get(PROJECT_ROOT_ENV)

if project_root_override:
    # An explicit override wins over every auto-detection strategy below.
    override_root = Path(project_root_override).expanduser().resolve()
    if not (override_root / "pyproject.toml").exists():
        raise FileNotFoundError(
            f"{PROJECT_ROOT_ENV}={project_root_override!r} does not contain pyproject.toml."
        )
    PROJECT_ROOT = override_root
elif IN_COLAB:
    import subprocess
    colab_root = Path("/content/temp-data-pipeline")
    if not (colab_root / "pyproject.toml").exists():
        # Clone repo if not present
        subprocess.run(
            ["git", "clone", "https://github.com/kyler505/temp-data-pipeline.git", str(colab_root)],
            check=True,
        )
    else:
        # Pull latest changes
        subprocess.run(["git", "pull"], cwd=colab_root, check=True)
    PROJECT_ROOT = colab_root
else:
    # Local: search upward from the working directory for pyproject.toml
    cwd = Path.cwd().resolve()
    for parent in [cwd] + list(cwd.parents):
        if (parent / "pyproject.toml").exists():
            PROJECT_ROOT = parent
            break
    # Fallback to common dev location
    if PROJECT_ROOT is None:
        candidate = Path.home() / "Documents" / "temp-data-pipeline"
        if (candidate / "pyproject.toml").exists():
            PROJECT_ROOT = candidate

if PROJECT_ROOT is None:
    raise FileNotFoundError(
        "Could not find project root. "
        f"Set the {PROJECT_ROOT_ENV} environment variable to the repository path."
    )

# Add to Python path. src/ is inserted last so it ends up ahead of the
# repository root in sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
src_path = PROJECT_ROOT / "src"
if src_path.exists() and str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# All pipeline inputs and outputs live under <project root>/data.
DATA_DIR = PROJECT_ROOT / "data"
DATA_DIR.mkdir(parents=True, exist_ok=True)

print(f"Project root: {PROJECT_ROOT}")
print(f"Data dir: {DATA_DIR}")

Project root: /content/temp-data-pipeline
Data dir: /content/temp-data-pipeline/data


# Install packages

Install the project in editable mode. The install runs on every execution so that the latest code changes are picked up.

In [29]:
import sys
import subprocess
import importlib

# Always reinstall in editable mode to pick up any code changes.
# Fail fast on a wrong project root before shelling out to pip.
if not (PROJECT_ROOT / "pyproject.toml").exists():
    raise FileNotFoundError(
        f"pyproject.toml not found in {PROJECT_ROOT}. "
        "Update PROJECT_ROOT in the setup cell."
    )

subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q", "-e", str(PROJECT_ROOT)],
    check=True,
)

# Clear cached imports so the next `import tempdata...` executes the latest
# code. Only the package itself and its submodules are evicted; a bare
# startswith("tempdata") would also drop unrelated modules whose names merely
# share that prefix.
for mod_name in list(sys.modules):
    if mod_name == "tempdata" or mod_name.startswith("tempdata."):
        del sys.modules[mod_name]

# pip wrote files after this interpreter started. The import system's finders
# cache directory contents, so tell them to rescan before the next import.
importlib.invalidate_caches()

print("Installed/updated tempdata in editable mode")

Installed/updated tempdata in editable mode


# Fetch NOAA hourly data

Configure a station and date range, then run the fetcher.

In [30]:
from tempdata.fetch.noaa_hourly import fetch_noaa_hourly

# Station to fetch and the date window to cover.
STATION_ID = "KLGA"
START_DATE, END_DATE = "2024-01-01", "2024-02-01"  # END_DATE is exclusive

# Per-station locations under DATA_DIR for the fetcher's output and cache.
OUTPUT_DIR = DATA_DIR.joinpath("raw", "noaa_hourly", STATION_ID)
CACHE_DIR = DATA_DIR.joinpath("cache", "isd_csv", STATION_ID)

# Run the fetcher; the result is reported below as the parquet files written.
written = fetch_noaa_hourly(
    station_id=STATION_ID,
    start_date=START_DATE,
    end_date=END_DATE,
    out_dir=OUTPUT_DIR,
    cache_dir=CACHE_DIR,
)

# Header line followed by one bullet per written path.
print(
    f"Wrote {len(written)} parquet files:",
    *(f"  - {path}" for path in written),
    sep="\n",
)

[noaa] 2024: rows=1236 coverage=2024-01-01 00:00:00+00:00 -> 2024-01-31 23:51:00+00:00
Wrote 1 parquet files:
  - /content/temp-data-pipeline/data/raw/noaa_hourly/KLGA/2024.parquet


# Verify outputs

Load one parquet file to confirm the fetch results.

In [31]:
import pandas as pd
from tempdata.schemas import validate_hourly_obs

# Gather the parquet files present in OUTPUT_DIR, ordered by path.
parquet_files = sorted(OUTPUT_DIR.glob("*.parquet"))
if len(parquet_files) == 0:
    raise FileNotFoundError(f"No parquet files found in {OUTPUT_DIR}")

# Only the first file in sorted order is loaded for this check.
first_parquet = parquet_files[0]
df = pd.read_parquet(first_parquet)
print(f"Loaded {len(df)} rows from {first_parquet.name}")

# Schema check: raises if the frame is invalid. Key uniqueness is not
# required here (require_unique_keys=False).
validate_hourly_obs(df, require_unique_keys=False)
print("Schema validation passed")

# Preview the first rows, then the span of timestamps covered.
print(df.head())
obs_timestamps = df["ts_utc"]
print(f"Date range: {obs_timestamps.min()} to {obs_timestamps.max()}")

Loaded 1236 rows from 2024.parquet
Schema validation passed
                     ts_utc station_id       lat       lon  temp_c source  \
0 2024-01-01 00:00:00+00:00       KLGA  40.77945 -73.88027     6.1   noaa   
1 2024-01-01 00:51:00+00:00       KLGA  40.77945 -73.88027     6.1   noaa   
2 2024-01-01 01:51:00+00:00       KLGA  40.77945 -73.88027     6.1   noaa   
3 2024-01-01 02:51:00+00:00       KLGA  40.77945 -73.88027     6.1   noaa   
4 2024-01-01 03:00:00+00:00       KLGA  40.77945 -73.88027     6.1   noaa   

   qc_flags  
0         0  
1         0  
2         0  
3         0  
4         0  
Date range: 2024-01-01 00:00:00+00:00 to 2024-01-31 23:51:00+00:00


# Clean hourly data

Apply the cleaning pipeline to the fetched data:
- Validate input schema (early fail on malformed data)
- Sort and deduplicate by (ts_utc, station_id)
- Flag missing temperature values
- Flag and nullify out-of-range temperatures
- Detect hour-to-hour spikes

In [32]:
from tempdata.clean import clean_hourly_obs

# Run the cleaning pipeline on the hourly frame. Per the section notes it
# covers deduplication, missing-value flags, out-of-range handling, and
# spike detection.
df_clean = clean_hourly_obs(df)

# Report the cleaned frame's shape, then preview its first rows.
print(
    f"\nCleaned DataFrame shape: {df_clean.shape}",
    df_clean.head(),
    sep="\n",
)

[clean] Cleaning summary:
  Total rows: 1236 -> 1234 (2 duplicates removed)
  Rows with QC flags: 31
    QC_MISSING_VALUE: 31
  Temp range (valid): -7.2C to 15.0C

Cleaned DataFrame shape: (1234, 7)
                     ts_utc station_id       lat       lon  temp_c source  \
0 2024-01-01 00:00:00+00:00       KLGA  40.77945 -73.88027     6.1   noaa   
1 2024-01-01 00:51:00+00:00       KLGA  40.77945 -73.88027     6.1   noaa   
2 2024-01-01 01:51:00+00:00       KLGA  40.77945 -73.88027     6.1   noaa   
3 2024-01-01 02:51:00+00:00       KLGA  40.77945 -73.88027     6.1   noaa   
4 2024-01-01 03:00:00+00:00       KLGA  40.77945 -73.88027     6.1   noaa   

   qc_flags  
0         0  
1         0  
2         0  
3         0  
4         0  


# Aggregate to Daily Tmax

Convert cleaned hourly observations to daily maximum temperature (Tmax).

Key design principles:
- **Market-aligned**: Tmax is computed per station-local calendar day, not UTC
- **QC-aware**: Hours with `QC_OUT_OF_RANGE` are excluded from Tmax calculation
- **Spike-inclusive**: Spike-flagged values ARE included (to avoid removing real heat spikes)
- **Transparent**: Every day carries `coverage_hours` and propagated `qc_flags`

In [33]:
from tempdata.aggregate.build_daily_tmax import build_daily_tmax
from tempdata.schemas.daily_tmax import validate_daily_tmax

# KLGA is in US Eastern time; this zone is handed to the aggregation step.
STATION_TZ = "America/New_York"

# Aggregate the cleaned hourly observations into daily Tmax records.
df_daily = build_daily_tmax(df_clean, station_tz=STATION_TZ)

# Schema check on the aggregated frame; the success line prints only if it
# returns normally.
validate_daily_tmax(df_daily)
print("Daily Tmax schema validation passed")

# Row counts before and after aggregation, then the span of local dates.
print(f"\nAggregated {len(df_clean)} hourly obs -> {len(df_daily)} daily records")
local_dates = df_daily["date_local"]
print(f"Date range: {local_dates.min().date()} to {local_dates.max().date()}")

# Preview of the first ten days with the key columns only.
print("\nDaily Tmax summary:")
summary_columns = ["date_local", "tmax_c", "tmax_f", "coverage_hours", "qc_flags"]
print(df_daily[summary_columns].head(10))

Daily Tmax schema validation passed

Aggregated 1234 hourly obs -> 32 daily records
Date range: 2023-12-31 to 2024-01-31

Daily Tmax summary:
                 date_local  tmax_c  tmax_f  coverage_hours  qc_flags
0 2023-12-31 00:00:00-05:00     6.7    44.1               5        17
1 2024-01-01 00:00:00-05:00     7.2    45.0              24         1
2 2024-01-02 00:00:00-05:00     6.1    43.0              24         1
3 2024-01-03 00:00:00-05:00     6.1    43.0              24         1
4 2024-01-04 00:00:00-05:00     6.7    44.1              24         1
5 2024-01-05 00:00:00-05:00     3.3    37.9              24         1
6 2024-01-06 00:00:00-05:00     3.3    37.9              24         1
7 2024-01-07 00:00:00-05:00     4.0    39.2              24         1
8 2024-01-08 00:00:00-05:00     6.7    44.1              24         1
9 2024-01-09 00:00:00-05:00    13.9    57.0              24         1


# Coverage and Quality Analysis

Check data quality metrics for the aggregated daily Tmax.

In [34]:
from tempdata.schemas.qc_flags import QC_LOW_COVERAGE, QC_INCOMPLETE_DAY, QC_SPIKE_DETECTED

# --- Coverage: distribution of coverage_hours across days ---
print("Coverage Statistics:")
coverage = df_daily["coverage_hours"]
print(f"  Min coverage: {coverage.min()} hours")
print(f"  Max coverage: {coverage.max()} hours")
print(f"  Mean coverage: {coverage.mean():.1f} hours")
print(f"  Days with 24h coverage: {(coverage == 24).sum()}")

# --- QC flags: qc_flags is tested bitwise against each flag constant ---
print("\nQC Flag Analysis:")
daily_flags = df_daily["qc_flags"]
low_coverage_days = ((daily_flags & QC_LOW_COVERAGE) != 0).sum()
incomplete_days = ((daily_flags & QC_INCOMPLETE_DAY) != 0).sum()
spike_days = ((daily_flags & QC_SPIKE_DETECTED) != 0).sum()

for flag_name, day_count in (
    ("QC_LOW_COVERAGE", low_coverage_days),
    ("QC_INCOMPLETE_DAY", incomplete_days),
    ("QC_SPIKE_DETECTED", spike_days),
):
    print(f"  Days with {flag_name}: {day_count}")
print(f"  Days with no QC issues: {(daily_flags == 0).sum()}")

# --- Temperature: min / max / mean of daily Tmax in both units ---
print("\nTemperature Range:")
for label, stat in (("Min", "min"), ("Max", "max"), ("Mean", "mean")):
    tmax_c_stat = getattr(df_daily["tmax_c"], stat)()
    tmax_f_stat = getattr(df_daily["tmax_f"], stat)()
    print(f"  {label} Tmax: {tmax_c_stat:.1f}°C ({tmax_f_stat:.1f}°F)")

Coverage Statistics:
  Min coverage: 5 hours
  Max coverage: 24 hours
  Mean coverage: 23.2 hours
  Days with 24h coverage: 30

QC Flag Analysis:
  Days with QC_LOW_COVERAGE: 1
  Days with QC_INCOMPLETE_DAY: 0
  Days with QC_SPIKE_DETECTED: 0
  Days with no QC issues: 1

Temperature Range:
  Min Tmax: -3.9°C (25.0°F)
  Max Tmax: 15.0°C (59.0°F)
  Mean Tmax: 5.1°C (41.3°F)


# Save Daily Tmax

Write the daily Tmax data to parquet for downstream use (backtesting, model training, trading validation).

In [35]:
from tempdata.aggregate.build_daily_tmax import write_daily_tmax

# Daily Tmax goes to a single parquet file named after the station.
DAILY_TMAX_DIR = DATA_DIR.joinpath("clean", "daily_tmax")
DAILY_TMAX_DIR.mkdir(parents=True, exist_ok=True)
output_path = DAILY_TMAX_DIR / f"{STATION_ID}.parquet"

# Write with schema validation
write_daily_tmax(df_daily, output_path)

# The cleaned hourly observations are kept alongside for reference, in a
# per-station directory.
HOURLY_CLEAN_DIR = DATA_DIR.joinpath("clean", "hourly_obs", STATION_ID)
HOURLY_CLEAN_DIR.mkdir(parents=True, exist_ok=True)

# One parquet file per distinct year found in ts_utc.
obs_year = df_clean["ts_utc"].dt.year
years = obs_year.unique()
for year in years:
    year_df = df_clean.loc[obs_year == year]
    year_path = HOURLY_CLEAN_DIR / f"{year}.parquet"
    year_df.to_parquet(year_path, index=False)
    print(f"[clean] Wrote {len(year_df)} rows to {year_path}")

# Final recap of where everything was written.
print(
    "\nPipeline complete!",
    f"  Daily Tmax: {output_path}",
    f"  Cleaned hourly: {HOURLY_CLEAN_DIR}",
    sep="\n",
)

[aggregate] wrote 32 rows to /content/temp-data-pipeline/data/clean/daily_tmax/KLGA.parquet
[clean] Wrote 1234 rows to /content/temp-data-pipeline/data/clean/hourly_obs/KLGA/2024.parquet

Pipeline complete!
  Daily Tmax: /content/temp-data-pipeline/data/clean/daily_tmax/KLGA.parquet
  Cleaned hourly: /content/temp-data-pipeline/data/clean/hourly_obs/KLGA


# Verify Saved Data

Reload the saved parquet to confirm it was written correctly.

In [36]:
# Read the daily Tmax parquet back from disk.
df_verify = pd.read_parquet(output_path)

# Re-run the schema check on the reloaded frame before reporting success.
validate_daily_tmax(df_verify)
print(
    f"Reloaded {len(df_verify)} daily records from {output_path.name}",
    "Schema validation passed",
    sep="\n",
)

# Print every row (no index column) so the saved contents can be inspected.
print("\nDaily Tmax Data:")
print(df_verify.to_string(index=False))

Reloaded 32 daily records from KLGA.parquet
Schema validation passed

Daily Tmax Data:
               date_local station_id  tmax_c  tmax_f  coverage_hours   source  qc_flags                   updated_at_utc
2023-12-31 00:00:00-05:00       KLGA     6.7    44.1               5 noaa_isd        17 2026-01-19 05:45:23.888984+00:00
2024-01-01 00:00:00-05:00       KLGA     7.2    45.0              24 noaa_isd         1 2026-01-19 05:45:23.888984+00:00
2024-01-02 00:00:00-05:00       KLGA     6.1    43.0              24 noaa_isd         1 2026-01-19 05:45:23.888984+00:00
2024-01-03 00:00:00-05:00       KLGA     6.1    43.0              24 noaa_isd         1 2026-01-19 05:45:23.888984+00:00
2024-01-04 00:00:00-05:00       KLGA     6.7    44.1              24 noaa_isd         1 2026-01-19 05:45:23.888984+00:00
2024-01-05 00:00:00-05:00       KLGA     3.3    37.9              24 noaa_isd         1 2026-01-19 05:45:23.888984+00:00
2024-01-06 00:00:00-05:00       KLGA     3.3    37.9              

# Fetch Open-Meteo Forecast

Ingest daily maximum temperature (Tmax) forecasts from Open-Meteo for the same station.

This creates the **feature-side** dataset: "What did the forecast say at issue time about a target local date?"

Key concepts:
- **Issue time**: when the forecast was fetched (UTC)
- **Target date**: the station-local calendar date being forecasted
- **Lead hours**: hours from issue time to target date midnight in station timezone

In [37]:
from tempdata.fetch.openmeteo_daily_forecast import fetch_openmeteo_daily_tmax_forecast
from tempdata.schemas.daily_tmax_forecast import validate_daily_tmax_forecast

# Value passed to the fetcher as forecast_days.
FORECAST_DAYS = 14

# Per-station output directories: one for the raw payloads, one for parquet.
FORECAST_RAW_DIR = DATA_DIR.joinpath("raw", "forecasts", "openmeteo", STATION_ID)
FORECAST_CLEAN_DIR = DATA_DIR.joinpath("clean", "forecasts", "openmeteo", STATION_ID)

# Fetch the forecast for the same station used in the NOAA steps.
# write_raw=True keeps the raw JSON around for debugging.
forecast_files = fetch_openmeteo_daily_tmax_forecast(
    station_id=STATION_ID,
    out_raw_dir=FORECAST_RAW_DIR,
    out_parquet_dir=FORECAST_CLEAN_DIR,
    forecast_days=FORECAST_DAYS,
    write_raw=True,
)

# Header line followed by one bullet per written path.
print(
    f"Wrote {len(forecast_files)} files:",
    *(f"  - {path}" for path in forecast_files),
    sep="\n",
)

ModuleNotFoundError: No module named 'tempdata.fetch.openmeteo_daily_forecast'

# Verify Forecast Data

Load and validate the forecast parquet, then display a summary.

In [None]:
# Gather the forecast parquet files, ordered by path.
forecast_parquet_files = sorted(FORECAST_CLEAN_DIR.glob("*.parquet"))
if len(forecast_parquet_files) == 0:
    raise FileNotFoundError(f"No parquet files found in {FORECAST_CLEAN_DIR}")

# The last file in sorted order is treated as the most recent one.
latest_forecast_path = forecast_parquet_files[-1]
df_forecast = pd.read_parquet(latest_forecast_path)
print(f"Loaded {len(df_forecast)} forecast rows from {latest_forecast_path.name}")

# Schema check; the success line prints only if it returns normally.
validate_daily_tmax_forecast(df_forecast)
print("Schema validation passed")

# Summary: issue time of the first row, then the ranges of the key columns.
print("\nForecast Summary:")
print(f"  Issue time (UTC): {df_forecast['issue_time_utc'].iloc[0]}")
target_dates = df_forecast["target_date_local"]
print(f"  Target dates: {target_dates.min().date()} to {target_dates.max().date()}")
lead_hours = df_forecast["lead_hours"]
print(f"  Lead hours range: {lead_hours.min()} to {lead_hours.max()}")
for unit, column in (("C", "tmax_pred_c"), ("F", "tmax_pred_f")):
    predicted = df_forecast[column]
    print(f"  Tmax ({unit}): {predicted.min():.1f} to {predicted.max():.1f}")

# Full listing of the forecast rows, restricted to the key columns.
print("\nForecast Data:")
forecast_columns = ["target_date_local", "tmax_pred_c", "tmax_pred_f", "lead_hours"]
print(df_forecast[forecast_columns].to_string(index=False))

# Join Forecasts to Truth

Demonstrate joining forecasts to the truth dataset (`daily_tmax`) on `(station_id, target_date_local)`.

This is the foundation for:
- Model training (forecast features -> actual Tmax labels)
- Backtesting (compare predicted vs actual)
- Error analysis

In [None]:
# Forecast side: add a plain-date join key derived from target_date_local
# (time component stripped). assign() works on a copy, so df_forecast is
# left untouched.
df_forecast_join = df_forecast.assign(
    target_date=df_forecast["target_date_local"].dt.date
)

# Truth side: same treatment for date_local of the reloaded daily Tmax frame.
df_truth = df_verify.assign(target_date=df_verify["date_local"].dt.date)

# Inner join on (station_id, target_date); only the truth columns needed for
# comparison are carried over.
truth_columns = ["station_id", "target_date", "tmax_c", "tmax_f", "coverage_hours", "qc_flags"]
df_joined = df_forecast_join.merge(
    df_truth[truth_columns],
    on=["station_id", "target_date"],
    how="inner",
    suffixes=("_pred", "_actual"),
)

if len(df_joined) == 0:
    # Nothing matched: explain why and show both date ranges.
    print("No overlapping dates between forecast and truth data.")
    print("This is expected when forecasting future dates that haven't occurred yet.")
    print(f"\nForecast dates: {df_forecast['target_date_local'].min().date()} to {df_forecast['target_date_local'].max().date()}")
    print(f"Truth dates: {df_truth['target_date'].min()} to {df_truth['target_date'].max()}")
else:
    # Signed error is predicted minus actual; the absolute error is kept in
    # Celsius only.
    df_joined = df_joined.assign(
        error_c=df_joined["tmax_pred_c"] - df_joined["tmax_c"],
        error_f=df_joined["tmax_pred_f"] - df_joined["tmax_f"],
        abs_error_c=lambda joined: joined["error_c"].abs(),
    )

    print(f"Joined {len(df_joined)} forecast-truth pairs")
    print("\nForecast Error Summary:")
    print(f"  Mean Error (C): {df_joined['error_c'].mean():.2f}")
    abs_error = df_joined["abs_error_c"]
    print(f"  Mean Absolute Error (C): {abs_error.mean():.2f}")
    print(f"  Max Absolute Error (C): {abs_error.max():.2f}")

    print("\nJoined Data (pred vs actual):")
    cols = ["target_date", "lead_hours", "tmax_pred_c", "tmax_c", "error_c", "coverage_hours"]
    print(df_joined[cols].to_string(index=False))