# Setup

File setup for project paths and data directories.

In [5]:
from pathlib import Path
import sys
import os

# Detect Colab
IN_COLAB = "google.colab" in sys.modules or "COLAB_GPU" in os.environ

PROJECT_ROOT = None

if IN_COLAB:
    import subprocess
    colab_root = Path("/content/temp-data-pipeline")
    if not (colab_root / "pyproject.toml").exists():
        # Clone repo if not present
        subprocess.run(
            ["git", "clone", "https://github.com/kyler505/temp-data-pipeline.git", str(colab_root)],
            check=True,
        )
    else:
        # Pull latest changes
        subprocess.run(["git", "pull"], cwd=colab_root, check=True)
    PROJECT_ROOT = colab_root
else:
    # Local: search upward for pyproject.toml
    cwd = Path.cwd().resolve()
    for parent in [cwd] + list(cwd.parents):
        if (parent / "pyproject.toml").exists():
            PROJECT_ROOT = parent
            break
    # Fallback to common dev location
    if PROJECT_ROOT is None:
        candidate = Path.home() / "Documents" / "temp-data-pipeline"
        if (candidate / "pyproject.toml").exists():
            PROJECT_ROOT = candidate

if PROJECT_ROOT is None:
    raise FileNotFoundError("Could not find project root. Set PROJECT_ROOT manually.")

# Add to Python path
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
src_path = PROJECT_ROOT / "src"
if src_path.exists() and str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

DATA_DIR = PROJECT_ROOT / "data"
DATA_DIR.mkdir(parents=True, exist_ok=True)

print(f"Project root: {PROJECT_ROOT}")
print(f"Data dir: {DATA_DIR}")

Project root: /content/temp-data-pipeline
Data dir: /content/temp-data-pipeline/data


# Install packages

Install project dependencies in editable mode if needed.

In [6]:
import sys
import subprocess

try:
    import tempdata  # noqa: F401
    print("tempdata already importable")
except ModuleNotFoundError:
    if (PROJECT_ROOT / "pyproject.toml").exists():
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-e", str(PROJECT_ROOT)],
            check=True,
        )
        print("Installed project in editable mode")
    else:
        raise FileNotFoundError(
            f"pyproject.toml not found in {PROJECT_ROOT}. "
            "Update PROJECT_ROOT in the setup cell."
        )

tempdata already importable


# Fetch NOAA hourly data

Configure a station and date range, then run the fetcher.

In [7]:
from tempdata.fetch.noaa_hourly import fetch_noaa_hourly

STATION_ID = "KLGA"
START_DATE = "2024-01-01"
END_DATE = "2024-02-01"  # exclusive

OUTPUT_DIR = DATA_DIR / "raw" / "noaa_hourly" / STATION_ID
CACHE_DIR = DATA_DIR / "cache" / "isd_csv" / STATION_ID

written = fetch_noaa_hourly(
    station_id=STATION_ID,
    start_date=START_DATE,
    end_date=END_DATE,
    out_dir=OUTPUT_DIR,
    cache_dir=CACHE_DIR,
)

print(f"Wrote {len(written)} parquet files:")
for path in written:
    print(f"  - {path}")

[noaa] 2024: temp out of bounds (min=-72.0, max=150.0)
[noaa] 2024: rows=1236 coverage=2024-01-01 00:00:00+00:00 -> 2024-01-31 23:51:00+00:00
Wrote 1 parquet files:
  - /content/temp-data-pipeline/data/raw/noaa_hourly/KLGA/2024.parquet


# Verify outputs

Load one parquet file to confirm the fetch results.

In [8]:
import pandas as pd
from tempdata.schemas import validate_hourly_obs

parquet_files = sorted(OUTPUT_DIR.glob("*.parquet"))
if not parquet_files:
    raise FileNotFoundError(f"No parquet files found in {OUTPUT_DIR}")

df = pd.read_parquet(parquet_files[0])
print(f"Loaded {len(df)} rows from {parquet_files[0].name}")

# Validate schema (will raise if invalid)
validate_hourly_obs(df, require_unique_keys=False)
print("Schema validation passed")

print(df.head())
print(f"Date range: {df['ts_utc'].min()} to {df['ts_utc'].max()}")

ImportError: cannot import name 'validate_hourly_obs' from 'tempdata.schemas' (/content/temp-data-pipeline/src/tempdata/schemas/__init__.py)

# Cleaning data (placeholder)

Cleaning logic will be added here after data fetching is implemented.

# Aggregate daily tmax (placeholder)

Aggregation logic will be added here after cleaning is implemented.

# Export artifacts (placeholder)

Exports (CSV/Parquet) will be listed here once the pipeline outputs are finalized.