# Temperature Model Development

This notebook orchestrates the training and evaluation of temperature forecasting models.

**Goals:**
1.  **Baseline comparison**: Evaluate the Persistence and kNN models against the current Ridge baseline.
2.  **Model development**: Iteratively develop and tune advanced models (XGBoost, Random Forest, LSTM).
3.  **Ensembling**: Combine model outputs to improve performance.

In [None]:
import os
import sys
from pathlib import Path
from types import ModuleType
import importlib

# --- 1. Mount Google Drive ---
try:
    from google.colab import drive
    drive.mount('/content/drive')
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

# --- 2. Setup Workspace ---
if IN_COLAB:
    REPO_NAME = "temp-data-pipeline"
    WORKSPACE_DIR = Path(f"/content/drive/MyDrive/{REPO_NAME}")
    REPO_URL = f"https://github.com/kyler505/{REPO_NAME}.git"

    # Clone if missing
    if not WORKSPACE_DIR.exists():
        print(f"Cloning {REPO_NAME} to Drive...")
        !git clone {REPO_URL} {str(WORKSPACE_DIR)}

    os.chdir(WORKSPACE_DIR)
    if str(WORKSPACE_DIR) not in sys.path:
        sys.path.insert(0, str(WORKSPACE_DIR))
else:
    # Local development setup
    cwd = Path.cwd().resolve()
    project_root = None
    for parent in [cwd] + list(cwd.parents):
        if (parent / "pyproject.toml").exists():
            project_root = parent
            break
    if project_root:
        os.chdir(project_root)
        if str(project_root) not in sys.path:
            sys.path.insert(0, str(project_root))
        print(f"Local environment ready: {project_root}")

# --- 3. Python 3.12 Compatibility Shim (imp module) ---
try:
    import imp
except ImportError:
    print("Applying 'imp' module shim...")
    imp_shim = ModuleType("imp")
    imp_shim.reload = importlib.reload
    sys.modules["imp"] = imp_shim

# --- 4. Run Bootstrap ---
try:
    from tempdata.utils.colab import bootstrap
    bootstrap(use_wandb=True)
except ImportError:
    print("Failed to import bootstrap utility. Ensure repo is on sys.path.")

print("Environment initialization complete.")

## 1. Setup & Data Loading

In [2]:
# Base evaluation config reused by later cells: one station, a fixed local
# date range, and a static 70% / 15% / 15% train / val / test split.
# NOTE(review): EvalConfig, SplitConfig, STATION, START_DATE and END_DATE are
# not defined in the cells visible here — confirm they are imported/defined
# before this cell runs on a fresh kernel.
base_config = EvalConfig(
    run_name="dev_baseline",
    station_ids=[STATION],
    start_date_local=START_DATE,
    end_date_local=END_DATE,
    split=SplitConfig(
        type="static",
        train_frac=0.7,
        val_frac=0.15,
        test_frac=0.15
    )
)

# Data loading (this might take a moment).
#
# A full run would use the load_data helper in scripts/eval_daily_tmax.py, but
# that logic lives in the script and cannot be reused here without copying it,
# and EvalConfig does not hold data paths. Instead, the helper defined below
# reads the parquet files directly from the standard data_root() layout and
# relies on the library for joining, feature engineering and splitting.

from tempdata.config import data_root

def load_standard_data(station, start_date, end_date):
    """Load the truth and forecast parquet data for one station.

    This is a simplified version of the CLI loader. Truth is read from
    ``<data_root>/clean/daily_tmax/<station>``. Forecasts are read from
    ``<data_root>/clean/forecasts/openmeteo/<station>``, falling back to the
    matching ``raw`` directory when the clean one does not exist.

    Args:
        station: Station identifier used as the final path component.
        start_date: Currently unused; every parquet file found is loaded.
        end_date: Currently unused; every parquet file found is loaded.

    Returns:
        Tuple ``(forecast_df, truth_df)`` of concatenated DataFrames.

    Raises:
        FileNotFoundError: If a directory is missing or has no ``*.parquet``
            files (previously this surfaced as pandas' generic
            "No objects to concatenate" ValueError).
    """
    def _read_parquet_dir(directory, label):
        # Concatenate every parquet file in `directory`, in sorted path order.
        files = sorted(directory.glob("*.parquet"))
        if not files:
            raise FileNotFoundError(
                f"No {label} parquet files found in {directory}"
            )
        return pd.concat([pd.read_parquet(f) for f in files])

    root = data_root()

    # Truth
    truth_df = _read_parquet_dir(root / "clean" / "daily_tmax" / station, "truth")

    # Forecast (Open-Meteo for recent years); prefer clean data, else raw.
    fc_dir = root / "clean" / "forecasts" / "openmeteo" / station
    if not fc_dir.exists():
        fc_dir = root / "raw" / "forecasts" / "openmeteo" / station
    forecast_df = _read_parquet_dir(fc_dir, "forecast")

    return forecast_df, truth_df

# Load the raw frames and build the split dataset used by every later cell.
try:
    forecast_raw, truth_raw = load_standard_data(STATION, START_DATE, END_DATE)
    print(f"Loaded {len(forecast_raw)} forecast rows and {len(truth_raw)} truth rows.")

    # Process into Dataset object (handles joining, feature engineering, splitting)
    dataset = load_eval_data(
        config=base_config,
        forecast_df=forecast_raw,
        truth_df=truth_raw
    )
    print(f"Dataset ready. Train: {len(dataset.train)}, Val: {len(dataset.val)}, Test: {len(dataset.test)}")

except Exception as e:
    print(f"Error loading data: {e}")
    print("Ensure you have run the data pipeline (notebooks/temp_data_pipeline.ipynb) to populate data/clean.")
    # Stop here instead of continuing: every later cell needs `dataset`,
    # `forecast_raw` and `truth_raw`, and carrying on would either fail with a
    # NameError or silently reuse stale values from an earlier kernel run.
    raise

# Initialize global model registry
model_configs = {}

Loaded 5354 forecast rows and 5718 truth rows.
Dataset ready. Train: 2557, Val: 548, Test: 549


## 2. Baseline Models Experimentation

We will evaluate three baseline approaches:
1.  **Persistence**: Tomorrow = Today.
2.  **Ridge**: Simple linear correction of forecast.
3.  **kNN**: Nearest neighbors in feature space.

In [3]:
# Baseline line-up as (display label, model configuration) pairs.
models_to_test = [
    ("Persistence", ModelConfig(type="persistence")),
    ("Ridge", ModelConfig(type="ridge", alpha=1.0)),
    ("kNN (k=50)", ModelConfig(type="knn")),
]


def _baseline_eval_config(label, model_conf):
    """Build the EvalConfig for one baseline, sharing the base split and dates."""
    slug = label.lower().replace(" ", "_")
    return EvalConfig(
        run_name="dev_" + slug,
        station_ids=[STATION],
        start_date_local=START_DATE,
        end_date_local=END_DATE,
        split=base_config.split,
        model=model_conf,
    )


# Register every baseline under its display label.
for name, model_conf in models_to_test:
    print(f"Adding {name} to registry...")
    model_configs[name] = _baseline_eval_config(name, model_conf)

print(f"Current models: {list(model_configs)}")

# Evaluate all registered models against the same raw inputs.
print("Running baseline evaluation...")
multimodel_result = run_multi_model_evaluation(
    configs=model_configs, forecast_df=forecast_raw, truth_df=truth_raw, verbose=True
)
results = multimodel_result.results
print("Baseline evaluation complete.")


Adding Persistence to registry...
Adding Ridge to registry...
Adding kNN (k=50) to registry...
Current models: ['Persistence', 'Ridge', 'kNN (k=50)']
Running baseline evaluation...

MULTI-MODEL EVALUATION: 20260121_173246
Models: Persistence, Ridge, kNN (k=50)


--- Evaluating Model: Persistence ---

[eval] Fitting model: persistence
[eval] Generating predictions...
[eval] Computing metrics...

EVALUATION METRICS SUMMARY

--- FORECAST PERFORMANCE ---
  Samples:     549
  MAE:         2.57°F
  RMSE:        4.74°F
  Bias:        +0.00°F
  Std Error:   4.74°F

--- CALIBRATION ---
  Mean σ:      5.61°F
  50% PI cov:  71.8% (target: 50%)
  80% PI cov:  88.0% (target: 80%)
  90% PI cov:  93.8% (target: 90%)
  90% width:   18.5°F


--- Evaluating Model: Ridge ---

[eval] Fitting model: ridge
[eval] Generating predictions...
[eval] Computing metrics...

EVALUATION METRICS SUMMARY

--- FORECAST PERFORMANCE ---
  Samples:     549
  MAE:         1.15°F
  RMSE:        1.40°F
  Bias:        +0.73°F

## 3. Advanced Models

Planned models for implementation:
- **XGBoost**: Gradient boosted trees for non-linear bias correction.
- **Random Forest**: Ensemble of trees for robustness.
- **LSTM/GRU**: Sequential models to capture temporal dynamics.

### 3.1 XGBoost: Initial Analysis (Stabilization)

We start by training a default XGBoost model with Early Stopping to check stability, learning curves, and feature importance.

In [None]:
from tempdata.eval.models import XGBoostForecaster
from xgboost import plot_importance
from sklearn.model_selection import ParameterGrid

print("Initializing XGBoost...")

# Starting hyperparameters; later cells copy and override these.
xgb_params_init = dict(
    objective="reg:squarederror",
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    early_stopping_rounds=50,
    tree_method="hist",
)

# Library wrapper around the booster; the validation split is passed as the
# eval_set so that early stopping is evaluated on it.
xgb_model = XGBoostForecaster(hyperparams=xgb_params_init)
xgb_model.fit(dataset.train, eval_set=[dataset.val], verbose=False)

print(f"Initial Best Score (RMSE): {xgb_model.model.best_score:.4f}")

In [None]:
# --- Diagnostics: Learning Curves & Importance ---
# Validation metric history recorded by the fitted booster during training.
results_dict = xgb_model.model.evals_result()
val_rmse = results_dict['validation_0']['rmse']
epochs = len(val_rmse)
x_axis = range(0, epochs)

# Learning curve on the validation set.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(x_axis, val_rmse, label='Validation RMSE')
ax.set_title('XGBoost Learning Curve (Initial)')
ax.set_ylabel('RMSE')
ax.set_xlabel('Epochs')
ax.legend()
plt.show()

# Feature importance. plot_importance creates its own figure when no `ax` is
# given, which left the previously created (10, 6) figure blank and ignored
# the requested size; passing `ax` draws into the figure we sized.
fig, ax = plt.subplots(figsize=(10, 6))
plot_importance(xgb_model.model, ax=ax, max_num_features=20, title='Feature Importance (Initial)')
plt.show()

### 3.2 XGBoost: Hyperparameter Tuning

In [None]:
print("\n[Tuning] Running Grid Search...")

# Small exhaustive grid; every other hyperparameter stays at its initial value.
param_grid = dict(
    max_depth=[4, 6],
    learning_rate=[0.05, 0.1],
    subsample=[0.7, 0.9],
)

# Track the lowest validation score seen so far, starting from the initial
# parameters so `best_params` is always usable.
best_score, best_params = float("inf"), xgb_params_init.copy()

for params in ParameterGrid(param_grid):
    # Overlay this grid point on top of the initial parameters.
    current_params = {**xgb_params_init, **params}

    temp_model = XGBoostForecaster(hyperparams=current_params)
    temp_model.fit(dataset.train, eval_set=[dataset.val], verbose=False)

    score = temp_model.model.best_score
    if not score < best_score:
        continue

    best_score, best_params = score, current_params
    print(f"New best: {score:.4f} with {params}")

print(f"Best Params: {best_params}")

### 3.3 XGBoost: Final Training & Integration

In [None]:
from tempdata.eval.metrics import compute_forecast_metrics, EvalMetrics

print("\n[Final] Training Final Model...")
final_xgb = XGBoostForecaster(hyperparams=best_params)
final_xgb.fit(dataset.train, eval_set=[dataset.val], verbose=False)

# --- Integration ---
# Score the held-out test split and shape the output like the library's
# prediction frames (y_pred_f / y_true_f columns).
preds_test = final_xgb.predict_mu(dataset.test)

xgb_preds_df = dataset.test.copy()
xgb_preds_df["y_pred_f"] = preds_test
xgb_preds_df["y_true_f"] = dataset.test["tmax_actual_f"]

# Config describing this manual run; mirrors base_config except for the model.
xgb_config = EvalConfig(
    run_name="dev_xgboost_tuned",
    station_ids=base_config.station_ids,
    start_date_local=base_config.start_date_local,
    end_date_local=base_config.end_date_local,
    split=base_config.split,
    model=ModelConfig(type="xgboost", hyperparams=best_params),
)

# Forecast metrics only; calibration and slices are not computed here.
f_met = compute_forecast_metrics(xgb_preds_df)
xgb_metrics = EvalMetrics(forecast=f_met, calibration=None, slices=None)

# Make sure a `results` registry exists even if the baseline cell was skipped.
globals().setdefault("results", {})

results["XGBoost"] = EvalResult(
    run_id="manual_xgb",
    config=xgb_config,
    predictions_df=xgb_preds_df,
    metrics=xgb_metrics,
    artifacts={},
)
print("XGBoost integration complete.")

In [None]:
# Placeholder cell: no Random Forest model is trained yet; it only prints a
# status message.
# TODO: Implement RandomForestForecaster
# Config placeholder (intentionally commented out until the model exists):
# rf_config = ModelConfig(type="random_forest", hyperparams={"n_estimators": 100})

print("Random Forest implementation pending...")

In [None]:
# Placeholder cell: no sequence model is trained yet; it only prints a status
# message.
# TODO: Implement LSTM/GRU using PyTorch or TensorFlow
# These will likely need a different data loader to handle sequences/windows

print("Deep Learning models (LSTM/GRU) implementation pending...")

## 4. Ensembling Experiment

Combine predictions from the best models.

Strategy for a simple average ensemble:
1. Collect predictions from multiple models on the *same* test set.
2. Average `y_pred_f` columns.
3. Compute metrics on averaged prediction.

In [None]:
# Simple average ensemble of the Ridge and kNN test-set predictions, registered
# alongside the multi-model run so it appears in the comparison summary.
ENSEMBLE_MEMBERS = ("Ridge", "kNN (k=50)")

# Check for the specific models we average. A bare `len(results) >= 2` check
# would pass with e.g. Ridge + XGBoost and then fail with a KeyError below.
missing_members = [member for member in ENSEMBLE_MEMBERS if member not in results]

if missing_members:
    print(f"Skipping ensemble: no results found for {missing_members}.")
else:
    from tempdata.eval.metrics import compute_forecast_metrics, EvalMetrics

    print("Simulating Ensemble (Ridge + kNN)...")

    # Use results from the multi-model run
    p1_df = results["Ridge"].predictions_df
    p2_df = results["kNN (k=50)"].predictions_df

    # Align indices (just to be safe, though they should match)
    common_idx = p1_df.index.intersection(p2_df.index)
    p1 = p1_df.loc[common_idx, "y_pred_f"]
    p2 = p2_df.loc[common_idx, "y_pred_f"]
    y_true = p1_df.loc[common_idx, "y_true_f"]

    # Simple Average
    ensemble_pred = (p1 + p2) / 2

    # Calculate Metrics
    ens_error = ensemble_pred - y_true
    ens_mae = ens_error.abs().mean()
    ens_rmse = (ens_error ** 2).mean() ** 0.5
    ens_bias = ens_error.mean()
    print(f"Ensemble MAE: {ens_mae:.4f}, RMSE: {ens_rmse:.4f}")
    print(f"Ensemble Bias: {ens_bias:+.4f}")

    # Create Ensemble Artifacts
    # 1. Create Predictions DataFrame
    # Keep only rows present in both models. Copying the full Ridge frame would
    # leave Ridge-only predictions on unmatched rows and skew the metrics.
    ens_preds_df = p1_df.loc[common_idx].copy()
    ens_preds_df["y_pred_f"] = ensemble_pred
    # Note: Sigma/LeadHours/etc are copied from p1 which is fine for now

    # 2. Compute full metrics object (forecast metrics only)
    f_met = compute_forecast_metrics(ens_preds_df)
    ens_metrics = EvalMetrics(forecast=f_met, calibration=None, slices=None)

    # 3. Save Artifacts for Ensemble
    ens_config = EvalConfig(
        run_name="ensemble_avg",
        station_ids=[STATION],  # Reuse global
        start_date_local=START_DATE,
        end_date_local=END_DATE,
        split=base_config.split,
        model=ModelConfig(type="ensemble_mean")
    )

    print("Saving Ensemble artifacts...")
    artifacts = write_model_artifacts(
        config=ens_config,
        metrics=ens_metrics,
        predictions_df=ens_preds_df,
        run_dir=multimodel_result.run_path,
        model_name="Ensemble"
    )

    # 4. Update Result and Comparison
    # Add to results dict
    results["Ensemble"] = EvalResult(
        run_id=multimodel_result.run_id,
        config=ens_config,
        predictions_df=ens_preds_df,
        metrics=ens_metrics,
        artifacts=artifacts
    )

    # Re-generate comparison summary so it includes the ensemble
    updated_comparison = write_comparison_summary(
        results=results,
        run_dir=multimodel_result.run_path
    )

    print("Updated Comparison:")
    print(pd.DataFrame(updated_comparison["models"]).set_index("model").sort_values("mae"))

    # Visualize the first 100 aligned test rows
    subset = slice(0, 100)  # First 100 days
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(y_true.iloc[subset].values, label="Actual", color="black", alpha=0.5)
    ax.plot(p1.iloc[subset].values, label="Ridge", alpha=0.7)
    ax.plot(p2.iloc[subset].values, label="kNN", alpha=0.7)
    ax.plot(ensemble_pred.iloc[subset].values, label="Ensemble", linestyle="--", color="red")
    ax.legend()
    ax.set_title("Model Comparison (First 100 Test Days)")
    plt.show()
