In [2]:
# Cell 1 — Imports & paths (robust)
import os, json, joblib
import pandas as pd
import numpy as np

# If you're running this notebook from /notebooks, these relative paths work.
PHASE5 = "../data/processed/phase5_dataset.parquet"
MODEL_XGB = "../models/solar_xgb.joblib"
MODEL_RIDGE = "../models/solar_ridge.joblib"
FEATURE_LIST_JSON = "../models/feature_list.json"
OUT = "../data/processed/suitability_solar.parquet"

os.makedirs(os.path.dirname(OUT), exist_ok=True)
print("CWD:", os.getcwd())
print("Expecting:", PHASE5, MODEL_XGB, MODEL_RIDGE, FEATURE_LIST_JSON)



CWD: /Users/jaidevreddy/Documents/Documents - Jaidev’s MacBook Pro/projects/blr-renewables/notebooks
Expecting: ../data/processed/phase5_dataset.parquet ../models/solar_xgb.joblib ../models/solar_ridge.joblib ../models/feature_list.json


In [3]:
# Cell 2 — Load Phase 5 dataset
df = pd.read_parquet(PHASE5).copy()
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values(["ZoneID","date"]).reset_index(drop=True)

print("Phase5 shape:", df.shape)
display(df.head(3))


Phase5 shape: (212073, 23)


Unnamed: 0,ZoneID,date,energy_kwh,year,month,doy,dow,is_weekend,roll7_mean,roll7_std,...,anom_month_kwh,y_lag1,y_lag7,y_lag14,y_mean_7d,y_std_7d,y_min_7d,y_max_7d,sin_doy,cos_doy
0,BLR-0001,2023-01-02,57.866284,2023,1,2,0,False,,,...,7.050867,,,,,,,,0.034398,0.999408
1,BLR-0001,2023-01-03,53.82878,2023,1,3,1,False,,,...,3.013362,57.866284,,,,,,,0.051584,0.998669
2,BLR-0001,2023-01-04,54.480626,2023,1,4,2,False,55.391897,1.769778,...,3.665208,53.82878,,,,,,,0.068755,0.997634


In [4]:
# Cell 3 — Determine TARGET and infer features if feature_list.json is missing
TARGET = "energy_kwh"

# Ensure ../models directory exists
os.makedirs(os.path.dirname(FEATURE_LIST_JSON), exist_ok=True)

# Try loading saved feature list; if missing, infer from df.
FEATS = None
if os.path.exists(FEATURE_LIST_JSON):
    FEATS = json.load(open(FEATURE_LIST_JSON))
    print("Loaded feature list from JSON. #features =", len(FEATS))
else:
    # Infer: numeric columns excluding keys + target
    non_feats = {"ZoneID", "date", TARGET}
    FEATS = [c for c in df.columns 
             if c not in non_feats and pd.api.types.is_numeric_dtype(df[c])]
    print("Inferred feature list from Phase 5 dataset. #features =", len(FEATS))
    # Save for later convenience
    with open(FEATURE_LIST_JSON, "w") as f:
        json.dump(FEATS, f, indent=2)
        print("Saved inferred feature list to", FEATURE_LIST_JSON)



Loaded feature list from JSON. #features = 20


In [5]:
# Cell 4 — Load a model if available (XGB preferred, else Ridge), else fall back
model_path = None
model = None

if os.path.exists(MODEL_XGB):
    model_path = MODEL_XGB
elif os.path.exists(MODEL_RIDGE):
    model_path = MODEL_RIDGE

if model_path:
    model = joblib.load(model_path)
    print("Loaded model:", os.path.basename(model_path))
else:
    print("No trained model found. Will fall back to baseline suitability from historical target.")


Loaded model: solar_xgb.joblib


In [6]:
# Cell 5 — Predict daily y_hat IF model exists; otherwise use historical target as proxy
df_use = df.copy()

if model is not None:
    # Use model predictions
    missing_feats = [c for c in FEATS if c not in df_use.columns]
    if missing_feats:
        raise ValueError(f"Missing required features in Phase 5 dataset: {missing_feats}")
    df_use["y_hat"] = model.predict(df_use[FEATS])
    src = "model_pred"
else:
    # Baseline suitability from the actuals in Phase 5 (historical production)
    if TARGET not in df_use.columns:
        raise ValueError(f"TARGET column '{TARGET}' not found in Phase 5 dataset.")
    df_use["y_hat"] = df_use[TARGET].astype(float)
    src = "historical_target"

print("Using y_hat source:", src)
display(df_use[["ZoneID","date","y_hat"]].head(3))

Using y_hat source: model_pred


Unnamed: 0,ZoneID,date,y_hat
0,BLR-0001,2023-01-02,57.790169
1,BLR-0001,2023-01-03,53.879498
2,BLR-0001,2023-01-04,54.490379


In [7]:
# Cell 6 — Aggregate to annual kWh per zone (multi-year average if multiple years)
annual = (df_use
          .assign(year=df_use["date"].dt.year)
          .groupby(["ZoneID","year"])["y_hat"].sum()
          .groupby("ZoneID").mean()  # average across years if >1
          .rename("annual_kwh")
          .reset_index())

print("Annual kWh per zone:", annual.shape)
display(annual.head(5))


Annual kWh per zone: (223, 2)


Unnamed: 0,ZoneID,annual_kwh
0,BLR-0001,14691.605469
1,BLR-0002,14692.210938
2,BLR-0003,14693.567383
3,BLR-0004,14694.875
4,BLR-0005,14697.036133


In [8]:
# Cell 7 — Normalize to a 0–100 suitability score (percentile bins)
q = np.quantile(annual["annual_kwh"], np.linspace(0, 1, 101))  # 101 edges -> scores 0..100

def to_pct01(x, q_edges):
    # which percentile bin (0..100) does x fall in?
    return int(np.searchsorted(q_edges, x, side="right") - 1)

annual["score_0_100"] = annual["annual_kwh"].apply(lambda v: to_pct01(v, q))
annual["score_source"] = src  # 'model_pred' or 'historical_target'

display(annual.head(10))


Unnamed: 0,ZoneID,annual_kwh,score_0_100,score_source
0,BLR-0001,14691.605469,0,model_pred
1,BLR-0002,14692.210938,0,model_pred
2,BLR-0003,14693.567383,0,model_pred
3,BLR-0004,14694.875,1,model_pred
4,BLR-0005,14697.036133,4,model_pred
5,BLR-0006,14697.797852,5,model_pred
6,BLR-0007,14695.71875,2,model_pred
7,BLR-0008,14696.729492,3,model_pred
8,BLR-0009,14820.323242,64,model_pred
9,BLR-0010,14694.479492,1,model_pred


In [9]:
# Cell 8 — Save
annual.to_parquet(OUT, index=False)
print("Saved:", OUT)


Saved: ../data/processed/suitability_solar.parquet
