In [3]:
import pandas as pd
import numpy as np
from scipy.optimize import nnls
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# ---------- Load & index ----------
# Read the hourly consumption table and derive the calendar fields
# (date, hour, day-of-week, weekend flag, month) plus the submission key.
df = pd.read_csv("kaggle_clustering_student_version.csv")
df["timestamp"] = pd.to_datetime(df["timestamp"])
df = df.assign(
    date=df["timestamp"].dt.date,
    hour=df["timestamp"].dt.hour,
    dow=df["timestamp"].dt.dayofweek,
)
df = df.assign(
    is_weekend=df["dow"] >= 5,          # dayofweek: Monday=0 ... Sunday=6
    month=df["timestamp"].dt.month,
    kaggle_id=df["household_id"].astype(str) + "_" + df["timestamp"].dt.strftime("%Y-%m-%d %H"),
)

# ---------- Handy constants ----------
# Hour-of-day windows (0-23) used by the baseline, EV and heating heuristics.
NIGHT_HOURS = list(range(2, 6))          # used for the always-on baseline
EV_WD_HOURS = [*range(20, 24), 0, 1]     # weekday EV window (wraps past midnight)
EV_WE_HOURS = list(range(11, 17))        # weekend EV window
HEAT_MORN   = list(range(6, 10))         # fallback heating bump, morning
HEAT_EVE    = list(range(17, 22))        # fallback heating bump, evening
MAY_MONTH   = 5                          # month treated as "no heating"

def safe_pivot(pdf, value_col):
    """Build a date-by-hour matrix of ``value_col``.

    Rows sharing the same (date, hour) are averaged, then the table is
    reshaped to one row per date and one column per hour. Columns are forced
    to the full 0..23 range, so hours that never occur appear as all-NaN
    columns. Days with missing hours are kept (as NaN cells); callers that
    need complete days have to drop them themselves.
    """
    hourly_mean = pdf.groupby(["date", "hour"], as_index=False)[value_col].mean()
    wide = hourly_mean.pivot(index="date", columns="hour", values=value_col)
    all_hours = range(24)
    return wide.reindex(columns=all_hours)

In [4]:
# Candidate night-time quantiles tried for each household's always-on baseline;
# the one with the smallest *night* residual MSE is kept.
BASELINE_QS = [0.10, 0.15, 0.20]

def pick_baseline_for_house(pdf, quantiles=None, night_hours=None):
    """Choose an always-on baseline (kWh per hour) for one household.

    Parameters
    ----------
    pdf : DataFrame
        Rows of a single household; must contain ``hour`` and
        ``consumption_kWh``.
    quantiles : sequence of float, optional
        Candidate quantiles. Defaults to the module-level ``BASELINE_QS``.
    night_hours : sequence of int, optional
        Hours treated as night. Defaults to the module-level ``NIGHT_HOURS``.

    Returns
    -------
    pandas.Series
        ``baseline_kWh`` (floored at 0.01) and the winning quantile ``q``.
        Both are NaN when the household has no finite consumption value.

    Notes
    -----
    Non-finite readings are ignored. Previously a single NaN made
    ``np.quantile`` return NaN, so no candidate was ever accepted and the
    function returned ``None`` for both fields.

    NOTE(review): the error ``mean(clip(night - b, 0)**2)`` can only shrink as
    ``b`` grows, so the largest candidate quantile always attains the minimum
    (ties keep the smaller quantile because the comparison is strict).
    """
    if quantiles is None:
        quantiles = BASELINE_QS
    if night_hours is None:
        night_hours = NIGHT_HOURS

    # The night sample does not depend on q, so build it once outside the loop.
    load = pdf["consumption_kWh"].to_numpy(dtype=float)
    night_vals = load[pdf["hour"].isin(night_hours).to_numpy()]
    night_vals = night_vals[np.isfinite(night_vals)]
    if len(night_vals) < 4:
        # Too few night readings: fall back to every finite reading.
        night_vals = load[np.isfinite(load)]
    if len(night_vals) == 0:
        return pd.Series({"baseline_kWh": np.nan, "q": np.nan})

    best_q, best_b, best_err = None, None, np.inf
    for q in quantiles:
        b = max(float(np.quantile(night_vals, q)), 0.01)
        err = np.mean(np.clip(night_vals - b, 0, None) ** 2)  # night residual MSE
        if err < best_err:
            best_q, best_b, best_err = q, b, err
    return pd.Series({"baseline_kWh": best_b, "q": best_q})

# Fit one baseline per household. Only the two columns the picker reads are
# handed to apply(), so the grouping column is not part of the frame passed to
# the function (pandas warns about, and is removing, that older behaviour).
baseline_df = (
    df.groupby("household_id")[["hour", "consumption_kWh"]]
      .apply(pick_baseline_for_house)
      .reset_index()
)
baseline_map = dict(zip(baseline_df["household_id"], baseline_df["baseline_kWh"]))

# Always-on load is constant per household; the residual is whatever exceeds
# it, floored at zero for hours where consumption is below the baseline.
df["always_on_kWh"] = df["household_id"].map(baseline_map)
df["residual_kWh"]  = (df["consumption_kWh"] - df["always_on_kWh"]).clip(lower=0.0)

  baseline_df = df.groupby("household_id").apply(pick_baseline_for_house).reset_index()


In [5]:
# 1) Distribution of baselines
baseline_summary = baseline_df["baseline_kWh"].describe()
print("Baseline describe:\n", baseline_summary)

# 2) Night residual should be small (median ~0.0–0.05)
is_night = df["hour"].isin(NIGHT_HOURS)
night = df[is_night]
night_resid = night["consumption_kWh"].sub(night["always_on_kWh"]).clip(lower=0)
print("Night residual kWh — mean:", night_resid.mean(), " median:", night_resid.median())

# 3) Sanity: share of hours where baseline > consumption (should be tiny)
below_baseline = df["consumption_kWh"].lt(df["always_on_kWh"])
neg_share = below_baseline.mean()
print("Share of hours with baseline > total (should be < 1%):", f"{100*neg_share:.2f}%")

Baseline describe:
 count    200.000000
mean       0.235872
std        0.092814
min        0.078180
25%        0.167225
50%        0.225470
75%        0.285400
max        0.559780
Name: baseline_kWh, dtype: float64
Night residual kWh — mean: 0.37791210271739134  median: 0.2062
Share of hours with baseline > total (should be < 1%): 16.74%


In [6]:
def window_stats(pdf, hours):
    """Summarise residual load inside an hour window, averaged over days.

    Returns a length-6 array: mean and std of the daily window sum, mean of
    the daily 95th percentile, mean of the daily maximum, and the mean number
    of window hours per day above 1.2 and above 2.0 kWh. Missing cells count
    as 0. An empty input yields six zeros.
    """
    day_by_hour = safe_pivot(pdf, "residual_kWh")
    if day_by_hour is None or day_by_hour.empty:
        return np.zeros(6)

    window = day_by_hour[hours].fillna(0.0).to_numpy()
    daily_energy = window.sum(axis=1)
    daily_p95 = np.percentile(window, 95, axis=1)
    daily_peak = window.max(axis=1)
    # Fixed kWh levels; the original author notes they suit this synthetic set.
    hours_over_1p2 = (window > 1.2).sum(axis=1)
    hours_over_2p0 = (window > 2.0).sum(axis=1)

    return np.array([
        daily_energy.mean(),
        daily_energy.std(),
        daily_p95.mean(),
        daily_peak.mean(),
        hours_over_1p2.mean(),
        hours_over_2p0.mean(),
    ])

def ev_feature_row(pdf):
    """Build the 13-element EV feature vector for one household.

    Layout: six ``window_stats`` values for weekdays over ``EV_WD_HOURS``,
    six for weekends over ``EV_WE_HOURS``, then the evening-minus-midday
    contrast of the mean residual (0.0 when it is not finite).
    """
    weekend_mask = pdf["is_weekend"]
    f_wd = window_stats(pdf[~weekend_mask], EV_WD_HOURS)
    f_we = window_stats(pdf[weekend_mask], EV_WE_HOURS)

    # Evening window reuses EV_WD_HOURS instead of a second hard-coded copy of
    # the same list, so the two cannot drift apart.
    eve = pdf.loc[pdf["hour"].isin(EV_WD_HOURS), "residual_kWh"].mean()
    day = pdf.loc[pdf["hour"].isin([10,11,12,13,14,15]), "residual_kWh"].mean()
    diff = eve - day
    contrast = diff if np.isfinite(diff) else 0.0
    return np.concatenate([f_wd, f_we, [contrast]])

# Build feature matrix: one EV feature row per household
rows, ids = [], []
for hid, pdf in df.groupby("household_id"):
    ids.append(hid)
    rows.append(ev_feature_row(pdf))
X = np.vstack(rows)
sc = StandardScaler()
Xz = sc.fit_transform(X)

# KMeans(2) on the standardized features; the cluster whose members have the
# higher mean evening contrast (last feature, unscaled) is labelled "EV".
km = KMeans(n_clusters=2, n_init=50, random_state=42).fit(Xz)
labels = km.labels_
grp_eve = []
for g in (0, 1):
    members = labels == g
    # an empty cluster gets a sentinel so it can never win the argmax
    grp_eve.append(X[members][:, -1].mean() if members.any() else -1e9)
ev_label = int(np.argmax(grp_eve))
ev_owner = dict(zip(ids, labels == ev_label))
df["is_ev_owner"] = df["household_id"].map(ev_owner)
n_owners = df.groupby("household_id")["is_ev_owner"].first().sum()
print("EV owners detected:", int(n_owners))

EV owners detected: 55


In [7]:
# 1) EV owners share (10–40% plausible depending on synthetic set)
owner_flags = list(ev_owner.values())
ev_share = np.mean(owner_flags)
print(f"EV owner share: {100*ev_share:.1f}%")

# 2) Peak timing check: EV owners should peak at WD evenings and WE midday
ev_df = df[df["is_ev_owner"]]
weekend_rows = ev_df["is_weekend"]
wd = ev_df[~weekend_rows].groupby("hour")["residual_kWh"].mean()
we = ev_df[weekend_rows].groupby("hour")["residual_kWh"].mean()
wd_top3 = wd.sort_values(ascending=False).head(3)
we_top3 = we.sort_values(ascending=False).head(3)
print("EV WD top-3 hours:", wd_top3)
print("EV WE top-3 hours:", we_top3)

EV owner share: 27.5%
EV WD top-3 hours: hour
22    1.219794
21    1.219478
23    1.208143
Name: residual_kWh, dtype: float64
EV WE top-3 hours: hour
17    1.195985
23    1.192592
18    1.179850
Name: residual_kWh, dtype: float64


In [8]:
# Coldness proxy: average residual across all households per day, min-max
# normalized to [0, 1].
daily_mean_resid = df.groupby("date")["residual_kWh"].mean()
cmin, cmax = daily_mean_resid.min(), daily_mean_resid.max()
coldness = (daily_mean_resid - cmin) / (cmax - cmin + 1e-12)
coldness.name = "coldness_idx"
# Assign by lookup rather than merge: re-running this cell overwrites the
# column instead of producing coldness_idx_x / coldness_idx_y duplicates, and
# df keeps its existing row index.
df["coldness_idx"] = df["date"].map(coldness)

# Learn OTHER template from warm weekdays (exclude May to avoid “no heating” artefacts)
warm_cut = coldness.quantile(0.35)
other_source = df[(~df["is_weekend"]) & (df["month"]!=MAY_MONTH) & (df["coldness_idx"]<=warm_cut)]
mat_other = safe_pivot(other_source, "residual_kWh").dropna()
if mat_other.empty:
    other_raw = np.zeros(24)
else:
    other_raw = mat_other.median(axis=0).values
if other_raw.sum() <= 1e-12:
    # No complete warm weekday, or an all-zero median profile: use a flat
    # shape so the NNLS column below is neither NaN nor identically zero.
    other_raw = np.ones(24)
other_template = other_raw / (other_raw.sum() + 1e-12)

# Learn HEATING template from cold weekdays (top quartile of coldness)
cold_cut = coldness.quantile(0.75)
heat_source = df[(~df["is_weekend"]) & (df["coldness_idx"]>=cold_cut)]
mat_heat = safe_pivot(heat_source, "residual_kWh").dropna()
# isolate heating-ish shape by removing a scaled copy of "other" first (simple projection)
if not mat_heat.empty:
    H_raw = mat_heat.median(axis=0).values
    # Projection coefficient of the cold-day profile onto "other", clamped to [0, 1]
    alpha = max(0.0, min(1.0, np.dot(H_raw, other_template)/np.dot(other_template, other_template)))
    H_shape = np.clip(H_raw - alpha*other_template, 0, None)
else:
    H_shape = np.zeros(24)

# if heating shape too sparse, fall back to AM/PM bumps
if H_shape.sum() < 1e-6:
    H_shape = np.zeros(24)
    for h in HEAT_MORN: H_shape[h] += 1
    for h in HEAT_EVE:  H_shape[h] += 1.2
heating_template = H_shape / (H_shape.sum() + 1e-12)

In [9]:
def ev_preallocate(day_resid, is_weekend, is_owner, pct=85, floor=1.1, window=None):
    """Carve EV charging out of one day's residual load.

    Inside the EV window, anything above a per-day threshold is attributed to
    the EV. The threshold is the ``pct``-th percentile of the window's
    residuals, but never below ``floor`` kWh.

    Parameters
    ----------
    day_resid : array-like of 24 floats
        Residual load per hour (index = hour of day). May contain NaN.
    is_weekend : bool
        Selects ``EV_WE_HOURS`` vs ``EV_WD_HOURS`` when ``window`` is None.
    is_owner : bool
        Non-owners always get an all-zero allocation.
    pct, floor : float, optional
        Percentile and kWh floor of the threshold (defaults 85 and 1.1, the
        previously hard-coded values).
    window : sequence of int, optional
        Explicit hours to scan instead of the module-level windows.

    Returns
    -------
    numpy.ndarray of shape (24,)
        EV kWh per hour, capped at ``day_resid``. Hours where ``day_resid`` is
        NaN come back as NaN (unchanged from before).

    Notes
    -----
    The threshold is computed from the finite window values only. Previously
    one NaN in the window turned the threshold into NaN and silently disabled
    EV allocation for the whole day.
    """
    ev = np.zeros(24, dtype=float)
    if not is_owner:
        return ev
    if window is None:
        window = EV_WE_HOURS if is_weekend else EV_WD_HOURS
    hours = np.asarray(window, dtype=int)
    if hours.size == 0:
        return ev

    resid = np.asarray(day_resid, dtype=float)
    vals = resid[hours]
    finite = np.isfinite(vals)
    if not finite.any():
        return ev

    # percentile threshold per-day + floor
    thr = max(float(np.percentile(vals[finite], pct)), floor)
    excess = np.where(finite, vals - thr, 0.0)
    ev[hours] = np.clip(excess, 0.0, None)
    return np.minimum(ev, resid)

# NNLS design matrix for the post-EV remainder: columns = [heating, other]
# templates, shape (24 x 2), each column sums to 1. It is the same for every
# household-day, so it is built once here (and no longer overwrites the
# feature matrix X from the EV clustering cell).
heat_other_basis = np.vstack([heating_template, other_template]).T

rows = []
for (hid, day), g in df.groupby(["household_id","date"]):
    is_we = bool(g["is_weekend"].iloc[0])
    ao    = float(baseline_map[hid])
    # 24-slot hourly vector; hours absent from the data become NaN
    y_tot = g.set_index("hour")["consumption_kWh"].reindex(range(24)).values
    y_res = np.maximum(y_tot - ao, 0.0)

    # 1) EV spikes first
    is_owner = bool(g["is_ev_owner"].iloc[0])
    ev_part = ev_preallocate(y_res, is_we, is_owner)
    rem = np.maximum(y_res - ev_part, 0.0)

    # 2) NNLS for [heating, other] on remainder; template shapes stay fixed.
    #    NOTE(review): the earlier comment promised a coldness-based scaling of
    #    the heating amplitude, but the coldness value was read and never
    #    used, so that dead read has been removed.
    coeffs, _ = nnls(heat_other_basis, rem)
    heat = coeffs[0] * heating_template
    oth  = coeffs[1] * other_template

    # 3) No heating in May
    if pd.Timestamp(day).month == MAY_MONTH:
        heat[:] = 0.0

    # 4) Per-hour conservation: rescale the three variable parts so they sum
    #    to the residual; hours with (near-)zero modelled load are left as is.
    var = heat + ev_part + oth
    scale = np.divide(y_res, var, out=np.ones_like(y_res), where=var>1e-9)
    heat = np.clip(heat * scale, 0, None)
    ev   = np.clip(ev_part * scale, 0, None)
    oth  = np.clip(oth * scale, 0, None)

    # final small rebalance into "other" (cannot go below zero, so hours with
    # consumption under the baseline stay over-predicted by the baseline)
    recon = ao + heat + ev + oth
    fix   = (y_tot - recon)
    oth   = np.clip(oth + fix, 0, None)

    out = pd.DataFrame({
        "household_id": hid,
        "date": day,
        "hour": np.arange(24),
        "heating_kWh": heat,
        "ev_kWh": ev,
        "always_on_kWh": np.full(24, ao),
        "other_kWh": oth
    })
    rows.append(out)

pred = pd.concat(rows, ignore_index=True)
pred["timestamp"] = pd.to_datetime(pred["date"].astype(str)) + pd.to_timedelta(pred["hour"], unit="h")
pred["kaggle_id"] = pred["household_id"].astype(str) + "_" + pred["timestamp"].dt.strftime("%Y-%m-%d %H")

submit = pred[["kaggle_id","heating_kWh","ev_kWh","always_on_kWh","other_kWh"]].sort_values("kaggle_id")
submit.to_csv("submission_stepwise.csv", index=False)
print("Saved submission_stepwise.csv")

Saved submission_stepwise.csv


In [10]:
# Hourly conservation check: the four predicted components should add up to
# the metered consumption of the same household-hour.
chk = df[["kaggle_id","consumption_kWh"]].merge(submit, on="kaggle_id", how="left")
tot_pred = chk[["heating_kWh","ev_kWh","always_on_kWh","other_kWh"]].sum(axis=1)
diff = chk["consumption_kWh"] - tot_pred
print("Abs diff mean:", diff.abs().mean(), "  95p:", diff.abs().quantile(0.95))

# Residual vs coldness correlation — target ~0 to +0.2 (previously 0.64)
df_pred = df.merge(submit.rename(columns={
    "heating_kWh":"pred_heat","ev_kWh":"pred_ev","always_on_kWh":"pred_ao","other_kWh":"pred_other"
}), on="kaggle_id", how="left")
df_pred["reconstructed"] = df_pred[["pred_heat","pred_ev","pred_ao","pred_other"]].sum(axis=1)
df_pred["resid_bias"] = df_pred["consumption_kWh"] - df_pred["reconstructed"]
daily_bias = df_pred.groupby("date")["resid_bias"].mean().to_frame().merge(
    df_pred.groupby("date")["coldness_idx"].first(), left_index=True, right_index=True)
corr = np.corrcoef(daily_bias["resid_bias"], daily_bias["coldness_idx"])[0,1]
print("Residual vs coldness correlation:", round(float(corr), 3))

# Weekend/weekday ratios (sanity)
subv = submit.copy()
# kaggle_id is "<household_id>_<YYYY-MM-DD HH>". Split from the right so a
# household id that itself contains "_" still yields the timestamp part.
subv["timestamp"] = pd.to_datetime(subv["kaggle_id"].str.rsplit("_", n=1).str[-1])
subv["dow"] = subv["timestamp"].dt.dayofweek
subv["is_weekend"] = subv["dow"] >= 5
wk = subv[~subv["is_weekend"]].mean(numeric_only=True)
we = subv[ subv["is_weekend"]].mean(numeric_only=True)
print("Weekend/Weekday ratios:\n", (we/wk)[["heating_kWh","ev_kWh","always_on_kWh","other_kWh"]])

Abs diff mean: 0.011111069972826095   95p: 0.08344
Residual vs coldness correlation: 0.671
Weekend/Weekday ratios:
 heating_kWh      1.011676
ev_kWh           0.888260
always_on_kWh    1.000000
other_kWh        0.989902
dtype: float64


In [11]:
# ============================================================
# Improved Data-driven Residual Dictionary (Weekday/Weekend + Soft EV gate + Ensemble)
# ============================================================
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.optimize import nnls

RNG_SEEDS = [17, 42, 73]                 # KMeans seeds; one basis pair is learned per seed
N_CLUSTERS = 8                           # number of day-shape clusters per basis
NIGHT_HOURS = list(range(2, 6))          # hours used for the always-on baseline
EV_WD_HOURS = [*range(20, 24), 0, 1]     # weekday EV window (wraps past midnight)
EV_WE_HOURS = list(range(11, 17))        # weekend EV window

# -----------------------------
# Load + basic indexing
# -----------------------------
# Reload the raw table so this cell starts from a frame without the derived
# columns added by earlier cells. Each assign() entry sees the ones before it.
df = pd.read_csv("kaggle_clustering_student_version.csv").assign(
    timestamp=lambda d: pd.to_datetime(d["timestamp"]),
    date=lambda d: d["timestamp"].dt.date,
    hour=lambda d: d["timestamp"].dt.hour,
    dow=lambda d: d["timestamp"].dt.dayofweek,
    is_weekend=lambda d: d["dow"] >= 5,
    kaggle_id=lambda d: d["household_id"].astype(str) + "_" + d["timestamp"].dt.strftime("%Y-%m-%d %H"),
)

# -----------------------------
# STEP 1 — Baseline (always-on) per household from night quantile
# -----------------------------
BASELINE_QS = [0.10, 0.15, 0.20]  # tiny grid; we pick the one minimizing *night* residual MSE

def pick_baseline(pdf, quantiles=None, night_hours=None):
    """Return the always-on baseline (kWh per hour) for one household.

    Each candidate quantile of the night-time consumption (floored at 0.01)
    is scored by the mean squared positive night residual; the best-scoring
    baseline is returned.

    Parameters
    ----------
    pdf : DataFrame
        Rows of a single household with ``hour`` and ``consumption_kWh``.
    quantiles : sequence of float, optional
        Candidate quantiles; defaults to the module-level ``BASELINE_QS``.
    night_hours : sequence of int, optional
        Hours treated as night; defaults to the module-level ``NIGHT_HOURS``.

    Returns
    -------
    float
        Chosen baseline, or the placeholder 0.1 when the household has no
        finite consumption value at all.

    Notes
    -----
    Non-finite readings are ignored. Previously one NaN made every candidate
    NaN, so the placeholder 0.1 was silently returned for that household.

    NOTE(review): the score can only shrink as the baseline grows, so the
    largest candidate quantile always attains the minimum.
    """
    if quantiles is None:
        quantiles = BASELINE_QS
    if night_hours is None:
        night_hours = NIGHT_HOURS

    load = pdf["consumption_kWh"].to_numpy(dtype=float)
    nights = load[pdf["hour"].isin(night_hours).to_numpy()]
    nights = nights[np.isfinite(nights)]
    if len(nights) < 4:
        # too few night readings: use every finite reading instead
        nights = load[np.isfinite(load)]

    best_err, best_b = np.inf, 0.1
    if len(nights) == 0:
        return best_b
    for q in quantiles:
        b = max(0.01, float(np.quantile(nights, q)))
        err = np.mean(np.clip(nights - b, 0, None) ** 2)
        if err < best_err:
            best_err, best_b = err, b
    return best_b

# One baseline per household. Only the columns pick_baseline reads are passed
# to apply(), so the grouping column is not part of the frame handed to the
# function (pandas warns about, and is removing, that older behaviour).
baseline_map = (
    df.groupby("household_id")[["hour", "consumption_kWh"]]
      .apply(pick_baseline)
      .to_dict()
)
df["always_on_kWh"] = df["household_id"].map(baseline_map)
df["residual_kWh"]  = (df["consumption_kWh"] - df["always_on_kWh"]).clip(lower=0)

# System “coldness” proxy (helps diagnostics / optional penalties):
# daily mean residual over all households, min-max normalized to [0, 1].
daily_mean_resid = df.groupby("date")["residual_kWh"].mean()
cmin, cmax = daily_mean_resid.min(), daily_mean_resid.max()
coldness = ((daily_mean_resid - cmin) / (cmax - cmin + 1e-12)).rename("coldness_idx")
# Lookup by date instead of merge: keeps df's row index and overwrites the
# column if it already exists rather than creating suffixed duplicates.
df["coldness_idx"] = df["date"].map(coldness)

# -----------------------------
# STEP 2 — EV owner detection (scale-robust)
# -----------------------------
def safe_pivot(pdf, value_col="residual_kWh"):
    """Reshape ``value_col`` into a (date x hour) table with all 24 hour columns.

    Rows sharing a (date, hour) pair are averaged first. Hours that never
    occur become all-NaN columns; days with gaps are kept, not dropped.
    """
    keys = ["date", "hour"]
    averaged = pdf.groupby(keys, as_index=False)[value_col].mean()
    table = averaged.pivot(index="date", columns="hour", values=value_col)
    return table.reindex(columns=range(24))

def ev_features(pdf):
    """Build the 13-element EV feature vector for one household.

    Six window statistics on weekdays (``EV_WD_HOURS``), six on weekends
    (``EV_WE_HOURS``), then the difference between the mean residual in the
    weekday EV hours and in hours 10-15 (0.0 when that is not finite).
    """
    def window_feats(p, hours):
        # Per-day stats of the residual inside `hours`, averaged over days.
        mat = safe_pivot(p)
        if mat is None or mat.empty:
            return np.zeros(6)
        vals = mat[hours].fillna(0).to_numpy()
        per_day_total = vals.sum(axis=1)
        return np.array([
            per_day_total.mean(),
            per_day_total.std(),
            np.percentile(vals, 95, axis=1).mean(),
            vals.max(axis=1).mean(),
            (vals > 1.2).sum(axis=1).mean(),
            (vals > 2.0).sum(axis=1).mean(),
        ])

    weekend = pdf["is_weekend"]
    hour = pdf["hour"]
    f_wd = window_feats(pdf[~weekend], EV_WD_HOURS)
    f_we = window_feats(pdf[weekend], EV_WE_HOURS)

    eve = pdf.loc[hour.isin(EV_WD_HOURS), "residual_kWh"].mean()
    day = pdf.loc[hour.isin([10,11,12,13,14,15]), "residual_kWh"].mean()
    gap = eve - day
    contrast = gap if np.isfinite(gap) else 0.0
    return np.concatenate([f_wd, f_we, [contrast]])

# One feature row per household, in groupby order
ev_rows, hh_ids = [], []
for hid, pdf in df.groupby("household_id"):
    hh_ids.append(hid)
    ev_rows.append(ev_features(pdf))
X = np.vstack(ev_rows)
Xz = StandardScaler().fit_transform(X)

# Two clusters on standardized features; the one with the larger mean
# evening contrast (last raw feature) is treated as the EV-owner cluster.
lab = KMeans(n_clusters=2, n_init=40, random_state=42).fit_predict(Xz)
grp_eve_contrast = []
for cluster in (0, 1):
    in_cluster = lab == cluster
    # empty cluster -> sentinel that can never win the argmax
    grp_eve_contrast.append(X[in_cluster][:, -1].mean() if in_cluster.any() else -1e9)
ev_label = int(np.argmax(grp_eve_contrast))
ev_owner = dict(zip(hh_ids, lab == ev_label))
df["is_ev_owner"] = df["household_id"].map(ev_owner)

# -----------------------------
# STEP 3 — Dictionaries on residuals (weekday/weekend) + soft EV gate + ensemble
# -----------------------------
def collect_residual_days(pdf):
    """Return the complete days of residual load as an (N_days x 24) array.

    Days with any missing hour are discarded. Rows sharing a (date, hour)
    pair are averaged by ``safe_pivot``, so passing several households yields
    their per-hour mean.
    """
    day_matrix = safe_pivot(pdf, "residual_kWh")
    complete_days = day_matrix.dropna(axis=0, how="any")
    return complete_days.to_numpy()

def auto_map(centroids, n_flat=1):
    """Pick which centroids play the heating, EV and other roles.

    Parameters
    ----------
    centroids : array-like, shape (k, 24)
        Cluster centres of sum-normalized daily residual profiles.
    n_flat : int, optional
        Number of lowest-std ("flat") centroids barred from every role, since
        flat load is already modelled by the separate baseline. Default 1,
        which is what the previous expression evaluated to when the number of
        centroids equals ``N_CLUSTERS``.

    Returns
    -------
    tuple of int
        ``(heat_idx, ev_idx, other_idx)`` — row indices into ``centroids``.

    Notes
    -----
    The flat set used to be computed but never consulted, so a flat centroid
    could still be chosen (typically as "other" or "heating") despite the
    documented exclusion. The exclusion is skipped when it would leave fewer
    than three candidates. Ties on a score prefer the lower-std centroid, as
    before.
    """
    C = np.asarray(centroids, dtype=float)

    # Role scores on a 24-hour profile
    def ev_score(c): return c[20:].mean() - c[:10].mean()                   # late evening vs early hours
    def heat_score(c): return c[6:10].mean() + c[17:22].mean() - c.mean()   # morning + evening bumps
    def other_score(c): return c[9:16].mean()                               # daytime

    stds = C.std(axis=1)
    n_rows = C.shape[0]
    flat = set(np.argsort(stds)[:max(0, int(n_flat))].tolist())  # lowest-std rows
    candidates = [i for i in range(n_rows) if i not in flat]
    if len(candidates) < 3:
        # Not enough non-flat centroids for three distinct roles: use them all.
        candidates = list(range(n_rows))

    # choose distinct maxima, EV first, then heating, then other
    ev_idx = max(candidates, key=lambda i: (ev_score(C[i]), -stds[i]))
    rest = [i for i in candidates if i != ev_idx]
    if not rest:
        return ev_idx, ev_idx, ev_idx
    heat_idx = max(rest, key=lambda i: (heat_score(C[i]), -stds[i]))
    others = [i for i in rest if i != heat_idx]
    other_idx = max(others, key=lambda i: (other_score(C[i]), -stds[i])) if others else heat_idx
    return heat_idx, ev_idx, other_idx

def learn_basis(days_24xN, seed):
    """Learn a (24 x 3) basis whose columns are [heating, EV, other] shapes.

    ``days_24xN`` is indexed as (days, hours): ``shape[0]`` is the number of
    days and each row is normalized by its own sum before clustering. With
    fewer than 40 days a hand-made generic basis is returned instead.
    Otherwise the days are clustered with KMeans (``N_CLUSTERS`` clusters,
    ``random_state=seed``) and ``auto_map`` picks three of the centroids.
    Every returned column sums to 1.
    """
    n_days = days_24xN.shape[0]
    if n_days < 40:
        # Small-sample fallback: generic shapes, normalized to unit sum
        heat = np.zeros(24)
        heat[[6,7,8,9]] += 1
        heat[[17,18,19,20,21]] += 1.2
        ev = np.zeros(24)
        ev[EV_WD_HOURS] += 1
        other = np.zeros(24)
        other[[7,8,9,17,18]] += 1
        other += 0.3
        columns = [heat/heat.sum(), ev/ev.sum(), other/other.sum()]
        return np.vstack(columns).T  # (24x3)

    # Cluster on shape rather than magnitude: scale each day to unit sum
    day_totals = days_24xN.sum(axis=1, keepdims=True)
    X = days_24xN / (day_totals + 1e-12)
    km = KMeans(n_clusters=N_CLUSTERS, n_init=30, random_state=seed)
    km.fit(X)
    C = km.cluster_centers_  # (k x 24)

    # Map centroids to roles and stack them as columns
    h_idx, e_idx, o_idx = auto_map(C)
    B = np.vstack([C[h_idx], C[e_idx], C[o_idx]]).T  # (24x3)
    # Exact column re-normalization
    col_sums = B.sum(axis=0, keepdims=True)
    return B / (col_sums + 1e-12)

# Learn two bases per seed: weekday + weekend.
# The day matrices do not depend on the seed, so they are collected once
# instead of being rebuilt (groupby + pivot over the whole frame) per seed.
wd_days = collect_residual_days(df[~df["is_weekend"]])
we_days = collect_residual_days(df[ df["is_weekend"]])

bases = []  # list of dicts: {"wd": 24x3, "we": 24x3}
for seed in RNG_SEEDS:
    B_wd = learn_basis(wd_days, seed)
    B_we = learn_basis(we_days, seed)
    bases.append({"wd": B_wd, "we": B_we})

# Soft EV gate: column multiplier for EV for non-owners (0.3–0.6 works well)
EV_PENALTY = 0.4

def fit_one_day(day_df, base_wd, base_we, is_ev_owner, baseline, ev_penalty=None):
    """Split one household-day's residual load into heating, EV and other.

    Parameters
    ----------
    day_df : DataFrame
        Rows of one household on one date, with ``hour``, ``consumption_kWh``
        and ``is_weekend``.
    base_wd, base_we : ndarray, shape (24, 3)
        Weekday / weekend bases; columns are [heat, ev, other].
    is_ev_owner : bool
        Whether the household was flagged as an EV owner.
    baseline : float
        Always-on load subtracted from consumption before fitting.
    ev_penalty : float, optional
        Multiplier applied to the fitted EV amplitude of non-owners. Defaults
        to the module-level ``EV_PENALTY``.

    Returns
    -------
    tuple of three ndarrays, each shape (24,)
        Heating, EV and other kWh per hour. Where the fitted total exceeds
        1e-9 the three sum exactly to the clipped residual.

    Notes
    -----
    The gate is applied to the fitted coefficient, not to the design matrix.
    Scaling a column by ``p`` before an unregularized NNLS leaves the fit
    unchanged and only multiplies that coefficient by ``1/p``. The previous
    code then applied that inflated coefficient to the unscaled basis, so
    non-owners received 1/p (2.5x at p=0.4) *more* EV instead of less. Owners
    are unaffected by this change.
    """
    if ev_penalty is None:
        ev_penalty = EV_PENALTY

    is_we = bool(day_df["is_weekend"].iloc[0])
    B = base_we if is_we else base_wd         # (24x3) columns = [heat, ev, other]

    # target residual (hours missing from day_df become NaN after reindex)
    y_total = day_df.set_index("hour")["consumption_kWh"].reindex(range(24)).values
    y = np.maximum(y_total - baseline, 0.0)

    # NNLS on the plain basis, then damp the EV amplitude for non-owners
    a, _ = nnls(B, y)
    gate = np.array([1.0, 1.0 if is_ev_owner else ev_penalty, 1.0])
    comps = B * (a * gate)

    # per-hour scale so the components sum to the residual; whatever the gate
    # removed from EV is redistributed to the components active in that hour
    var = comps.sum(axis=1)
    scale = np.divide(y, var, out=np.ones_like(y), where=var>1e-9)
    comps = np.clip(comps * scale[:,None], 0, None)
    # return heating, ev, other
    return comps[:,0], comps[:,1], comps[:,2]

# -----------------------------
# Inference with ensemble + hourwise conservation
# -----------------------------
pieces = []
for (hid, day), g in df.groupby(["household_id","date"]):
    baseline = baseline_map[hid]
    evflag   = bool(ev_owner.get(hid, False))

    # Fit the day against every basis pair, then average each component
    # across the ensemble.
    fits = [fit_one_day(g, b["wd"], b["we"], evflag, baseline) for b in bases]
    heat  = np.mean([f[0] for f in fits], axis=0)
    ev    = np.mean([f[1] for f in fits], axis=0)
    other = np.mean([f[2] for f in fits], axis=0)
    ao    = np.full(24, baseline)

    # strict hour-wise conservation: rescale the variable parts so that
    # baseline + parts matches the metered total wherever the parts are non-zero
    total = g.set_index("hour")["consumption_kWh"].reindex(range(24)).values
    need  = np.maximum(total - ao, 0.0)
    var   = heat + ev + other
    scl   = np.divide(need, var, out=np.ones_like(need), where=var>1e-9)
    heat, ev, other = (np.clip(part * scl, 0, None) for part in (heat, ev, other))

    # tiny numeric fix: push any leftover into "other" (never below zero)
    recon = ao + heat + ev + other
    other = np.clip(other + (total - recon), 0, None)

    pieces.append(pd.DataFrame({
        "household_id": hid,
        "date": day,
        "hour": np.arange(24),
        "heating_kWh": heat,
        "ev_kWh": ev,
        "always_on_kWh": ao,
        "other_kWh": other
    }))

# Assemble the submission: one row per household-hour keyed by kaggle_id
pred = pd.concat(pieces, ignore_index=True)
pred["timestamp"] = pd.to_datetime(pred["date"].astype(str)) + pd.to_timedelta(pred["hour"], unit="h")
pred["kaggle_id"] = pred["household_id"].astype(str) + "_" + pred["timestamp"].dt.strftime("%Y-%m-%d %H")

component_cols = ["heating_kWh","ev_kWh","always_on_kWh","other_kWh"]
submit = pred[["kaggle_id"] + component_cols].sort_values("kaggle_id")
submit.to_csv("submission_residual_dict_wdwe_softEV_ensemble.csv", index=False)
print("Saved submission_residual_dict_wdwe_softEV_ensemble.csv")

  baseline_map = df.groupby("household_id").apply(pick_baseline).to_dict()


Saved submission_residual_dict_wdwe_softEV_ensemble.csv
