In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# =========================================
# 0. Build the raw panel: cumulative count N_{z,t} per ZIP per year
# =========================================
panel = pd.read_csv("../data/zip_year_counts.csv")

panel = panel.rename(columns={"ZIP Code": "ZIP"})
panel["ZIP"] = panel["ZIP"].astype(int)
panel = panel.sort_values(["ZIP", "year"])

panel["N"] = panel["cum_original_count"]

# Keep only rows with N > 0 so that np.log below never produces NaN / -inf.
panel = panel[panel["N"] > 0].copy()

zip_list = panel["ZIP"].unique()
target_year = 2025

forecast_rows = []
all_residuals = []

# One groupby pass hands over each ZIP's rows instead of re-filtering the
# whole panel once per ZIP. Groups come out in ascending ZIP order, which is
# the same order as zip_list (the panel was sorted by ZIP above).
for z, df_z in panel.groupby("ZIP", sort=True):
    # ---- Use only this ZIP's most recent (up to) three years with data ----
    years_z = sorted(df_z["year"].unique())
    use_years = years_z[-3:]

    # A straight line needs at least two distinct years; counting rows instead
    # would let duplicate rows for a single year through to a degenerate fit.
    if len(use_years) < 2:
        continue

    df_z_recent = df_z[df_z["year"].isin(use_years)].sort_values("year").copy()

    # The earliest of the selected years is t = 0.
    t0_z = df_z_recent["year"].min()
    df_z_recent["t"] = df_z_recent["year"] - t0_z

    # Fit log(N) = a + b * t by OLS.
    y = np.log(df_z_recent["N"].values)
    X = sm.add_constant(df_z_recent["t"].values)  # columns: [1, t]

    model_z = sm.OLS(y, X)
    res_z = model_z.fit()

    # Collect the in-sample residuals (log space) of the recent-years fit.
    all_residuals.append(y - res_z.fittedvalues)

    # Extrapolate the fitted line to target_year and map back from log space.
    t_future = target_year - t0_z
    log_hat_2025 = res_z.params[0] + res_z.params[1] * t_future
    Nhat_2025 = float(np.exp(log_hat_2025))

    forecast_rows.append({
        "ZIP Code": int(z),
        "Nhat_2025": Nhat_2025
    })

if not all_residuals:
    raise ValueError(
        "No ZIP has at least two years with N > 0; cannot fit trends or estimate sigma."
    )

# Pool the residuals of every ZIP to estimate a single global sigma.
# NOTE(review): a two-point fit is exact (zero residuals) and ddof=1 does not
# account for the two parameters estimated per ZIP, so this sigma is likely
# understated. The formula is left unchanged because the scenario multipliers
# downstream are built from it -- confirm whether a pooled
# sqrt(SSR / sum(n_i - 2)) estimate was intended.
resid_all = np.concatenate(all_residuals)
sigma = resid_all.std(ddof=1)
print("Global residual std (log space) sigma =", sigma)

forecast_df = pd.DataFrame(forecast_rows)
print("Per-ZIP 2025 point forecast sample:")
print(forecast_df.head())

Global residual std (log space) sigma = 0.12707783351781599
Per-ZIP 2025 point forecast sample:
   ZIP Code    Nhat_2025
0     10001  3044.681503
1     10002  2529.962195
2     10003  4599.270115
3     10004  3603.829945
4     10005  4073.443082


In [2]:
# =========================================
# 1. Turn sigma into multipliers for the three scenarios
#    k sets the band width in units of sigma (log space); currently k = 2.
#    Adjust as needed.
# =========================================
k = 2

mult_low  = np.exp(-k * sigma)  # shared by every ZIP
# NOTE(review): the high side uses k * sigma / 2 while the low side uses
# k * sigma, so the band is asymmetric in log space -- confirm this is intended.
mult_high = np.exp(+k * sigma/2)

print("Scenario multipliers (low, mid, high):",
      mult_low, 1.0, mult_high)

# Scenario weights (adjust as needed).
weights = {
    1: 0.306058,   # low
    2: 0.311044,   # medium
    3: 0.382898    # high
}

# =========================================
# 2. 2025 demand for every ZIP x scenario
# =========================================
# scenario id -> multiplier applied to the ZIP's point forecast
scenario_multipliers = {1: mult_low, 2: 1.0, 3: mult_high}

rows = []

# Iterate the two columns directly rather than with DataFrame.iterrows():
# iterrows() packs each row into one Series, which upcasts the integer ZIP to
# float and made the exported CSV contain values such as 10001.0.
for zip_code, base in zip(forecast_df["ZIP Code"], forecast_df["Nhat_2025"]):
    for scenario_id, multiplier in scenario_multipliers.items():
        rows.append({
            "ZIP Code": int(zip_code),
            "scenario": scenario_id,
            "demand_2025": base * multiplier,
            "scenario_weight": weights[scenario_id]
        })

scenario_registration_2025 = pd.DataFrame(rows)

print("scenario_registration_2025 sample:")
print(scenario_registration_2025.head())

scenario_registration_2025.to_csv(
    "../data/adaptive_1204.csv", index=False
)


Scenario multipliers (low, mid, high): 0.7755710618009694 1.0 1.135505394721468
scenario_registration_2025 sample:
   ZIP Code  scenario  demand_2025  scenario_weight
0   10001.0         1  2361.366866         0.306058
1   10001.0         2  3044.681503         0.311044
2   10001.0         3  3457.252272         0.382898
3   10002.0         1  1962.165466         0.306058
4   10002.0         2  2529.962195         0.311044


In [3]:
# Sanity check: (rows, columns) of the scenario table. Three rows are appended
# per forecast ZIP (one per scenario), each with four fields.
scenario_registration_2025.shape

(5400, 4)