In [2]:
# Table 1: Sample & panel structure
import pandas as pd
from pathlib import Path

IN_PATH  = "data/clean/borough_final/voa_panel_with_station_band_with_switch_boroughfinal_with_walkbands.parquet"
OUT_CSV  = "work/outputs/descriptive/tab_sample_structure.csv"

# Read only the columns this table needs, so the geometry column is never loaded.
use_cols = ["avd", "uarn", "n_t"]
df = pd.read_parquet(IN_PATH, columns=use_cols)

# Normalise dtypes BEFORE filtering on year. If `avd` were stored as text
# (e.g. "2008"), comparing it against the integer years below would match
# nothing and the whole table would silently collapse to zeros.
# Unparseable values become NaN and are dropped by the year filter.
df["avd"] = pd.to_numeric(df["avd"], errors="coerce")
# Make sure n_t is numeric as well; unparseable values become NaN.
df["n_t"] = pd.to_numeric(df["n_t"], errors="coerce")

# Keep only the three waves (2008 / 2015 / 2021).
years = [2008, 2015, 2021]
df = df[df["avd"].isin(years)].copy()

# Every remaining `avd` is one of the integer years above, so the cast to int
# cannot hit NaN or a fractional value.
df["avd"] = df["avd"].astype(int)

# U_total: number of distinct properties (uarn) across the three retained waves.
U_total = df["uarn"].nunique()

# Property-level view of n_t: project to the two needed columns, then keep the
# first row seen for each uarn.
# NOTE(review): this takes n_t from whichever row comes first, so it presumes
# n_t is constant within a uarn -- confirm against the upstream panel build.
uarn_level = df[["uarn", "n_t"]].drop_duplicates(subset="uarn")

# N_ntk_total: how many properties have n_t equal to 1, 2 and 3 respectively.
N_nt1_total, N_nt2_total, N_nt3_total = (
    int(uarn_level["n_t"].eq(k).sum()) for k in (1, 2, 3)
)

# Matching shares of all properties; defined as 0.0 when there are no properties.
if U_total:
    share_nt1_total = N_nt1_total / U_total
    share_nt2_total = N_nt2_total / U_total
    share_nt3_total = N_nt3_total / U_total
else:
    share_nt1_total = share_nt2_total = share_nt3_total = 0.0

# balanced_total: the count of properties with n_t == 3.
balanced_total = N_nt3_total

# Sample-wide figures that are repeated unchanged on every row of the table.
totals = {
    "N_nt1_total": N_nt1_total,
    "N_nt2_total": N_nt2_total,
    "N_nt3_total": N_nt3_total,
    "share_nt1_total": share_nt1_total,
    "share_nt2_total": share_nt2_total,
    "share_nt3_total": share_nt3_total,
}

rows = []
for y in years:
    sub = df.loc[df["avd"].eq(y)]
    # N_year: distinct properties observed in this wave.
    N_year = sub["uarn"].nunique()
    # coverage_year: N_year as a fraction of U_total (0.0 when U_total is 0).
    coverage_year = (N_year / U_total) if U_total else 0.0
    # balanced_in_year: distinct properties in this wave that have n_t == 3.
    balanced_in_year = sub.loc[sub["n_t"].eq(3), "uarn"].nunique()
    # share_balanced_in_year: balanced_in_year / N_year (0.0 for an empty wave).
    share_balanced_in_year = (balanced_in_year / N_year) if N_year else 0.0

    rows.append(dict(
        year=y,
        U_total=U_total,
        N_year=N_year,
        coverage_year=coverage_year,
        balanced_in_year=balanced_in_year,
        share_balanced_in_year=share_balanced_in_year,
        **totals,
    ))

# "overall" row: N_year is reported as U_total, coverage is 1 by construction,
# and the balanced figures use balanced_total.
rows.append(dict(
    year="overall",
    U_total=U_total,
    N_year=U_total,
    coverage_year=1.0,
    balanced_in_year=balanced_total,
    share_balanced_in_year=(balanced_total / U_total) if U_total else 0.0,
    **totals,
))

# Column order is fixed explicitly; the trailing six names are the keys of
# `totals`, in insertion order.
tab = pd.DataFrame(rows, columns=[
    "year", "U_total", "N_year", "coverage_year",
    "balanced_in_year", "share_balanced_in_year",
    *totals,
])

# Write the table to OUT_CSV, creating the parent directory if it is missing.
out_path = Path(OUT_CSV)
out_path.parent.mkdir(parents=True, exist_ok=True)
tab.to_csv(out_path, index=False)

# Optional: bare last expression so the notebook renders the table.
tab


Unnamed: 0,year,U_total,N_year,coverage_year,balanced_in_year,share_balanced_in_year,N_nt1_total,N_nt2_total,N_nt3_total,share_nt1_total,share_nt2_total,share_nt3_total
0,2008,102542,90084,0.878508,78098,0.866946,12953,11491,78098,0.126319,0.112061,0.76162
1,2015,102542,90217,0.879805,78098,0.865668,12953,11491,78098,0.126319,0.112061,0.76162
2,2021,102542,89928,0.876987,78098,0.86845,12953,11491,78098,0.126319,0.112061,0.76162
3,overall,102542,102542,1.0,78098,0.76162,12953,11491,78098,0.126319,0.112061,0.76162
