In [None]:
# Zero-padded month strings ("01" .. "12") for the year `y`.
# When y is 2008 the list starts at "06"; for any other year it starts at "01".
monthlist = [f"{m:02d}" for m in range(6 if y == 2008 else 1, 13)]

def read_penetration(path):
    """Load one state/county penetration CSV into a DataFrame.

    The first row of the file is skipped and a fixed set of column names
    is applied instead. The ``eligibles``, ``enrolled`` and ``penetration``
    columns are read as text, stripped of commas, reduced to the first
    numeric token they contain and converted to ``float64``; cells that
    hold no number end up as NaN.

    Parameters
    ----------
    path
        Passed straight through as the first argument of ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per input row, with the eleven columns listed below.
    """
    # Column name -> dtype; insertion order is the column order in the file.
    schema = {
        "state": "string",
        "county": "string",
        "fips_state": "Int64",
        "fips_cnty": "Int64",
        "fips": "float64",
        "ssa_state": "Int64",
        "ssa_cnty": "Int64",
        "ssa": "float64",
        "eligibles": "string",
        "enrolled": "string",
        "penetration": "string",
    }

    frame = pd.read_csv(
        path,
        skiprows=1,
        names=list(schema),
        dtype=schema,
        na_values=["", "NA", "*", "-", "--"],
    )

    # Optional sign, digits, optional decimal part.
    number_pattern = r"(-?\d+(?:\.\d+)?)"

    for col in ("eligibles", "enrolled", "penetration"):
        as_text = frame[col].astype("string")
        without_commas = as_text.str.replace(",", "", regex=False)
        first_number = without_commas.str.extract(number_pattern)[0]
        frame[col] = first_number.astype("float64")

    return frame

# Read each monthly file for year `y`, tag every frame with its month and
# year, stack the frames into one table, then order the rows by state,
# county and month.
ma_penetration = pd.concat(
    [
        read_penetration(
            f"data/input/ma/penetration/Extracted Data/State_County_Penetration_MA_{y}_{m}.csv"
        ).assign(month=int(m), year=y)
        for m in monthlist
    ],
    ignore_index=True,
).sort_values(["state", "county", "month"])

# Within each (state, county) group, fill missing `fips` values from the
# previous row and, where none exists, from the following row. Rows whose
# state or county is missing are kept as their own group (dropna=False).
ma_penetration["fips"] = ma_penetration.groupby(
    ["state", "county"], dropna=False
)["fips"].transform(lambda codes: codes.ffill().bfill())

# Collapse the monthly rows into one row per (fips, state, county, year).
# Rows are sorted by month first so that "first"/"last" pick the earliest/
# latest non-missing value within each group.
#
# Built-in aggregation names are used instead of per-group lambdas: they
# skip missing values by default, and "first"/"last" yield NaN (rather than
# pd.NA) for a group with no data, so the result columns stay numeric.
final_penetration = (
    ma_penetration
    .sort_values(["fips", "state", "county", "year", "month"])
    .groupby(["fips", "state", "county", "year"], dropna=False)
    .agg(
        n_elig=("eligibles", "count"),          # number of non-missing months
        avg_eligibles=("eligibles", "mean"),
        sd_eligibles=("eligibles", "std"),      # sample std (ddof=1)
        min_eligibles=("eligibles", "min"),
        max_eligibles=("eligibles", "max"),
        first_eligibles=("eligibles", "first"),
        last_eligibles=("eligibles", "last"),

        n_enrol=("enrolled", "count"),          # number of non-missing months
        avg_enrolled=("enrolled", "mean"),
        sd_enrolled=("enrolled", "std"),        # sample std (ddof=1)
        min_enrolled=("enrolled", "min"),
        max_enrolled=("enrolled", "max"),
        first_enrolled=("enrolled", "first"),
        last_enrolled=("enrolled", "last"),

        ssa=("ssa", "last"),
    )
    .reset_index()
)

# Usage
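# Set the year of interest in `y` before running the code above (it is read when building `monthlist` and the input file paths)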
#y = 2008  # or any other year supported by the data


# Inspect results
#print(ma_penetration.head())
#print(final_penetration.head())