In [14]:
import pandas as pd
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing

#load data
enroll = pd.read_csv("cleaned dataset/cleaned_enrollment_data.csv")
bio = pd.read_csv("cleaned dataset/cleaned_biometric_data.csv")
demo = pd.read_csv("cleaned dataset/cleaned_demographic_data.csv")


In [15]:
enroll["total_enrollment"] = enroll[["age_0_5", "age_5_17", "age_18_greater"]].sum(axis=1)
bio["total_bio"]=bio[["bio_age_5_17","bio_age_17_"]].sum(axis=1)
demo["total_demo"]=demo[["demo_age_5_17","demo_age_17_"]].sum(axis=1)
bio["revenue"]=bio["bio_age_17_"]*125
demo["revenue"]=demo["demo_age_17_"]*75

In [16]:

# -----------------------
# 2) CLEAN DATE + TYPES
# -----------------------
for df in [enroll, bio, demo]:
    df["date"] = pd.to_datetime(df["date"], dayfirst=True, errors="coerce")

enroll = enroll.dropna(subset=["date"])
bio = bio.dropna(subset=["date"])
demo = demo.dropna(subset=["date"])

# Revenue safe
bio["revenue"] = pd.to_numeric(bio.get("revenue", 0), errors="coerce").fillna(0)
demo["revenue"] = pd.to_numeric(demo.get("revenue", 0), errors="coerce").fillna(0)

# -----------------------
# 3) ENROLLMENT TOTAL
# -----------------------
if "total_enrollment" not in enroll.columns:
    enroll["total_enrollment"] = enroll[["age_0_5", "age_5_17", "age_18_greater"]].sum(axis=1)

# -----------------------
# 4) DEFINE BIO & DEMO SPLITS (Minor vs Major)
# -----------------------
# ✅ Minor = Age 5–17
# ✅ Major = Age 18+
# You already have columns like: bio_age_5_17, bio_age_17_  (similar in demo)

bio_minor_col = "bio_age_5_17"
bio_major_col = "bio_age_17_"
demo_minor_col = "demo_age_5_17"
demo_major_col = "demo_age_17_"

# Make sure missing columns do not break code
for col in [bio_minor_col, bio_major_col]:
    if col not in bio.columns:
        bio[col] = 0

for col in [demo_minor_col, demo_major_col]:
    if col not in demo.columns:
        demo[col] = 0

# -----------------------
# 5) MONTHLY AGGREGATION (STATE + DISTRICT)
# -----------------------
enroll_m = enroll.groupby(
    ["state", "district", pd.Grouper(key="date", freq="MS")], as_index=False
).agg(
    total_enroll=("total_enrollment", "sum"),
    infant_0_5=("age_0_5", "sum"),
    student_5_17=("age_5_17", "sum"),
    adult_18_plus=("age_18_greater", "sum")
)

bio_m = bio.groupby(
    ["state", "district", pd.Grouper(key="date", freq="MS")], as_index=False
).agg(
    bio_minor_updates=(bio_minor_col, "sum"),
    bio_major_updates=(bio_major_col, "sum"),
    bio_revenue=("revenue", "sum")
)

demo_m = demo.groupby(
    ["state", "district", pd.Grouper(key="date", freq="MS")], as_index=False
).agg(
    demo_minor_updates=(demo_minor_col, "sum"),
    demo_major_updates=(demo_major_col, "sum"),
    demo_revenue=("revenue", "sum")
)

# -----------------------
# 6) MERGE MONTHLY TABLE
# -----------------------
monthly_sd = (
    enroll_m.merge(bio_m, on=["state", "district", "date"], how="left")
            .merge(demo_m, on=["state", "district", "date"], how="left")
            .fillna(0)
)

# Total updates + revenue
monthly_sd["bio_total_updates"] = monthly_sd["bio_minor_updates"] + monthly_sd["bio_major_updates"]
monthly_sd["demo_total_updates"] = monthly_sd["demo_minor_updates"] + monthly_sd["demo_major_updates"]
monthly_sd["total_revenue"] = monthly_sd["bio_revenue"] + monthly_sd["demo_revenue"]

print("✅ STEP 1 DONE: Monthly dataset ready")
print(monthly_sd.head())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo["revenue"] = pd.to_numeric(demo.get("revenue", 0), errors="coerce").fillna(0)


✅ STEP 1 DONE: Monthly dataset ready
                         state                  district       date  \
0  Andaman And Nicobar Islands                   Nicobar 2025-09-01   
1  Andaman And Nicobar Islands                   Nicobar 2025-10-01   
2  Andaman And Nicobar Islands                   Nicobar 2025-11-01   
3  Andaman And Nicobar Islands                   Nicobar 2025-12-01   
4  Andaman And Nicobar Islands  North And Middle Andaman 2025-09-01   

   total_enroll  infant_0_5  student_5_17  adult_18_plus  bio_minor_updates  \
0            48          42             6              0               26.0   
1            10           6             4              0               15.0   
2            12          12             0              0               27.0   
3             6           5             1              0               23.0   
4            40          38             2              0               74.0   

   bio_major_updates  bio_revenue  demo_minor_updates  demo_m

In [17]:
def smart_forecast(series, steps=3):
    series = series.astype(float)

    # ✅ force datetime index
    series.index = pd.to_datetime(series.index)

    # If no variation or too small history
    if series.nunique() <= 1 or len(series.dropna()) < 3:
        next_idx = pd.date_range(series.index.max() + pd.offsets.MonthBegin(1),
                                 periods=steps, freq="MS")
        return pd.Series([max(series.mean(), 0)] * steps, index=next_idx)

    try:
        model = ExponentialSmoothing(series, trend="add", seasonal=None).fit()
        fc = model.forecast(steps)

        # ✅ force forecast index also to monthly dates
        fc.index = pd.date_range(series.index.max() + pd.offsets.MonthBegin(1),
                                 periods=steps, freq="MS")
        return fc.clip(lower=0)

    except:
        next_idx = pd.date_range(series.index.max() + pd.offsets.MonthBegin(1),
                                 periods=steps, freq="MS")
        return pd.Series([max(series.mean(), 0)] * steps, index=next_idx)


In [18]:
forecast_rows = []

for (st, dist), grp in monthly_sd.groupby(["state", "district"]):
    grp = grp.sort_values("date").set_index("date")

    # ✅ Forecast Enrollment
    fc_total_enroll = smart_forecast(grp["total_enroll"], 3)
    fc_infant = smart_forecast(grp["infant_0_5"], 3)
    fc_student = smart_forecast(grp["student_5_17"], 3)
    fc_adult = smart_forecast(grp["adult_18_plus"], 3)

    # ✅ Forecast Biometric Updates
    fc_bio_minor = smart_forecast(grp["bio_minor_updates"], 3)
    fc_bio_major = smart_forecast(grp["bio_major_updates"], 3)
    fc_bio_total = smart_forecast(grp["bio_total_updates"], 3)

    # ✅ Forecast Demographic Updates
    fc_demo_minor = smart_forecast(grp["demo_minor_updates"], 3)
    fc_demo_major = smart_forecast(grp["demo_major_updates"], 3)
    fc_demo_total = smart_forecast(grp["demo_total_updates"], 3)

    # ✅ Forecast Revenue
    fc_bio_rev = smart_forecast(grp["bio_revenue"], 3)
    fc_demo_rev = smart_forecast(grp["demo_revenue"], 3)
    fc_total_rev = smart_forecast(grp["total_revenue"], 3)

    for dt in fc_total_enroll.index:

        # Child share %
        pred_total = max(fc_total_enroll.loc[dt], 1)
        pred_child_share = (fc_infant.loc[dt] + fc_student.loc[dt]) / pred_total

        forecast_rows.append({
            "state": st,
            "district": dist,
            "month": pd.to_datetime(dt).strftime("%Y-%m"),
        })

            # Enrollment Predictions
            "pred_total_enrollment": round(fc_total_enroll.loc[dt]),
            "pred_infant_enroll_0_5": round(fc_infant.loc[dt]),
            "pred_student_enroll_5_17": round(fc_student.loc[dt]),
            "pred_adult_enroll_18_plus": round(fc_adult.loc[dt]),
            "pred_child_share_pct": round(pred_child_share * 100, 2),

            # Biometric Predictions
            "pred_bio_total_updates": round(fc_bio_total.loc[dt]),
            "pred_bio_minor_updates_5_17": round(fc_bio_minor.loc[dt]),
            "pred_bio_major_updates_18_plus": round(fc_bio_major.loc[dt]),

            # Demographic Predictions
            "pred_demo_total_updates": round(fc_demo_total.loc[dt]),
            "pred_demo_minor_updates_5_17": round(fc_demo_minor.loc[dt]),
            "pred_demo_major_updates_18_plus": round(fc_demo_major.loc[dt]),

            # Revenue Predictions
            "pred_bio_revenue": round(fc_bio_rev.loc[dt], 2),
            "pred_demo_revenue": round(fc_demo_rev.loc[dt], 2),
            "pred_total_revenue": round(fc_total_rev.loc[dt], 2)
        })

forecast_df = pd.DataFrame(forecast_rows)

# ✅ Keep Jan–Mar 2026 only
forecast_df = forecast_df[forecast_df["month"].isin(["2026-01", "2026-02", "2026-03"])]

# =========================================================
# ✅ ADD RISK ZONES (RED ZONE / HIGH ALERT)
# =========================================================
# Rule-based, professional & explainable
# ✅ Red Zone = top 10% predicted total activity
forecast_df["pred_total_activity"] = (
    forecast_df["pred_total_enrollment"] +
    forecast_df["pred_bio_total_updates"] +
    forecast_df["pred_demo_total_updates"]
)

red_thresh = forecast_df["pred_total_activity"].quantile(0.90)
alert_thresh = forecast_df["pred_total_activity"].quantile(0.75)

def risk_tag(row):
    reasons = []

    if row["pred_total_activity"] >= red_thresh:
        risk = "Red Zone"
        reasons.append("Extreme Aadhaar load expected")
    elif row["pred_total_activity"] >= alert_thresh:
        risk = "High Alert"
        reasons.append("High Aadhaar load expected")
    else:
        risk = "Normal"

    if row["pred_child_share_pct"] > 35:
        reasons.append("High child enrollment expected")

    if row["pred_bio_major_updates_18_plus"] > row["pred_bio_minor_updates_5_17"]:
        reasons.append("Major biometric updates dominant")

    return risk, " | ".join(reasons)

forecast_df[["risk_category", "alert_reason"]] = forecast_df.apply(
    lambda r: pd.Series(risk_tag(r)), axis=1
)




  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._

In [19]:
# ✅ Final export
forecast_df.to_csv("state_district_forecast_full_2026_Q1.csv", index=False)

print("✅ FINAL EXPORT READY: state_district_forecast_full_2026_Q1.csv")
print(forecast_df.head(10))

✅ FINAL EXPORT READY: state_district_forecast_full_2026_Q1.csv
                         state                  district    month  \
0  Andaman And Nicobar Islands                   Nicobar  2026-01   
1  Andaman And Nicobar Islands                   Nicobar  2026-02   
2  Andaman And Nicobar Islands                   Nicobar  2026-03   
3  Andaman And Nicobar Islands  North And Middle Andaman  2026-01   
4  Andaman And Nicobar Islands  North And Middle Andaman  2026-02   
5  Andaman And Nicobar Islands  North And Middle Andaman  2026-03   
6  Andaman And Nicobar Islands             South Andaman  2026-01   
7  Andaman And Nicobar Islands             South Andaman  2026-02   
8  Andaman And Nicobar Islands             South Andaman  2026-03   
9               Andhra Pradesh     Alluri Sitharama Raju  2026-01   

   pred_total_enrollment  pred_infant_enroll_0_5  pred_student_enroll_5_17  \
0                      0                       0                         0   
1                    

In [20]:
forecast_df[["state","district"]].nunique()

state        36
district    771
dtype: int64

In [23]:
forecast_df.columns

Index(['state', 'district', 'month', 'pred_total_enrollment',
       'pred_infant_enroll_0_5', 'pred_student_enroll_5_17',
       'pred_adult_enroll_18_plus', 'pred_child_share_pct',
       'pred_bio_total_updates', 'pred_bio_minor_updates_5_17',
       'pred_bio_major_updates_18_plus', 'pred_demo_total_updates',
       'pred_demo_minor_updates_5_17', 'pred_demo_major_updates_18_plus',
       'pred_bio_revenue', 'pred_demo_revenue', 'pred_total_revenue',
       'pred_total_activity', 'risk_category', 'alert_reason'],
      dtype='object')