## 1. Setup & Load

In [1]:
import pandas as pd

# Read the district-month panel written out by Phase 2
panel_csv = "../data/processed/district_month_panel_duckdb.csv"
panel = pd.read_csv(panel_csv)

# Report (rows, columns) as a quick check that the load succeeded
print(f"panel.shape: {panel.shape}")

panel.shape: (4355, 10)


In [2]:
# Preview the first five rows of the panel
panel.head(n=5)

Unnamed: 0,state,district,year_month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_
0,Assam,Tinsukia,2025-03,84.0,170.0,101.0,499.0,7786.0,1135.0,846.0
1,Uttar Pradesh,Maharajganj,2025-03,54.0,181.0,47.0,2613.0,34402.0,5914.0,5062.0
2,Rajasthan,Ajmer,2025-04,1007.0,196.0,36.0,477.0,3459.0,5509.0,7665.0
3,Uttarakhand,Dehradun,2025-04,510.0,138.0,26.0,492.0,4348.0,5431.0,7762.0
4,Uttar Pradesh,Rampur,2025-04,894.0,333.0,19.0,756.0,5993.0,15673.0,4784.0


In [3]:
# Print the column names, non-null counts, dtypes and memory usage of the panel
panel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4355 entries, 0 to 4354
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   state           4355 non-null   object 
 1   district        4355 non-null   object 
 2   year_month      4355 non-null   object 
 3   age_0_5         4355 non-null   float64
 4   age_5_17        4355 non-null   float64
 5   age_18_greater  4355 non-null   float64
 6   demo_age_5_17   4355 non-null   float64
 7   demo_age_17_    4355 non-null   float64
 8   bio_age_5_17    4355 non-null   float64
 9   bio_age_17_     4355 non-null   float64
dtypes: float64(7), object(3)
memory usage: 340.4+ KB


In [4]:
# Summary statistics for every column: include="all" adds count/unique/top/freq
# for the text columns alongside the numeric summaries
panel.describe(include="all")

Unnamed: 0,state,district,year_month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_
count,4355,4355,4355,4355.0,4355.0,4355.0,4355.0,4355.0,4355.0,4355.0
unique,52,939,9,,,,,,,
top,Uttar Pradesh,Hooghly,2025-09,,,,,,,
freq,410,16,988,,,,,,,
mean,,,,739.039724,313.619288,21.91504,810.905855,7450.232147,4162.344432,4047.305626
std,,,,882.94767,527.645178,58.595972,1016.900896,10568.452892,5303.466285,5495.415491
min,,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,,,,102.0,21.0,0.0,108.0,1014.0,348.5,413.5
50%,,,,474.0,101.0,4.0,465.0,3944.0,2376.0,2285.0
75%,,,,1064.0,356.5,20.0,1089.5,9288.5,5739.0,5614.5


---
## 2. Data Quality Checks

In [5]:
# Count nulls in every column and list the columns with the most nulls first
null_flags = panel.isna()
missing = null_flags.sum().sort_values(ascending=False)

print("Missing values per column:")
print(missing)

Missing values per column:
state             0
district          0
year_month        0
age_0_5           0
age_5_17          0
age_18_greater    0
demo_age_5_17     0
demo_age_17_      0
bio_age_5_17      0
bio_age_17_       0
dtype: int64


In [6]:
# Unique counts for key columns
n_states = panel["state"].nunique()
n_districts = panel["district"].nunique()  # distinct district *names* only
n_months = panel["year_month"].nunique()

# A district name on its own is not a unique key in this panel (the same name
# can appear under more than one `state` value), so also count the distinct
# (state, district) pairs, which is the actual number of district series.
n_state_districts = len(panel[["state", "district"]].drop_duplicates())

print(f"Unique states: {n_states}")
print(f"Unique districts: {n_districts}")
print(f"Unique year_month values: {n_months}")
print(f"Unique (state, district) pairs: {n_state_districts}")

Unique states: 52
Unique districts: 939
Unique year_month values: 9


In [7]:
# Collect the distinct year_month labels and print them in ascending order
month_labels = panel["year_month"].unique()
chronological_months = sorted(month_labels)

print("Year-month values in panel:")
print(chronological_months)

Year-month values in panel:
['2025-03', '2025-04', '2025-05', '2025-06', '2025-07', '2025-09', '2025-10', '2025-11', '2025-12']


### Data Quality Summary

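- **Total rows:** 4,355 district-month rows (`panel.shape[0]`), with 10 columns.
- **States:** 52 unique `state` values. India has 36 states and union territories, so the extra values point to inconsistent spellings or naming variants that should be standardised before any state-level comparison.
- **Districts:** 939 unique district names. A name on its own is not a unique key: the most frequent name (`Hooghly`) occurs 16 times although only 9 months are covered, so districts should be identified by the (`state`, `district`) pair.
- **Months:** 9 distinct year-month periods, from 2025-03 to 2025-12. 2025-08 is absent from the panel.
- **Missing values:** All columns report zero missing values, so no imputation is needed at this stage.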

---
## 3. Aggregate Features

Create total columns for enrolment, demographic updates, and biometric updates.

In [8]:
# Derive one total per activity type by adding up its age-band columns
panel["total_enrolment"] = (
    panel["age_0_5"].add(panel["age_5_17"]).add(panel["age_18_greater"])
)
panel["total_demo_updates"] = panel["demo_age_5_17"].add(panel["demo_age_17_"])
panel["total_bio_updates"] = panel["bio_age_5_17"].add(panel["bio_age_17_"])

# Confirm the three new columns were appended to the panel
print("New columns added: total_enrolment, total_demo_updates, total_bio_updates")
print(f"\nUpdated panel.shape: {panel.shape}")

New columns added: total_enrolment, total_demo_updates, total_bio_updates

Updated panel.shape: (4355, 13)


In [9]:
# Summary statistics restricted to the three derived total columns
total_columns = ["total_enrolment", "total_demo_updates", "total_bio_updates"]
panel.loc[:, total_columns].describe()

Unnamed: 0,total_enrolment,total_demo_updates,total_bio_updates
count,4355.0,4355.0,4355.0
mean,1074.574053,8261.138002,8209.650057
std,1306.649847,11370.577092,10100.168003
min,1.0,1.0,1.0
25%,156.0,1168.0,902.0
50%,636.0,4500.0,4866.0
75%,1523.0,10372.0,11731.5
max,13877.0,133650.0,89018.0


In [10]:
# Sanity check: the five district-months with the largest total_enrolment,
# shown with their identifying columns and the three totals
print("Top 5 district-months by total_enrolment:")
report_columns = [
    "state",
    "district",
    "year_month",
    "total_enrolment",
    "total_demo_updates",
    "total_bio_updates",
]
panel[report_columns].nlargest(n=5, columns="total_enrolment")

Top 5 district-months by total_enrolment:


Unnamed: 0,state,district,year_month,total_enrolment,total_demo_updates,total_bio_updates
3073,West Bengal,Murshidabad,2025-09,13877.0,75748.0,24559.0
1721,West Bengal,North 24 Parganas,2025-09,10642.0,59799.0,24381.0
2065,West Bengal,South 24 Parganas,2025-10,9838.0,43610.0,19421.0
585,West Bengal,South 24 Parganas,2025-09,9459.0,73008.0,22167.0
2264,Rajasthan,Jaipur,2025-09,9131.0,25948.0,49314.0


---
## 4. EDA by State

Aggregate metrics at the state level to see which states have the highest activity.

In [11]:
# State-level summary: sum and mean of each total, plus the number of rows
# (non-null `district` values) each state contributes
state_aggregations = {
    f"{metric}_{stat}": (metric, stat)
    for metric in ("total_enrolment", "total_demo_updates", "total_bio_updates")
    for stat in ("sum", "mean")
}
state_aggregations["n_district_months"] = ("district", "count")

rows_by_state = panel.groupby("state", as_index=False)
state_summary = rows_by_state.agg(**state_aggregations).sort_values(
    "total_enrolment_sum", ascending=False
)

print(f"state_summary.shape: {state_summary.shape}")

state_summary.shape: (52, 8)


In [12]:
# state_summary is already sorted by total_enrolment_sum (descending),
# so the first ten rows are the top ten states by total enrolment
state_summary.head(n=10)

Unnamed: 0,state,total_enrolment_sum,total_enrolment_mean,total_demo_updates_sum,total_demo_updates_mean,total_bio_updates_sum,total_bio_updates_mean,n_district_months
42,Uttar Pradesh,831059.0,2026.973171,6736936.0,16431.55122,5281356.0,12881.356098,410
25,Madhya Pradesh,476233.0,1531.29582,1778131.0,5717.463023,3166579.0,10181.926045,311
5,Bihar,449304.0,2316.0,3930130.0,20258.402062,1887270.0,9728.195876,194
26,Maharashtra,346115.0,1401.275304,4369950.0,17692.105263,5936040.0,24032.550607,247
37,Rajasthan,340405.0,1702.025,1520149.0,7600.745,2632436.0,13162.18,200
48,West Bengal,334071.0,1845.696133,3029070.0,16735.19337,1145529.0,6328.889503,181
39,Tamil Nadu,217605.0,1265.145349,1610344.0,9362.465116,2088349.0,12141.563953,172
15,Gujarat,205176.0,1057.608247,1371430.0,7069.226804,1739846.0,8968.278351,194
21,Karnataka,196599.0,910.180556,1032251.0,4778.939815,1098439.0,5085.365741,216
4,Assam,161897.0,909.533708,930169.0,5225.668539,632148.0,3551.393258,178


### State-Level Interpretation

- **Top states by enrolment:** The states with the highest `total_enrolment_sum` are likely the most populous or have the most active Aadhaar enrolment drives.
- **Demographic vs biometric updates:** Compare `total_demo_updates_sum` and `total_bio_updates_sum` to see if some states have higher update activity relative to enrolments.
- **Mean values:** The `_mean` columns show per-district-month averages, helping identify states with high activity density vs. spread across many districts.
- **District-month counts:** `n_district_months` indicates how many data points each state contributes to the panel.

---
## 5. EDA Over Time (year_month)

Aggregate metrics by month to see trends over time.

In [13]:
# Month-level summary: totals across all rows of each year_month, plus the
# number of rows (non-null `district` values) observed in that month
monthly_aggregations = {
    "total_enrolment_sum": ("total_enrolment", "sum"),
    "total_demo_updates_sum": ("total_demo_updates", "sum"),
    "total_bio_updates_sum": ("total_bio_updates", "sum"),
    "n_districts": ("district", "count"),
}

rows_by_month = panel.groupby("year_month", as_index=False)
month_summary = rows_by_month.agg(**monthly_aggregations).sort_values("year_month")

print(f"month_summary.shape: {month_summary.shape}")

month_summary.shape: (9, 5)


In [14]:
# Display the whole month-level summary; it has one row per year_month,
# so it is small enough to show in full
month_summary

Unnamed: 0,year_month,total_enrolment_sum,total_demo_updates_sum,total_bio_updates_sum,n_districts
0,2025-03,14261.0,1443338.0,932719.0,64
1,2025-04,99330.0,919969.0,1992391.0,102
2,2025-05,80997.0,727750.0,1509987.0,76
3,2025-06,56556.0,526541.0,1082369.0,60
4,2025-07,283528.0,1204489.0,3009125.0,129
5,2025-09,1475846.0,7323807.0,6654804.0,988
6,2025-10,817825.0,5010386.0,4582486.0,973
7,2025-11,1091974.0,9386639.0,7285329.0,982
8,2025-12,759453.0,9434337.0,8703816.0,981


### Time-Series Interpretation

- **Trend direction:** Look at whether `total_enrolment_sum` is increasing, decreasing, or stable over months.
- **Spikes or dips:** Any month with unusually high or low values may indicate data quality issues, seasonal patterns, or special events.
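- **Coverage consistency:** The `n_districts` column is the number of district-month rows in each period. It rises from 60–129 rows per month in 2025-03 through 2025-07 to 973–988 rows per month from 2025-09 onward, and 2025-08 is missing entirely. The early months are therefore only partially covered, and monthly totals before and after that break are not directly comparable.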

---
## 6. State × Time Patterns

Identify which state-month combinations have the highest activity.

In [15]:
# State × month summary: totals for every (state, year_month) pair plus the
# number of rows (non-null `district` values) in it, largest enrolment first
rows_by_state_month = panel.groupby(["state", "year_month"], as_index=False)

state_month_totals = rows_by_state_month.agg(
    total_enrolment_sum=pd.NamedAgg(column="total_enrolment", aggfunc="sum"),
    total_demo_updates_sum=pd.NamedAgg(column="total_demo_updates", aggfunc="sum"),
    total_bio_updates_sum=pd.NamedAgg(column="total_bio_updates", aggfunc="sum"),
    n_districts=pd.NamedAgg(column="district", aggfunc="count"),
)
state_month_summary = state_month_totals.sort_values(
    "total_enrolment_sum", ascending=False
)

print(f"state_month_summary.shape: {state_month_summary.shape}")

state_month_summary.shape: (278, 6)


In [16]:
# state_month_summary is already sorted by total_enrolment_sum (descending),
# so the first twenty rows are the busiest state-month combinations
state_month_summary.head(n=20)

Unnamed: 0,state,year_month,total_enrolment_sum,total_demo_updates_sum,total_bio_updates_sum,n_districts
237,Uttar Pradesh,2025-09,261079.0,1400222.0,1026524.0,85
239,Uttar Pradesh,2025-11,182695.0,1523686.0,1097873.0,85
29,Bihar,2025-09,143565.0,1218096.0,372101.0,47
238,Uttar Pradesh,2025-10,138152.0,731752.0,576876.0,84
142,Madhya Pradesh,2025-09,132517.0,324691.0,474497.0,60
205,Rajasthan,2025-09,123584.0,293200.0,690954.0,37
264,West Bengal,2025-09,119636.0,844337.0,282629.0,46
240,Uttar Pradesh,2025-12,116927.0,1848703.0,1156161.0,85
31,Bihar,2025-11,105624.0,892828.0,404966.0,47
144,Madhya Pradesh,2025-11,103539.0,321843.0,398519.0,60


---
## 7. Phase 3.2 – Target Definition (Next Phase)

This section outlines possible target variables for modeling. **No models are built here.**

### Potential Target Options

1. **Regression: Next-month total enrolment per district**
   - Predict `total_enrolment` for month $t+1$ given features from month $t$.
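   - Requires creating lag features and shifting the target within each (`state`, `district`) series. Because 2025-08 is absent from the panel, a plain one-row shift would pair 2025-07 with 2025-09, so the gap must be handled explicitly.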

2. **Classification: Activity level (high / medium / low)**
   - Bin `total_enrolment` into categories based on quantiles or domain thresholds.
   - Predict which category a district-month will fall into.

3. **Regression: Demographic update volume**
   - Predict `total_demo_updates` based on enrolment history and district characteristics.

4. **Anomaly detection: Unusual activity spikes**
   - Flag district-months where enrolment or updates deviate significantly from historical norms.

---

*Next step: Choose one target, engineer features, and prepare train/test splits in Phase 3.2.*