---
## 1. Setup

In [1]:
import pandas as pd
from pathlib import Path

# Folder holding the raw CSV exports; relative, so it resolves against the
# notebook's current working directory.
RAW_DIR = Path("..") / "data" / "raw"

In [2]:
# Debug: show the absolute location RAW_DIR points to and the CSV files in it
print("RAW_DIR resolved:", RAW_DIR.resolve())
print("\nCSV files found:")
csv_paths = sorted(RAW_DIR.glob("*.csv"))  # sorted so the listing order is stable
for csv_path in csv_paths:
    print(f"  {csv_path.name}")

RAW_DIR resolved: C:\MyProjects\uidai-asris\data\raw

CSV files found:
  api_data_aadhar_biometric_1.csv
  api_data_aadhar_biometric_2.csv
  api_data_aadhar_biometric_3.csv
  api_data_aadhar_biometric_4.csv
  api_data_aadhar_demographic_1.csv
  api_data_aadhar_demographic_2.csv
  api_data_aadhar_demographic_3.csv
  api_data_aadhar_demographic_4.csv
  api_data_aadhar_demographic_5.csv
  api_data_aadhar_enrolment_1.csv
  api_data_aadhar_enrolment_2.csv
  api_data_aadhar_enrolment_3.csv


---
## 2. Concatenate All Files Per Dataset

Read all matching CSVs for each dataset type and concatenate into one raw dataframe.

In [3]:
# Enrolment: read every api_data_aadhar_enrolment_*.csv part and stack them
# into one raw dataframe with a fresh 0..n-1 index.
enrol_files = sorted(RAW_DIR.glob("api_data_aadhar_enrolment_*.csv"))

# Fail early with an actionable message: RAW_DIR is relative to the working
# directory, and pd.concat on an empty list would otherwise raise the opaque
# "No objects to concatenate".
if not enrol_files:
    raise FileNotFoundError(
        f"No api_data_aadhar_enrolment_*.csv files found in {RAW_DIR.resolve()}"
    )

enrol_raw_all = pd.concat([pd.read_csv(f) for f in enrol_files], ignore_index=True)

print(f"Enrolment: {len(enrol_files)} files loaded")
print(f"enrol_raw_all.shape: {enrol_raw_all.shape}")
enrol_raw_all.head(3)

Enrolment: 3 files loaded
enrol_raw_all.shape: (1006029, 7)


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12


In [4]:
# Demographic: read every api_data_aadhar_demographic_*.csv part and stack
# them into one raw dataframe with a fresh 0..n-1 index.
demo_files = sorted(RAW_DIR.glob("api_data_aadhar_demographic_*.csv"))

# Fail early with an actionable message: RAW_DIR is relative to the working
# directory, and pd.concat on an empty list would otherwise raise the opaque
# "No objects to concatenate".
if not demo_files:
    raise FileNotFoundError(
        f"No api_data_aadhar_demographic_*.csv files found in {RAW_DIR.resolve()}"
    )

demo_raw_all = pd.concat([pd.read_csv(f) for f in demo_files], ignore_index=True)

print(f"Demographic: {len(demo_files)} files loaded")
print(f"demo_raw_all.shape: {demo_raw_all.shape}")
demo_raw_all.head(3)

Demographic: 5 files loaded
demo_raw_all.shape: (2071700, 6)


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,01-03-2025,Uttar Pradesh,Gorakhpur,273213,49,529
1,01-03-2025,Andhra Pradesh,Chittoor,517132,22,375
2,01-03-2025,Gujarat,Rajkot,360006,65,765


In [5]:
# Biometric: read every api_data_aadhar_biometric_*.csv part and stack them
# into one raw dataframe with a fresh 0..n-1 index.
bio_files = sorted(RAW_DIR.glob("api_data_aadhar_biometric_*.csv"))

# Fail early with an actionable message: RAW_DIR is relative to the working
# directory, and pd.concat on an empty list would otherwise raise the opaque
# "No objects to concatenate".
if not bio_files:
    raise FileNotFoundError(
        f"No api_data_aadhar_biometric_*.csv files found in {RAW_DIR.resolve()}"
    )

bio_raw_all = pd.concat([pd.read_csv(f) for f in bio_files], ignore_index=True)

print(f"Biometric: {len(bio_files)} files loaded")
print(f"bio_raw_all.shape: {bio_raw_all.shape}")
bio_raw_all.head(3)

Biometric: 4 files loaded
bio_raw_all.shape: (1861108, 6)


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,Haryana,Mahendragarh,123029,280,577
1,01-03-2025,Bihar,Madhepura,852121,144,369
2,01-03-2025,Jammu and Kashmir,Punch,185101,643,1091


---
## 3. Clean Dates & Create `year_month`

Convert `date` column from `dd-mm-yyyy` to datetime, create `year_month` as `"YYYY-MM"`, and keep only needed columns.

In [6]:
# Enrolment: convert the dd-mm-yyyy `date` strings to datetimes (written back
# onto the raw frame), derive a "YYYY-MM" `year_month` label from them, then
# take a copy holding only the key and count columns.
enrol_dates = pd.to_datetime(enrol_raw_all["date"], format="%d-%m-%Y")
enrol_raw_all["date"] = enrol_dates
enrol_raw_all["year_month"] = enrol_dates.dt.to_period("M").astype(str)

enrol_keep_cols = ["state", "district", "year_month", "age_0_5", "age_5_17", "age_18_greater"]
enrol_clean = enrol_raw_all.loc[:, enrol_keep_cols].copy()

print("enrol_clean columns:", list(enrol_clean.columns))
enrol_clean.head(3)

enrol_clean columns: ['state', 'district', 'year_month', 'age_0_5', 'age_5_17', 'age_18_greater']


Unnamed: 0,state,district,year_month,age_0_5,age_5_17,age_18_greater
0,Meghalaya,East Khasi Hills,2025-03,11,61,37
1,Karnataka,Bengaluru Urban,2025-03,14,33,39
2,Uttar Pradesh,Kanpur Nagar,2025-03,29,82,12


In [7]:
# Demographic: convert the dd-mm-yyyy `date` strings to datetimes (written back
# onto the raw frame), derive a "YYYY-MM" `year_month` label from them, then
# take a copy holding only the key and count columns.
demo_dates = pd.to_datetime(demo_raw_all["date"], format="%d-%m-%Y")
demo_raw_all["date"] = demo_dates
demo_raw_all["year_month"] = demo_dates.dt.to_period("M").astype(str)

demo_keep_cols = ["state", "district", "year_month", "demo_age_5_17", "demo_age_17_"]
demo_clean = demo_raw_all.loc[:, demo_keep_cols].copy()

print("demo_clean columns:", list(demo_clean.columns))
demo_clean.head(3)

demo_clean columns: ['state', 'district', 'year_month', 'demo_age_5_17', 'demo_age_17_']


Unnamed: 0,state,district,year_month,demo_age_5_17,demo_age_17_
0,Uttar Pradesh,Gorakhpur,2025-03,49,529
1,Andhra Pradesh,Chittoor,2025-03,22,375
2,Gujarat,Rajkot,2025-03,65,765


In [8]:
# Biometric: convert the dd-mm-yyyy `date` strings to datetimes (written back
# onto the raw frame), derive a "YYYY-MM" `year_month` label from them, then
# take a copy holding only the key and count columns.
bio_dates = pd.to_datetime(bio_raw_all["date"], format="%d-%m-%Y")
bio_raw_all["date"] = bio_dates
bio_raw_all["year_month"] = bio_dates.dt.to_period("M").astype(str)

bio_keep_cols = ["state", "district", "year_month", "bio_age_5_17", "bio_age_17_"]
bio_clean = bio_raw_all.loc[:, bio_keep_cols].copy()

print("bio_clean columns:", list(bio_clean.columns))
bio_clean.head(3)

bio_clean columns: ['state', 'district', 'year_month', 'bio_age_5_17', 'bio_age_17_']


Unnamed: 0,state,district,year_month,bio_age_5_17,bio_age_17_
0,Haryana,Mahendragarh,2025-03,280,577
1,Bihar,Madhepura,2025-03,144,369
2,Jammu and Kashmir,Punch,2025-03,643,1091


---
## 4. Aggregate to District × Month

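Group by `[state, district, year_month]` and sum the count columns for each dataset. Note that some raw rows carry the numeric value `100000` in both `state` and `district` instead of a name; they surface as the first rows of the enrolment and demographic aggregates below and should be reviewed before the data is used downstream.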

In [9]:
# Enrolment aggregation: one row per (state, district, year_month) holding the
# summed enrolment counts of each age band.
# NOTE(review): the recorded output of this cell starts with rows whose state
# and district are both 100000 — these look like invalid keys in the raw data;
# confirm and decide whether to filter them before aggregating.
enrol_agg = (
    enrol_clean
    .groupby(["state", "district", "year_month"], as_index=False)[
        ["age_0_5", "age_5_17", "age_18_greater"]
    ]
    .sum()
)

print(f"enrol_agg.shape: {enrol_agg.shape}")
enrol_agg.head(3)

enrol_agg.shape: (5062, 6)


Unnamed: 0,state,district,year_month,age_0_5,age_5_17,age_18_greater
0,100000,100000,2025-09,0,0,12
1,100000,100000,2025-10,0,1,0
2,100000,100000,2025-11,0,0,11


In [10]:
# Demographic aggregation: one row per (state, district, year_month) holding
# the summed demographic counts of each age band.
# NOTE(review): the recorded output of this cell starts with a row whose state
# and district are both 100000 — this looks like an invalid key in the raw
# data; confirm and decide whether to filter it before aggregating.
demo_agg = (
    demo_clean
    .groupby(["state", "district", "year_month"], as_index=False)[
        ["demo_age_5_17", "demo_age_17_"]
    ]
    .sum()
)

print(f"demo_agg.shape: {demo_agg.shape}")
demo_agg.head(3)

demo_agg.shape: (6072, 5)


Unnamed: 0,state,district,year_month,demo_age_5_17,demo_age_17_
0,100000,100000,2025-12,0,2
1,Andaman & Nicobar Islands,Andamans,2025-09,3,159
2,Andaman & Nicobar Islands,Andamans,2025-10,2,73


In [11]:
# Biometric aggregation: one row per (state, district, year_month) holding the
# summed biometric counts of each age band.
bio_agg = (
    bio_clean
    .groupby(["state", "district", "year_month"], as_index=False)[
        ["bio_age_5_17", "bio_age_17_"]
    ]
    .sum()
)

print(f"bio_agg.shape: {bio_agg.shape}")
bio_agg.head(3)

bio_agg.shape: (8507, 5)


Unnamed: 0,state,district,year_month,bio_age_5_17,bio_age_17_
0,Andaman & Nicobar Islands,Andamans,2025-03,16,193
1,Andaman & Nicobar Islands,Andamans,2025-04,17,167
2,Andaman & Nicobar Islands,Andamans,2025-05,22,158


---
## 5. Join Into One Wide Panel

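Merge `enrol_agg`, `demo_agg`, and `bio_agg` on `[state, district, year_month]` using inner joins, so only district-months present in all three datasets are kept (4,355 rows here, versus 5,062 / 6,072 / 8,507 rows in the individual aggregates).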

In [12]:
# Join the three district × month aggregates into one wide panel.
merge_keys = ["state", "district", "year_month"]

# Inner joins keep only the key combinations present in all three aggregates.
# validate="one_to_one" makes pandas raise MergeError if a key combination is
# duplicated on either side, so a broken upstream aggregation cannot silently
# multiply rows in the panel.
district_month_panel = (
    enrol_agg
    .merge(demo_agg, on=merge_keys, how="inner", validate="one_to_one")
    .merge(bio_agg, on=merge_keys, how="inner", validate="one_to_one")
)

print(f"district_month_panel.shape: {district_month_panel.shape}")
print(f"Columns: {district_month_panel.columns.tolist()}")
district_month_panel.head(5)

district_month_panel.shape: (4355, 10)
Columns: ['state', 'district', 'year_month', 'age_0_5', 'age_5_17', 'age_18_greater', 'demo_age_5_17', 'demo_age_17_', 'bio_age_5_17', 'bio_age_17_']


Unnamed: 0,state,district,year_month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_
0,Andaman & Nicobar Islands,Andamans,2025-09,23,4,0,3,159,76,241
1,Andaman & Nicobar Islands,Andamans,2025-10,15,0,0,2,73,43,139
2,Andaman & Nicobar Islands,Andamans,2025-11,13,0,0,0,212,48,174
3,Andaman & Nicobar Islands,Andamans,2025-12,19,1,0,2,299,90,232
4,Andaman & Nicobar Islands,South Andaman,2025-09,15,0,0,3,69,24,87


In [13]:
# Sanity check: dtype / non-null overview first, then missing values per column
district_month_panel.info()
print("\nNull counts:", district_month_panel.isna().sum(), sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4355 entries, 0 to 4354
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   state           4355 non-null   object
 1   district        4355 non-null   object
 2   year_month      4355 non-null   object
 3   age_0_5         4355 non-null   int64 
 4   age_5_17        4355 non-null   int64 
 5   age_18_greater  4355 non-null   int64 
 6   demo_age_5_17   4355 non-null   int64 
 7   demo_age_17_    4355 non-null   int64 
 8   bio_age_5_17    4355 non-null   int64 
 9   bio_age_17_     4355 non-null   int64 
dtypes: int64(7), object(3)
memory usage: 340.4+ KB

Null counts:
state             0
district          0
year_month        0
age_0_5           0
age_5_17          0
age_18_greater    0
demo_age_5_17     0
demo_age_17_      0
bio_age_5_17      0
bio_age_17_       0
dtype: int64


In [14]:
# Save the final panel to CSV.
# Path is already imported in the setup cell at the top of the notebook, so it
# is not re-imported here (keeps all imports in one place).
PROCESSED_DIR = Path("../data/processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)  # create the folder on first run, no error if it exists

output_path = PROCESSED_DIR / "district_month_panel.csv"
# index=False: the panel's index is not written to the file
district_month_panel.to_csv(output_path, index=False)

print(f"Saved to: {output_path.resolve()}")

Saved to: C:\MyProjects\uidai-asris\data\processed\district_month_panel.csv


---
## Phase 2 Summary

**What was done in this notebook:**

- **Files read and concatenated:**
  - Enrolment: `api_data_aadhar_enrolment_*.csv` → `enrol_raw_all`
  - Demographic: `api_data_aadhar_demographic_*.csv` → `demo_raw_all`
  - Biometric: `api_data_aadhar_biometric_*.csv` → `bio_raw_all`

- **`year_month` created:**
  - Parsed `date` column from `dd-mm-yyyy` format
  - Derived `year_month` as `"YYYY-MM"` string

- **Aggregations at district × month level:**
  - `enrol_agg`: sums of `age_0_5`, `age_5_17`, `age_18_greater`
  - `demo_agg`: sums of `demo_age_5_17`, `demo_age_17_`
  - `bio_agg`: sums of `bio_age_5_17`, `bio_age_17_`

- **Final wide panel prepared:**
  - `district_month_panel` = inner join of all three aggregates on `[state, district, year_month]`
  - Contains 10 columns: 3 keys + 3 enrolment + 2 demographic + 2 biometric

---

*Next phase: Feature engineering, EDA, or modeling.*