In [2]:
# Core data libraries
import pandas as pd
import numpy as np

# Display settings for better inspection
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 120)

In [3]:
from pathlib import Path

# Base raw data directory
RAW_DATA_PATH = Path("../data/raw")

# Dataset-specific paths
ENROL_PATH = RAW_DATA_PATH / "enrolment"
BIO_PATH   = RAW_DATA_PATH / "biometric"
DEMO_PATH  = RAW_DATA_PATH / "demographic"


In [4]:
# Load all enrolment CSV files
enrol_files = list(ENROL_PATH.glob("*.csv"))

enrol_df = pd.concat(
    [pd.read_csv(file) for file in enrol_files],
    ignore_index=True
)

print("Enrolment Dataset Shape:", enrol_df.shape)
enrol_df.head()

Enrolment Dataset Shape: (1006029, 7)


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21


In [5]:
# Load all biometric update CSV files
bio_files = list(BIO_PATH.glob("*.csv"))

bio_df = pd.concat(
    [pd.read_csv(file) for file in bio_files],
    ignore_index=True
)

print("Biometric Dataset Shape:", bio_df.shape)
bio_df.head()


Biometric Dataset Shape: (1861108, 6)


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,Haryana,Mahendragarh,123029,280,577
1,01-03-2025,Bihar,Madhepura,852121,144,369
2,01-03-2025,Jammu and Kashmir,Punch,185101,643,1091
3,01-03-2025,Bihar,Bhojpur,802158,256,980
4,01-03-2025,Tamil Nadu,Madurai,625514,271,815


In [6]:
# Load all demographic update CSV files
demo_files = list(DEMO_PATH.glob("*.csv"))

demo_df = pd.concat(
    [pd.read_csv(file) for file in demo_files],
    ignore_index=True
)

print("Demographic Dataset Shape:", demo_df.shape)
demo_df.head()

Demographic Dataset Shape: (2071700, 6)


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,01-03-2025,Uttar Pradesh,Gorakhpur,273213,49,529
1,01-03-2025,Andhra Pradesh,Chittoor,517132,22,375
2,01-03-2025,Gujarat,Rajkot,360006,65,765
3,01-03-2025,Andhra Pradesh,Srikakulam,532484,24,314
4,01-03-2025,Rajasthan,Udaipur,313801,45,785


In [7]:
# Check column names and data types
print("Enrolment Columns:", enrol_df.columns.tolist())
print("Biometric Columns:", bio_df.columns.tolist())
print("Demographic Columns:", demo_df.columns.tolist())

Enrolment Columns: ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']
Biometric Columns: ['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']
Demographic Columns: ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']


In [8]:
# Standardize date column across all datasets
for df in [enrol_df, bio_df, demo_df]:
    df["date"] = pd.to_datetime(df["date"], dayfirst=True, errors="coerce")

# Verify conversion
enrol_df["date"].head()

0   2025-03-02
1   2025-03-09
2   2025-03-09
3   2025-03-09
4   2025-03-09
Name: date, dtype: datetime64[ns]

In [9]:
# Missing value summary
print("Missing values (Enrolment):")
print(enrol_df.isna().sum())

print("\nMissing values (Biometric):")
print(bio_df.isna().sum())

print("\nMissing values (Demographic):")
print(demo_df.isna().sum())

# Duplicate check
print("\nDuplicate rows:")
print("Enrolment:", enrol_df.duplicated().sum())
print("Biometric:", bio_df.duplicated().sum())
print("Demographic:", demo_df.duplicated().sum())

Missing values (Enrolment):
date              0
state             0
district          0
pincode           0
age_0_5           0
age_5_17          0
age_18_greater    0
dtype: int64

Missing values (Biometric):
date            0
state           0
district        0
pincode         0
bio_age_5_17    0
bio_age_17_     0
dtype: int64

Missing values (Demographic):
date             0
state            0
district         0
pincode          0
demo_age_5_17    0
demo_age_17_     0
dtype: int64

Duplicate rows:
Enrolment: 22957
Biometric: 94896
Demographic: 473601


In [17]:
# Drop pincode before aggregation (non-additive field)
enrol_df_clean = enrol_df.drop(columns=["pincode"])
bio_df_clean   = bio_df.drop(columns=["pincode"])
demo_df_clean  = demo_df.drop(columns=["pincode"])

# Aggregate enrolment data (MONTH END)
enrol_agg = (
    enrol_df_clean
    .groupby(["state", "district", pd.Grouper(key="date", freq="ME")])
    .sum(numeric_only=True)
    .reset_index()
)


# Aggregate biometric updates
bio_agg = (
    bio_df_clean
    .groupby(["state", "district", pd.Grouper(key="date", freq="ME")])
    .sum(numeric_only=True)
    .reset_index()
)


# Aggregate demographic updates
demo_agg = (
    demo_df_clean
    .groupby(["state", "district", pd.Grouper(key="date", freq="ME")])
    .sum(numeric_only=True)
    .reset_index()
)




In [18]:
# Total enrolment across age groups
enrol_agg["total_enrolment"] = (
    enrol_agg["age_0_5"] +
    enrol_agg["age_5_17"] +
    enrol_agg["age_18_greater"]
)

# Total biometric updates
bio_agg["total_biometric_updates"] = (
    bio_agg["bio_age_5_17"] +
    bio_agg["bio_age_17_"]
)

# Total demographic updates
demo_agg["total_demographic_updates"] = (
    demo_agg["demo_age_5_17"] +
    demo_agg["demo_age_17_"]
)


In [19]:
# Merge enrolment with biometric updates
master_df = enrol_agg.merge(
    bio_agg[["state", "district", "date", "total_biometric_updates"]],
    on=["state", "district", "date"],
    how="left"
)

# Merge demographic updates
master_df = master_df.merge(
    demo_agg[["state", "district", "date", "total_demographic_updates"]],
    on=["state", "district", "date"],
    how="left"
)

# Replace missing updates with zero
master_df[["total_biometric_updates", "total_demographic_updates"]] = (
    master_df[["total_biometric_updates", "total_demographic_updates"]]
    .fillna(0)
)

master_df.head()


Unnamed: 0,state,district,date,age_0_5,age_5_17,age_18_greater,total_enrolment,total_biometric_updates,total_demographic_updates
0,100000,100000,2025-09-30,0,0,12,12,0.0,0.0
1,100000,100000,2025-10-31,0,1,0,1,0.0,0.0
2,100000,100000,2025-11-30,0,0,11,11,0.0,0.0
3,100000,100000,2025-12-31,0,0,194,194,0.0,2.0
4,Andaman & Nicobar Islands,Andamans,2025-09-30,23,4,0,27,317.0,162.0


In [20]:
# Output path
PROCESSED_PATH = Path("../data/processed")
PROCESSED_PATH.mkdir(exist_ok=True)

# Save master dataset
master_df.to_csv(
    PROCESSED_PATH / "master_district_month.csv",
    index=False
)

print("Processed master dataset saved successfully.")


Processed master dataset saved successfully.
