# Aadhaar Usage Metrics Pipeline
Ingest biometric, demographic, and enrolment CSV shards; build daily/state features; export compact JSON/CSV for UI graphs.

## Data Dictionary
- **Biometric**: bio_age_5_17, bio_age_17_ (successful biometric authentications)
- **Demographic**: demo_age_5_17, demo_age_17_ (successful demographic authentications)
- **Enrolment**: age_0_5, age_5_17, age_18_greater (new enrolments)
- Keys: date, state, district, pincode
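- Visual targets: daily enrolments, biometric vs. demographic usage, usage-to-enrolment share ratios (usage totals divided by total enrolments), and child share (age 0–5 enrolments divided by total enrolments).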

In [14]:
import json
from pathlib import Path
import numpy as np
import pandas as pd

# Render floats with thousands separators and three decimals in displayed frames.
pd.set_option("display.float_format", lambda v: f"{v:,.3f}")

# Inputs are looked up in ../Dataset relative to the current working directory;
# each source's CSVs sit in a folder nested inside a same-named folder.
base_dir = Path.cwd()
dataset_root = base_dir.parent.joinpath("Dataset").resolve()
biometric_dir = dataset_root.joinpath("api_data_aadhar_biometric", "api_data_aadhar_biometric")
demographic_dir = dataset_root.joinpath("api_data_aadhar_demographic", "api_data_aadhar_demographic")
enrolment_dir = dataset_root.joinpath("api_data_aadhar_enrolment", "api_data_aadhar_enrolment")

# Outputs: a local "processed" folder plus the UI's data folder.
output_dir = base_dir.joinpath("processed")
ui_data_dir = base_dir.parent.joinpath("UI", "src", "data")
for _target_dir in (output_dir, ui_data_dir):
    # Create missing parents and tolerate re-runs of the cell.
    _target_dir.mkdir(parents=True, exist_ok=True)

# Bare expression so the notebook echoes the resolved locations.
biometric_dir, demographic_dir, enrolment_dir, output_dir

(WindowsPath('D:/Hackathons  & Competitions/UIDAI/Dataset/api_data_aadhar_biometric/api_data_aadhar_biometric'),
 WindowsPath('D:/Hackathons  & Competitions/UIDAI/Dataset/api_data_aadhar_demographic/api_data_aadhar_demographic'),
 WindowsPath('D:/Hackathons  & Competitions/UIDAI/Dataset/api_data_aadhar_enrolment/api_data_aadhar_enrolment'),
 WindowsPath('d:/Hackathons  & Competitions/UIDAI/Datascience/processed'))

In [15]:
def load_concat_csv(folder: Path, dtype_map: dict) -> pd.DataFrame:
    """Read every ``*.csv`` directly inside ``folder`` into one date-ordered frame.

    Args:
        folder: Directory searched (non-recursively) for ``*.csv`` files.
        dtype_map: Column-to-dtype mapping forwarded to ``pd.read_csv``.

    Returns:
        All files stacked row-wise, sorted by the ``date`` column, with a
        fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If ``folder`` contains no ``*.csv`` file.
    """
    # Sorted so files are always read in the same (lexicographic) order.
    csv_paths = sorted(folder.glob("*.csv"))
    if len(csv_paths) == 0:
        raise FileNotFoundError(f"No CSV files found in {folder}")

    # The ``date`` column is parsed as a datetime, reading day before month.
    shards = [
        pd.read_csv(csv_path, dtype=dtype_map, parse_dates=["date"], dayfirst=True)
        for csv_path in csv_paths
    ]
    stacked = pd.concat(shards, ignore_index=True)
    return stacked.sort_values("date").reset_index(drop=True)

# Key columns are typed identically in all three sources; only the count
# columns differ, so each map is the shared keys plus that source's counts.
_shared_key_dtypes = {"state": "category", "district": "category", "pincode": "int32"}
bio_dtype = {**_shared_key_dtypes, "bio_age_5_17": "int32", "bio_age_17_": "int32"}
demo_dtype = {**_shared_key_dtypes, "demo_age_5_17": "int32", "demo_age_17_": "int32"}
enrol_dtype = {
    **_shared_key_dtypes,
    "age_0_5": "int32",
    "age_5_17": "int32",
    "age_18_greater": "int32",
}

# Load each source's CSV shards into a single date-sorted frame.
bio = load_concat_csv(biometric_dir, bio_dtype)
demo = load_concat_csv(demographic_dir, demo_dtype)
enrol = load_concat_csv(enrolment_dir, enrol_dtype)

# Bare expression so the notebook echoes the three frame shapes.
bio.shape, demo.shape, enrol.shape

((1861108, 6), (2071700, 6), (1006029, 7))

In [17]:
# Per-row totals across the age bands of each source.
bio["bio_total"] = bio["bio_age_5_17"] + bio["bio_age_17_"]
demo["demo_total"] = demo["demo_age_5_17"] + demo["demo_age_17_"]
enrol["enrol_total"] = enrol["age_0_5"] + enrol["age_5_17"] + enrol["age_18_greater"]

group_keys = ["date", "state", "district", "pincode"]

# Collapse duplicate key rows within each source.
# observed=True: state/district are requested as categoricals at load time; on
# pandas versions whose default is observed=False, grouping by categorical keys
# emits every category combination rather than only the combinations present.
bio_g = bio.groupby(group_keys, as_index=False, observed=True)[
    ["bio_age_5_17", "bio_age_17_", "bio_total"]
].sum()
demo_g = demo.groupby(group_keys, as_index=False, observed=True)[
    ["demo_age_5_17", "demo_age_17_", "demo_total"]
].sum()
enrol_g = enrol.groupby(group_keys, as_index=False, observed=True)[
    ["age_0_5", "age_5_17", "age_18_greater", "enrol_total"]
].sum()

# Outer joins keep keys that appear in only some of the sources.
merged = enrol_g.merge(bio_g, on=group_keys, how="outer").merge(demo_g, on=group_keys, how="outer")
# A key missing from a source yields NaN counts; treat those as zero. Only the
# measure columns are filled so the key columns (possibly categorical) are
# never asked to accept the value 0.
measure_cols = [col for col in merged.columns if col not in group_keys]
merged[measure_cols] = merged[measure_cols].fillna(0)

# Roll district/pincode detail up to one row per (date, state).
daily_state = merged.groupby(["date", "state"], as_index=False, observed=True)[
    [
        "enrol_total", "age_0_5", "age_5_17", "age_18_greater",
        "bio_total", "bio_age_5_17", "demo_total", "demo_age_5_17",
    ]
].sum()

# Ratios relative to total enrolments. Zero enrolments are masked to NaN so the
# division does not produce inf; the resulting NaN ratios are then reported as 0.
denom = daily_state["enrol_total"].mask(daily_state["enrol_total"] == 0, np.nan)
daily_state["biometric_share"] = daily_state["bio_total"].div(denom)
daily_state["demographic_share"] = daily_state["demo_total"].div(denom)
daily_state["child_share"] = daily_state["age_0_5"].div(denom)
for share_col in ["biometric_share", "demographic_share", "child_share"]:
    daily_state[share_col] = daily_state[share_col].fillna(0.0)

# National per-day totals, computed directly from the row-level frames.
bio_daily_totals = bio.groupby("date", as_index=False)[["bio_total", "bio_age_5_17", "bio_age_17_"]].sum()
demo_daily_totals = demo.groupby("date", as_index=False)[["demo_total", "demo_age_5_17", "demo_age_17_"]].sum()
enrol_daily_totals = enrol.groupby("date", as_index=False)[["enrol_total", "age_0_5", "age_5_17", "age_18_greater"]].sum()

# Dates present in only some sources get zero counts for the others.
daily_national = (
    enrol_daily_totals
    .merge(bio_daily_totals, on="date", how="outer")
    .merge(demo_daily_totals, on="date", how="outer")
    .fillna(0)
    .sort_values("date")
)

# Two-series line chart payload: 0-5 enrolments vs. 5-17 biometric counts per day.
velocity_line = (
    daily_national.loc[:, ["date", "age_0_5", "bio_age_5_17"]]
    .rename(columns={"age_0_5": "enrolment_0_5", "bio_age_5_17": "biometric_5_17"})
)
# Dates become YYYY-MM-DD strings so the records are JSON-serialisable; that
# format also sorts chronologically as plain text.
velocity_line_serialized = velocity_line.assign(
    date=velocity_line["date"].dt.strftime("%Y-%m-%d")
).sort_values("date").to_dict("records")

def build_ohlcv(frame: pd.DataFrame, value_col: str) -> pd.DataFrame:
    """Summarise the per-state daily spread of ``value_col`` as OHLCV-style rows.

    ``frame`` is first collapsed to one total per (date, state). For each date,
    the distribution of those state totals is reduced to:

    * ``open``   - 25th percentile (linear interpolation)
    * ``high``   - maximum
    * ``low``    - minimum
    * ``close``  - 75th percentile (linear interpolation)
    * ``volume`` - sum over all states

    Args:
        frame: Frame with a datetime ``date`` column, a ``state`` column and
            the numeric column named by ``value_col``.
        value_col: Name of the column to total and summarise.

    Returns:
        One row per date, ordered by date, with columns ``time`` (the date as
        a ``YYYY-MM-DD`` string) followed by ``open``, ``high``, ``low``,
        ``close`` and ``volume`` rounded to whole numbers and stored as int64.
    """
    # observed=True keeps only (date, state) pairs that actually occur. With a
    # categorical ``state`` and observed=False, every unobserved pair would
    # appear with a total of 0, pulling ``low`` to 0 and skewing the quartiles.
    state_daily = frame.groupby(["date", "state"], as_index=False, observed=True)[value_col].sum()
    summary = state_daily.groupby("date")[value_col].agg(
        open=lambda x: x.quantile(0.25, interpolation="linear"),
        high="max",
        low="min",
        close=lambda x: x.quantile(0.75, interpolation="linear"),
        volume="sum",
    ).reset_index()
    summary.sort_values("date", inplace=True)
    numeric_cols = ["open", "high", "low", "close", "volume"]
    # Quartiles can be fractional; round before casting so values are not truncated.
    summary[numeric_cols] = summary[numeric_cols].round(0).astype("int64")
    summary["time"] = summary["date"].dt.strftime("%Y-%m-%d")
    return summary[["time", *numeric_cols]]

# One OHLCV record list per source, keyed by source label.
_ohlcv_sources = (
    ("biometric", bio_g, "bio_total"),
    ("enrolment", enrol_g, "enrol_total"),
    ("demographic", demo_g, "demo_total"),
)
velocity_ohlcv = {
    label: build_ohlcv(source_frame, total_col).to_dict("records")
    for label, source_frame, total_col in _ohlcv_sources
}

# Whole-period totals per state, keeping the ten states with the most enrolments.
_state_totals = daily_state.groupby("state", as_index=False)[
    ["enrol_total", "bio_total", "demo_total"]
].sum()
top_states = _state_totals.sort_values("enrol_total", ascending=False).head(10)

# Preview: earliest date first, largest enrolment first within a date.
display(daily_state.sort_values(by=["date", "enrol_total"], ascending=[True, False]).head())
velocity_line.tail()

Unnamed: 0,date,state,enrol_total,age_0_5,age_5_17,age_18_greater,bio_total,bio_age_5_17,demo_total,demo_age_5_17,biometric_share,demographic_share,child_share
0,2025-03-01,Andaman & Nicobar Islands,0.0,0.0,0.0,0.0,209.0,16.0,0.0,0.0,0.0,0.0,0.0
1,2025-03-01,Andaman and Nicobar Islands,0.0,0.0,0.0,0.0,2494.0,1596.0,1338.0,126.0,0.0,0.0,0.0
2,2025-03-01,Andhra Pradesh,0.0,0.0,0.0,0.0,403296.0,243777.0,513040.0,48600.0,0.0,0.0,0.0
3,2025-03-01,Arunachal Pradesh,0.0,0.0,0.0,0.0,7400.0,2953.0,7809.0,852.0,0.0,0.0,0.0
4,2025-03-01,Assam,0.0,0.0,0.0,0.0,92931.0,59101.0,202037.0,16692.0,0.0,0.0,0.0


Unnamed: 0,date,enrolment_0_5,biometric_5_17
110,2025-12-26,38632.0,210820.0
111,2025-12-27,37352.0,228096.0
112,2025-12-28,34049.0,77112.0
113,2025-12-29,48717.0,160213.0
114,2025-12-31,62596.0,0.0


In [18]:
def write_outputs(frame: pd.DataFrame, name: str) -> None:
    """Write ``frame`` as ``<name>.csv`` and ``<name>.json``, mirroring the JSON to the UI.

    The CSV and one JSON copy go to ``output_dir``; a second JSON copy goes to
    ``ui_data_dir``. JSON is written as a list of records with ISO-formatted
    dates. Each written path is printed.

    Args:
        frame: Data to export.
        name: File stem shared by all three output files.
    """
    csv_path = output_dir / f"{name}.csv"
    json_path, ui_json_path = (folder / f"{name}.json" for folder in (output_dir, ui_data_dir))

    frame.to_csv(csv_path, index=False)
    for destination in (json_path, ui_json_path):
        frame.to_json(destination, orient="records", date_format="iso")

    for message in (
        f"Saved {csv_path}",
        f"Saved {json_path}",
        f"Mirrored to UI at {ui_json_path}",
    ):
        print(message)

def write_json_payload(payload, name: str) -> None:
    """Dump ``payload`` as indented JSON to ``<name>.json`` in both output locations.

    One copy is written under ``output_dir`` and one under ``ui_data_dir``,
    both UTF-8 encoded with a two-space indent. Each written path is printed.

    Args:
        payload: Any object ``json.dump`` can serialise.
        name: File stem for the JSON file.
    """
    json_path = output_dir / f"{name}.json"
    ui_json_path = ui_data_dir / f"{name}.json"

    for destination in (json_path, ui_json_path):
        with destination.open("w", encoding="utf-8") as handle:
            json.dump(payload, handle, indent=2)

    print(f"Saved {json_path}")
    print(f"Mirrored to UI at {ui_json_path}")

# Tabular exports: CSV + JSON locally, JSON mirrored to the UI.
for _frame, _stem in (
    (daily_state, "aadhaar_daily_state"),
    (daily_national, "aadhaar_daily_national"),
):
    write_outputs(_frame, _stem)

# Pre-serialised chart payloads: JSON only.
for _payload, _stem in (
    (velocity_line_serialized, "velocityData"),
    (velocity_ohlcv, "velocityDataOHLCV"),
):
    write_json_payload(_payload, _stem)

# Bare expression so the notebook echoes the top-ten table.
top_states

Saved d:\Hackathons  & Competitions\UIDAI\Datascience\processed\aadhaar_daily_state.csv
Saved d:\Hackathons  & Competitions\UIDAI\Datascience\processed\aadhaar_daily_state.json
Mirrored to UI at d:\Hackathons  & Competitions\UIDAI\UI\src\data\aadhaar_daily_state.json
Saved d:\Hackathons  & Competitions\UIDAI\Datascience\processed\aadhaar_daily_national.csv
Saved d:\Hackathons  & Competitions\UIDAI\Datascience\processed\aadhaar_daily_national.json
Mirrored to UI at d:\Hackathons  & Competitions\UIDAI\UI\src\data\aadhaar_daily_national.json
Saved d:\Hackathons  & Competitions\UIDAI\Datascience\processed\velocityData.json
Mirrored to UI at d:\Hackathons  & Competitions\UIDAI\UI\src\data\velocityData.json
Saved d:\Hackathons  & Competitions\UIDAI\Datascience\processed\velocityDataOHLCV.json
Mirrored to UI at d:\Hackathons  & Competitions\UIDAI\UI\src\data\velocityDataOHLCV.json


Unnamed: 0,state,enrol_total,bio_total,demo_total
54,Uttar Pradesh,1018629.0,9577735.0,8542328.0
7,Bihar,609585.0,4897587.0,4814350.0
32,Madhya Pradesh,493970.0,5923771.0,2912938.0
61,West Bengal,375297.0,2524448.0,3872172.0
33,Maharashtra,369139.0,9226139.0,5054602.0
47,Rajasthan,348458.0,3994955.0,2817615.0
19,Gujarat,280549.0,3196514.0,1824327.0
5,Assam,230197.0,982722.0,1012578.0
27,Karnataka,223235.0,2635954.0,1695285.0
49,Tamil Nadu,220789.0,4698117.0,2212228.0
