In [1]:
import pandas as pd

In [2]:
# Debug helper: report the working directory and the contents of data/raw
import os
from pathlib import Path

raw_path = Path("../data/raw")

print("Current working directory:", os.getcwd())
print("\nFiles in ../data/raw/:")
if not raw_path.exists():
    # The relative path is resolved against the working directory printed above.
    print("  WARNING: ../data/raw/ does not exist!")
else:
    # One indented line per directory entry, in sorted order.
    for entry in sorted(raw_path.iterdir()):
        print(f"  {entry.name}")

Current working directory: c:\MyProjects\uidai-asris\notebooks

Files in ../data/raw/:
  api_data_aadhar_biometric_1.csv
  api_data_aadhar_biometric_2.csv
  api_data_aadhar_biometric_3.csv
  api_data_aadhar_biometric_4.csv
  api_data_aadhar_demographic_1.csv
  api_data_aadhar_demographic_2.csv
  api_data_aadhar_demographic_3.csv
  api_data_aadhar_demographic_4.csv
  api_data_aadhar_demographic_5.csv
  api_data_aadhar_enrolment_1.csv
  api_data_aadhar_enrolment_2.csv
  api_data_aadhar_enrolment_3.csv


## 1. Enrolment Data

In [3]:
# Load only the first 5,000 rows of api_data_aadhar_enrolment_1.csv;
# a sample is enough for inspecting the schema.
df_enrolment = pd.read_csv(
    filepath_or_buffer="../data/raw/api_data_aadhar_enrolment_1.csv",
    nrows=5000,
)

In [4]:
# Preview the first rows of the enrolment sample to see the raw value formats.
df_enrolment.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21


In [5]:
# Dtypes and non-null counts per column for the enrolment sample.
df_enrolment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   date            5000 non-null   object
 1   state           5000 non-null   object
 2   district        5000 non-null   object
 3   pincode         5000 non-null   int64 
 4   age_0_5         5000 non-null   int64 
 5   age_5_17        5000 non-null   int64 
 6   age_18_greater  5000 non-null   int64 
dtypes: int64(4), object(3)
memory usage: 273.6+ KB


In [6]:
# Exact column names, as a plain list, for copying into the schema notes.
df_enrolment.columns.tolist()

['date',
 'state',
 'district',
 'pincode',
 'age_0_5',
 'age_5_17',
 'age_18_greater']

### ✏️ Enrolment Schema Notes

```
Enrolment schema notes:
state_col = "state"
district_col = "district"
date_col = "date"
count_cols = ["age_0_5", "age_5_17", "age_18_greater"]
age_band_cols = ["age_0_5", "age_5_17", "age_18_greater"]
```

---
## 2. Demographic Update Data

In [7]:
# Load only the first 5,000 rows of api_data_aadhar_demographic_1.csv;
# a sample is enough for inspecting the schema.
df_demo = pd.read_csv(
    filepath_or_buffer="../data/raw/api_data_aadhar_demographic_1.csv",
    nrows=5000,
)

In [8]:
# Preview the first rows of the demographic-update sample to see the raw value formats.
df_demo.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,01-03-2025,Uttar Pradesh,Gorakhpur,273213,49,529
1,01-03-2025,Andhra Pradesh,Chittoor,517132,22,375
2,01-03-2025,Gujarat,Rajkot,360006,65,765
3,01-03-2025,Andhra Pradesh,Srikakulam,532484,24,314
4,01-03-2025,Rajasthan,Udaipur,313801,45,785


In [9]:
# Dtypes and non-null counts per column for the demographic-update sample.
df_demo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   date           5000 non-null   object
 1   state          5000 non-null   object
 2   district       5000 non-null   object
 3   pincode        5000 non-null   int64 
 4   demo_age_5_17  5000 non-null   int64 
 5   demo_age_17_   5000 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 234.5+ KB


In [10]:
# Exact column names, as a plain list, for copying into the schema notes.
df_demo.columns.tolist()

['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']

### ✏️ Demographic Schema Notes

```
Demographic schema notes:
state_col = "state"
district_col = "district"
date_col = "date"
count_cols = ["demo_age_5_17", "demo_age_17_"]
age_band_cols = ["demo_age_5_17", "demo_age_17_"]
```

---
## 3. Biometric Update Data

In [11]:
# Load only the first 5,000 rows of api_data_aadhar_biometric_1.csv;
# a sample is enough for inspecting the schema.
df_biometric = pd.read_csv(
    filepath_or_buffer="../data/raw/api_data_aadhar_biometric_1.csv",
    nrows=5000,
)

In [12]:
# Preview the first rows of the biometric-update sample to see the raw value formats.
df_biometric.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,Haryana,Mahendragarh,123029,280,577
1,01-03-2025,Bihar,Madhepura,852121,144,369
2,01-03-2025,Jammu and Kashmir,Punch,185101,643,1091
3,01-03-2025,Bihar,Bhojpur,802158,256,980
4,01-03-2025,Tamil Nadu,Madurai,625514,271,815


In [13]:
# Dtypes and non-null counts per column for the biometric-update sample.
df_biometric.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          5000 non-null   object
 1   state         5000 non-null   object
 2   district      5000 non-null   object
 3   pincode       5000 non-null   int64 
 4   bio_age_5_17  5000 non-null   int64 
 5   bio_age_17_   5000 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 234.5+ KB


In [14]:
# Exact column names, as a plain list, for copying into the schema notes.
df_biometric.columns.tolist()

['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']

### ✏️ Biometric Schema Notes

```
Biometric schema notes:
state_col = "state"
district_col = "district"
date_col = "date"
count_cols = ["bio_age_5_17", "bio_age_17_"]
age_band_cols = ["bio_age_5_17", "bio_age_17_"]
```

---
## 4. Schema Consistency Check (All Files)

**Purpose:** Visually confirm that all CSV files of the same type share the same schema (columns).  
If any file has different columns, fix it before proceeding to aggregation.

In [15]:
# Schema consistency check: read only the header of each raw file, print its
# columns, and flag any file whose columns differ from the first file of the
# same dataset type. If any file is flagged, fix it before aggregation.

from pathlib import Path


def report_schemas(title, files):
    """Print each file's column list under a banner and report mismatches.

    The first file in ``files`` is taken as the reference schema; every other
    file is compared to it by exact column names and column order.

    Parameters
    ----------
    title : str
        Banner text printed above the file listing.
    files : list of pathlib.Path
        CSV files that are expected to share one schema.

    Returns
    -------
    dict
        Maps file name -> column list for every file whose columns differ
        from the reference. Empty when all files agree or ``files`` is empty.
    """
    print("=" * 60)
    print(title)
    print("=" * 60)

    if not files:
        # An empty match would otherwise look like a silent "all good".
        print("  WARNING: no files matched for this dataset type\n")
        return {}

    reference_cols = None
    mismatched = {}
    for f in files:
        # nrows=0 parses the header line only; no data rows are needed here.
        cols = pd.read_csv(f, nrows=0).columns.tolist()
        print(f"{f.name}: {cols}\n")
        if reference_cols is None:
            reference_cols = cols
        elif cols != reference_cols:
            mismatched[f.name] = cols

    if mismatched:
        print(
            f"  WARNING: columns differ from {files[0].name} in: "
            f"{sorted(mismatched)}\n"
        )
    else:
        print(f"  OK: all {len(files)} file(s) share the same columns\n")
    return mismatched


raw_dir = Path("../data/raw")

# Glob patterns for each dataset type
enrol_files = sorted(raw_dir.glob("api_data_aadhar_enrolment_*.csv"))
demo_files = sorted(raw_dir.glob("api_data_aadhar_demographic_*.csv"))
bio_files = sorted(raw_dir.glob("api_data_aadhar_biometric_*.csv"))

for title, files in [
    ("ENROLMENT FILES", enrol_files),
    ("DEMOGRAPHIC UPDATE FILES", demo_files),
    ("BIOMETRIC UPDATE FILES", bio_files),
]:
    report_schemas(title, files)

ENROLMENT FILES
api_data_aadhar_enrolment_1.csv: ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']

api_data_aadhar_enrolment_2.csv: ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']

api_data_aadhar_enrolment_3.csv: ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']

DEMOGRAPHIC UPDATE FILES
api_data_aadhar_demographic_1.csv: ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']

api_data_aadhar_demographic_2.csv: ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']

api_data_aadhar_demographic_3.csv: ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']

api_data_aadhar_demographic_4.csv: ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']

api_data_aadhar_demographic_5.csv: ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']

BIOMETRIC UPDATE FILES
api_data_aadhar_biometric_1.csv: ['dat

---
## 📋 Summary & Next Steps

### Key Reminders:

1. **Note the exact column names** for:
   - State
   - District
   - PIN code (if present)
   - Date/period

2. **Identify columns containing:**
   - Age bands (e.g., `age_0_5`, `age_5_17`, `age_18_greater`)
   - Counts/totals

3. **Future aggregation unit:** `district × month`
   - All downstream processing will aggregate data at this level
   - Ensure date column can be parsed to extract year-month

---

*Once you have filled in the notes above, proceed to the next phase: data cleaning and aggregation.*