# Puducherry Analysis - Digital Equity Index (DEI)

This notebook performs analysis of Aadhaar enrollment and update data for **Puducherry (UT)**.

## Official Districts (4)
Puducherry, Karaikal, Mahe, Yanam

## Data Sources
Uses **Unified/Merged** datasets (duplicates removed from `puducherry` & `pondi` sets):
- `puducherry_unified_enrollment.csv`
- `puducherry_unified_demographic.csv`
- `puducherry_unified_biometric.csv`

In [18]:
import numpy as np
import pandas as pd
import os

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)

## 1. Data Loading (Unified)

In [19]:
BASE_PATH = r'c:\Users\Admin\OneDrive\Documents\AADHAR Hackathon\puducherry'
DATA_PATH = DATA_PATH = os.path.join(BASE_PATH, 'data')

# Unified files are in the main directory

enrol_df = pd.read_csv(os.path.join(DATA_PATH, 'puducherry_unified_enrollment.csv'))
demo_df = pd.read_csv(os.path.join(DATA_PATH, 'puducherry_unified_demographic.csv'))
bio_df = pd.read_csv(os.path.join(DATA_PATH, 'puducherry_unified_biometric.csv'))

print(f'Enrollment: {len(enrol_df):,} | Demographic: {len(demo_df):,} | Biometric: {len(bio_df):,}')

Enrollment: 2,062 | Demographic: 5,540 | Biometric: 13,559


## 2. Data Cleaning

In [20]:
# Official 4 Districts
OFFICIAL_DISTRICTS = {
    'puducherry', 
    'karaikal', 
    'mahe', 
    'yanam'
}

DISTRICT_CLEANUP_MAP = {
    'pondicherry': 'puducherry',
    'pondy': 'puducherry'
    # 'viluppuram' is explicitly DROPPED as it belongs to TN
}

def clean_district_name(name):
    if pd.isna(name):
        return None
    
    # Lowercase and strip
    cleaned = str(name).strip().lower()
    
    # Remove asterisk
    if cleaned.endswith(' *'):
        cleaned = cleaned[:-2].strip()
    if cleaned.endswith('*'):
        cleaned = cleaned[:-1].strip()
    
    # Apply mapping
    if cleaned in DISTRICT_CLEANUP_MAP:
        cleaned = DISTRICT_CLEANUP_MAP[cleaned]
        
    return cleaned

def clean_state_name(name):
    return 'Puducherry'

print(f'Official districts: {len(OFFICIAL_DISTRICTS)}')

Official districts: 4


In [21]:
# Apply cleaning and VALIDATE check
for df in [enrol_df, demo_df, bio_df]:
    # 1. Clean District
    df['district_clean'] = df['district'].apply(clean_district_name)
    
    # 2. Clean State (Ensure uniformity)
    df['state'] = df['state'].apply(clean_state_name)
    
    # 3. Check for unknowns
    # We explicitly FILTER OUT anything not in OFFICIAL_DISTRICTS (like Viluppuram)
    dropped = df[~df['district_clean'].isin(OFFICIAL_DISTRICTS)]['district'].unique()
    if len(dropped) > 0:
        print(f'⚠️ DROPPING the following non-official districts:\n{dropped}')
            
    # 4. Apply Strict Filter
    # Drop NaNs AND non-official districts
    df.dropna(subset=['district_clean'], inplace=True)
    df.drop(df[~df['district_clean'].isin(OFFICIAL_DISTRICTS)].index, inplace=True)
    
    df['district'] = df['district_clean']
    df.drop(columns=['district_clean'], inplace=True)

    # 5. Dates
    df['date'] = pd.to_datetime(df['date'], dayfirst=True)
    df['month'] = df['date'].dt.month

# Verify
all_cleaned = set(enrol_df['district'].unique()) | set(demo_df['district'].unique()) | set(bio_df['district'].unique())
print(f'\nFinal Districts ({len(all_cleaned)}):')
print(sorted(all_cleaned))

if len(all_cleaned) != 4:
    print(f'⚠️ Warning: Expected 4 districts, found {len(all_cleaned)}.')
else:
    print('✅ Exactly 4 districts found!')

⚠️ DROPPING the following non-official districts:
['Cuddalore']
⚠️ DROPPING the following non-official districts:
['Viluppuram' 'Cuddalore']

Final Districts (3):
['karaikal', 'puducherry', 'yanam']


## 3. Aggregation & Metrics

In [22]:
# Aggregate
enrol_agg = enrol_df.groupby(['state', 'district', 'month'])[['age_0_5', 'age_5_17', 'age_18_greater']].sum().reset_index()
demo_agg = demo_df.groupby(['state', 'district', 'month'])[['demo_age_5_17', 'demo_age_17_']].sum().reset_index()
bio_agg = bio_df.groupby(['state', 'district', 'month'])[['bio_age_5_17', 'bio_age_17_']].sum().reset_index()

combined_df = enrol_agg.merge(demo_agg, on=['state', 'district', 'month'], how='outer') \
                       .merge(bio_agg, on=['state', 'district', 'month'], how='outer')
combined_df.fillna(0, inplace=True)

# Core metrics
combined_df['E'] = combined_df['age_0_5'] + combined_df['age_5_17'] + combined_df['age_18_greater']
combined_df['DU'] = combined_df['demo_age_5_17'] + combined_df['demo_age_17_']
combined_df['BU'] = combined_df['bio_age_5_17'] + combined_df['bio_age_17_']
combined_df['U'] = combined_df['DU'] + combined_df['BU']
combined_df['T'] = combined_df['E'] + combined_df['U']

print(f'Combined records: {len(combined_df)}')

Combined records: 31


In [23]:
# District-level aggregation
district_df = combined_df.groupby(['state', 'district']).agg(
    total_months=('month', 'count'),
    active_months=('T', lambda x: (x > 0).sum()),
    total_E=('E', 'sum'), total_DU=('DU', 'sum'), total_BU=('BU', 'sum'),
    total_U=('U', 'sum'), total_T=('T', 'sum'),
    avg_monthly_enrolment=('E', 'mean'),
    monthly_volatility=('T', lambda x: x.std(ddof=0) / x.mean() if x.mean() > 0 else 0),
    peak_load_ratio=('T', lambda x: x.max() / x.mean() if x.mean() > 0 else 0),
    sum_age_0_5=('age_0_5', 'sum'), sum_age_5_17=('age_5_17', 'sum')
).reset_index()

district_df['zero_months'] = district_df['total_months'] - district_df['active_months']
district_df['activity_ratio'] = district_df['active_months'] / district_df['total_months']
district_df['zero_month_ratio'] = district_df['zero_months'] / district_df['total_months']
district_df['biometric_burden'] = (district_df['total_BU'] / (district_df['total_BU'] + district_df['total_DU'])).fillna(0)
district_df['update_dominant'] = np.where(district_df['total_U'] > district_df['total_E'], 1, 0)
district_df['enrollment_update_balance'] = (district_df['total_E'] / (district_df['total_E'] + district_df['total_U'])).fillna(0)

print(f'Districts computed: {len(district_df)}')

Districts computed: 3


## 4. DEI Score Calculation

In [24]:
def normalize(x):
    min_val, max_val = x.min(), x.max()
    if max_val == min_val:
        return pd.Series([0.5] * len(x), index=x.index)
    return (x - min_val) / (max_val - min_val)

def inverse_normalize(x):
    return 1 - normalize(x)

scores_df = district_df.copy()

# DEI Components
scores_df['access'] = (scores_df['activity_ratio'] + normalize(scores_df['avg_monthly_enrolment'])) / 2
scores_df['responsiveness'] = normalize(scores_df['total_U'] / scores_df['total_T'])
scores_df['inclusion'] = normalize((scores_df['sum_age_0_5'] + scores_df['sum_age_5_17']) / scores_df['total_E'])
scores_df['stability'] = (inverse_normalize(scores_df['monthly_volatility']) + inverse_normalize(scores_df['peak_load_ratio'])) / 2
scores_df['visibility'] = scores_df['activity_ratio']

# Final scores
scores_df['DEI'] = (scores_df['access'] + scores_df['responsiveness'] + scores_df['inclusion'] + scores_df['stability'] + scores_df['visibility']) / 5
scores_df['ASS'] = (inverse_normalize(scores_df['activity_ratio']) + inverse_normalize(scores_df['avg_monthly_enrolment'])) / 2
scores_df['UBS'] = (normalize(scores_df['biometric_burden']) + normalize(scores_df['update_dominant'])) / 2
scores_df['SRS'] = (normalize(scores_df['monthly_volatility']) + normalize(scores_df['zero_month_ratio'])) / 2

print('DEI calculated!')
scores_df[['district', 'DEI', 'ASS', 'UBS', 'SRS']].sort_values('DEI', ascending=False)

DEI calculated!


Unnamed: 0,district,DEI,ASS,UBS,SRS
1,puducherry,0.73934,0.25,0.558479,0.25
0,karaikal,0.578774,0.639357,0.75,0.466775
2,yanam,0.502419,0.75,0.25,0.75


In [25]:
print(f'=== Puducherry Summary ===')
print(f'Districts: {len(scores_df)}')
print(f'Avg DEI: {scores_df["DEI"].mean():.4f}')
print(f'Best: {scores_df.loc[scores_df["DEI"].idxmax(), "district"]} ({scores_df["DEI"].max():.4f})')
print(f'Worst: {scores_df.loc[scores_df["DEI"].idxmin(), "district"]} ({scores_df["DEI"].min():.4f})')

=== Puducherry Summary ===
Districts: 3
Avg DEI: 0.6068
Best: puducherry (0.7393)
Worst: yanam (0.5024)


In [26]:
# Save outputs
scores_df.to_csv(os.path.join(BASE_PATH, 'puducherry_district_analysis.csv'), index=False)
scores_df[['state', 'district', 'DEI', 'ASS', 'UBS', 'SRS']].to_csv(
    os.path.join(BASE_PATH, 'puducherry_district_final_scores.csv'), index=False)
print('✅ Saved!')

✅ Saved!
