# Tamil Nadu State Analysis - Digital Equity Index (DEI)

This notebook analyzes Aadhaar enrollment and update data for Tamil Nadu's **38 official districts** and computes a district-level Digital Equity Index (DEI).

## Official Tamil Nadu Districts (38)
Ariyalur, Chengalpattu, Chennai, Coimbatore, Cuddalore, Dharmapuri, Dindigul, Erode, Kallakurichi, Kanchipuram, Kanyakumari, Karur, Krishnagiri, Madurai, Mayiladuthurai, Nagapattinam, Namakkal, Nilgiris, Perambalur, Pudukkottai, Ramanathapuram, Ranipet, Salem, Sivagangai, Tenkasi, Thanjavur, Theni, Thoothukudi, Tiruchirappalli, Tirunelveli, Tirupattur, Tiruppur, Tiruvallur, Tiruvannamalai, Tiruvarur, Vellore, Viluppuram, Virudhunagar.

In [1]:
import numpy as np
import pandas as pd
import os

# Show up to 200 rows and every column when a DataFrame is displayed.
for option_name, option_value in (
    ('display.max_rows', 200),
    ('display.max_columns', None),
):
    pd.set_option(option_name, option_value)

## 1. Data Loading

In [2]:
# Folder holding the three input CSVs (also used later for the output CSVs).
# Defaults to the original local folder; set the TAMILNADU_DATA_DIR
# environment variable to run the notebook on another machine without
# editing this cell.
BASE_PATH = os.environ.get(
    'TAMILNADU_DATA_DIR',
    r'c:\Users\Admin\OneDrive\Documents\AADHAR Hackathon\tamilnadu',
)
# Note: Files are in root of tamilnadu folder based on scan

# One raw frame per dataset: enrollment, demographic updates, biometric updates.
enrol_df = pd.read_csv(os.path.join(BASE_PATH, 'tamilnadu_enrollment.csv'))
demo_df = pd.read_csv(os.path.join(BASE_PATH, 'tamilnadu_demographic.csv'))
bio_df = pd.read_csv(os.path.join(BASE_PATH, 'tamilnadu_biometric.csv'))

# Row counts as a quick sanity check on what was loaded.
print(f'Enrollment: {len(enrol_df):,} | Demographic: {len(demo_df):,} | Biometric: {len(bio_df):,}')

Enrollment: 111,325 | Demographic: 232,752 | Biometric: 534,767


In [3]:
# Peek at the first ten raw district names in sorted order.
# Missing values are excluded first: sorted() cannot compare NaN (a float)
# with the string names and would raise a TypeError.
print('Raw district names sample:', sorted(enrol_df['district'].dropna().unique())[:10])

Raw district names sample: ['Ariyalur', 'Chengalpattu', 'Chennai', 'Coimbatore', 'Cuddalore', 'Dharmapuri', 'Dindigul', 'Erode', 'Kallakurichi', 'Kancheepuram']


## 2. Data Cleaning - Comprehensive Mapping

In [4]:
# The 38 official Tamil Nadu district names, lowercase, grouped by initial
# letter. Cleaned district names are validated against this set.
OFFICIAL_DISTRICTS = {
    'ariyalur',
    'chengalpattu', 'chennai', 'coimbatore', 'cuddalore',
    'dharmapuri', 'dindigul',
    'erode',
    'kallakurichi', 'kanchipuram', 'kanyakumari', 'karur', 'krishnagiri',
    'madurai', 'mayiladuthurai',
    'nagapattinam', 'namakkal', 'nilgiris',
    'perambalur', 'pudukkottai',
    'ramanathapuram', 'ranipet',
    'salem', 'sivagangai',
    'tenkasi', 'thanjavur', 'theni', 'thoothukudi',
    'tiruchirappalli', 'tirunelveli', 'tirupattur', 'tiruppur',
    'tiruvallur', 'tiruvannamalai', 'tiruvarur',
    'vellore', 'viluppuram', 'virudhunagar',
}

# Lookup table from a known spelling variant (lowercase) to the official
# district name it stands for. Built from (variant, official) pairs.
DISTRICT_CLEANUP_MAP = dict([
    # Variants found in the raw data scan
    ('kancheepuram', 'kanchipuram'),
    ('kanniyakumari', 'kanyakumari'),
    ('sivaganga', 'sivagangai'),
    ('the nilgiris', 'nilgiris'),
    ('thiruvallur', 'tiruvallur'),
    ('thiruvarur', 'tiruvarur'),
    ('thoothukkudi', 'thoothukudi'),
    ('tuticorin', 'thoothukudi'),
    ('tirupathur', 'tirupattur'),
    ('villupuram', 'viluppuram'),

    # Precautionary entries (potential backups)
    ('tiruneveli', 'tirunelveli'),
    ('thiruvannamalai', 'tiruvannamalai'),
    # clean_district_name strips a trailing asterisk before it consults this
    # table, so this key is never reached through that function.
    ('virudhunagar *', 'virudhunagar'),
])

def clean_district_name(name):
    """Normalize a raw district name to its canonical lowercase form.

    Steps: lowercase, trim, collapse internal whitespace runs to a single
    space, strip any trailing asterisk marker(s), then translate known
    spelling variants through DISTRICT_CLEANUP_MAP.

    Args:
        name: Raw district value; converted with str() unless it is missing.

    Returns:
        None when ``name`` is missing (per ``pd.isna``); otherwise the
        normalized string. Names with no entry in DISTRICT_CLEANUP_MAP are
        returned normalized but otherwise unchanged.
    """
    if pd.isna(name):
        return None

    # Lowercase, trim, and collapse runs of whitespace so that e.g. a
    # double-spaced 'The  Nilgiris' still matches its mapping key.
    cleaned = ' '.join(str(name).lower().split())

    # Remove every trailing asterisk together with the spaces around it
    # ('name *', 'name*', 'name **' all reduce to 'name').
    cleaned = cleaned.rstrip('* ')

    # Apply mapping; unknown names pass through unchanged
    return DISTRICT_CLEANUP_MAP.get(cleaned, cleaned)

# Report the sizes of the official list and the variant lookup table.
print(
    f'Official districts: {len(OFFICIAL_DISTRICTS)}',
    f'Cleanup mappings: {len(DISTRICT_CLEANUP_MAP)}',
    sep='\n',
)

Official districts: 38
Cleanup mappings: 13


In [5]:
# Apply cleaning and VALIDATE (Zero Drop Policy)
# Each frame is modified in place so later cells keep using the same names.
for df in [enrol_df, demo_df, bio_df]:
    # 1. Clean: normalize every raw name
    df['district_clean'] = df['district'].apply(clean_district_name)

    # 2. Check for unknowns: report raw names that still fall outside the
    #    official list after cleaning (missing values show up here too)
    unknowns = df[~df['district_clean'].isin(OFFICIAL_DISTRICTS)]['district'].unique()
    if len(unknowns) > 0:
        print(f'⚠️ CRITICAL: The following districts are still NOT mapped to the official list:\n{unknowns}')
        print('Please update DISTRICT_CLEANUP_MAP.')

    # 3. Apply. Rows with a missing district are removed here; report how
    #    many so the drop is never silent.
    n_missing = int(df['district_clean'].isna().sum())
    if n_missing > 0:
        print(f'⚠️ Dropping {n_missing:,} rows with a missing district value.')
    df.dropna(subset=['district_clean'], inplace=True)
    df['district'] = df['district_clean']
    df.drop(columns=['district_clean'], inplace=True)

    # 4. Dates (parsed day-first)
    df['date'] = pd.to_datetime(df['date'], dayfirst=True)
    # NOTE(review): only the calendar month is kept, so the same month of
    # different years would be merged downstream — confirm the data covers a
    # single year.
    df['month'] = df['date'].dt.month

# Verify against the official list itself, not just the count: a matching
# count could hide an unknown name standing in for a missing official one.
all_cleaned = set(enrol_df['district'].unique()) | set(demo_df['district'].unique()) | set(bio_df['district'].unique())

print(f'\nFinal Districts ({len(all_cleaned)}):')
print(sorted(all_cleaned))

missing_official = sorted(OFFICIAL_DISTRICTS - all_cleaned)
unexpected_names = sorted(all_cleaned - OFFICIAL_DISTRICTS)

if missing_official or unexpected_names:
    print(f'⚠️ Warning: Found {len(all_cleaned)} districts, expected {len(OFFICIAL_DISTRICTS)}. Check if some official districts are absent in data.')
    if missing_official:
        print(f'   Official districts absent from data: {missing_official}')
    if unexpected_names:
        print(f'   Names not in the official list: {unexpected_names}')
else:
    print(f'✅ Exactly {len(OFFICIAL_DISTRICTS)} districts found! No data lost from known districts.')


Final Districts (38):
['ariyalur', 'chengalpattu', 'chennai', 'coimbatore', 'cuddalore', 'dharmapuri', 'dindigul', 'erode', 'kallakurichi', 'kanchipuram', 'kanyakumari', 'karur', 'krishnagiri', 'madurai', 'mayiladuthurai', 'nagapattinam', 'namakkal', 'nilgiris', 'perambalur', 'pudukkottai', 'ramanathapuram', 'ranipet', 'salem', 'sivagangai', 'tenkasi', 'thanjavur', 'theni', 'thoothukudi', 'tiruchirappalli', 'tirunelveli', 'tirupattur', 'tiruppur', 'tiruvallur', 'tiruvannamalai', 'tiruvarur', 'vellore', 'viluppuram', 'virudhunagar']
✅ Exactly 38 districts found! No data lost from known districts.


## 3. Aggregation & Metrics

In [6]:
# Aggregate: sum each dataset's count columns per state / district / month.
group_keys = ['state', 'district', 'month']
enrol_cols = ['age_0_5', 'age_5_17', 'age_18_greater']
demo_cols = ['demo_age_5_17', 'demo_age_17_']
bio_cols = ['bio_age_5_17', 'bio_age_17_']

enrol_agg = enrol_df.groupby(group_keys)[enrol_cols].sum().reset_index()
demo_agg = demo_df.groupby(group_keys)[demo_cols].sum().reset_index()
bio_agg = bio_df.groupby(group_keys)[bio_cols].sum().reset_index()

# Outer joins keep any key present in at least one dataset; counts missing
# from a dataset for that key become 0.
combined_df = enrol_agg.merge(demo_agg, on=group_keys, how='outer')
combined_df = combined_df.merge(bio_agg, on=group_keys, how='outer')
combined_df = combined_df.fillna(0)

# Core metrics
combined_df['E'] = combined_df[enrol_cols].sum(axis=1)    # enrolments
combined_df['DU'] = combined_df[demo_cols].sum(axis=1)    # demographic updates
combined_df['BU'] = combined_df[bio_cols].sum(axis=1)     # biometric updates
combined_df['U'] = combined_df['DU'] + combined_df['BU']  # all updates
combined_df['T'] = combined_df['E'] + combined_df['U']    # all transactions

print(f'Combined records: {len(combined_df)}')

Combined records: 418


In [7]:
# District-level aggregation
#
# combined_df only has a row for a district/month when at least one dataset
# recorded something, so a month with no records at all is simply absent and
# would never be counted as a zero month. Reindex onto the full
# (district x observed month) grid first, filling absent months with 0, so
# total_months is the same for every district and the mean / volatility /
# peak metrics include inactive months.
grid_keys = ['state', 'district', 'month']
district_keys = combined_df[['state', 'district']].drop_duplicates()
observed_months = combined_df[['month']].drop_duplicates()
full_grid = (
    district_keys.assign(_cross=1)
    .merge(observed_months.assign(_cross=1), on='_cross')
    .drop(columns='_cross')
)
monthly_df = full_grid.merge(combined_df, on=grid_keys, how='left').fillna(0)

district_df = monthly_df.groupby(['state', 'district']).agg(
    total_months=('month', 'count'),
    # Months with at least one transaction
    active_months=('T', lambda x: (x > 0).sum()),
    total_E=('E', 'sum'), total_DU=('DU', 'sum'), total_BU=('BU', 'sum'),
    total_U=('U', 'sum'), total_T=('T', 'sum'),
    avg_monthly_enrolment=('E', 'mean'),
    # Population std of monthly T divided by its mean; 0 when the mean is 0
    monthly_volatility=('T', lambda x: x.std(ddof=0) / x.mean() if x.mean() > 0 else 0),
    # Busiest month relative to the average month; 0 when the mean is 0
    peak_load_ratio=('T', lambda x: x.max() / x.mean() if x.mean() > 0 else 0),
    sum_age_0_5=('age_0_5', 'sum'), sum_age_5_17=('age_5_17', 'sum')
).reset_index()

district_df['zero_months'] = district_df['total_months'] - district_df['active_months']
district_df['activity_ratio'] = district_df['active_months'] / district_df['total_months']
district_df['zero_month_ratio'] = district_df['zero_months'] / district_df['total_months']
# Share ratios: a 0/0 result (no volume at all) is reported as 0
district_df['biometric_burden'] = (district_df['total_BU'] / (district_df['total_BU'] + district_df['total_DU'])).fillna(0)
district_df['update_dominant'] = np.where(district_df['total_U'] > district_df['total_E'], 1, 0)
district_df['enrollment_update_balance'] = (district_df['total_E'] / (district_df['total_E'] + district_df['total_U'])).fillna(0)

print(f'Districts computed: {len(district_df)}')

Districts computed: 38


## 4. DEI Score Calculation

In [8]:
def normalize(x):
    """Min-max scale a Series so its smallest value maps to 0 and largest to 1.

    When the minimum equals the maximum there is no range to divide by, so
    every entry is set to the neutral value 0.5 (same index as the input).
    """
    lowest = x.min()
    highest = x.max()
    if highest != lowest:
        return (x - lowest) / (highest - lowest)
    return pd.Series(0.5, index=x.index)

def inverse_normalize(x):
    """Return 1 minus the min-max scaled Series, so larger inputs score lower."""
    scaled = normalize(x)
    return 1 - scaled

scores_df = district_df.copy()

# Ratios feeding the components. A district with zero volume gives 0/0 = NaN,
# which would otherwise propagate into DEI; treat it as 0, the same way the
# ratio columns of district_df are handled.
update_share = (scores_df['total_U'] / scores_df['total_T']).fillna(0)
child_enrol_share = (
    (scores_df['sum_age_0_5'] + scores_df['sum_age_5_17']) / scores_df['total_E']
).fillna(0)

# DEI Components
# access: mean of activity ratio and scaled average monthly enrolment
scores_df['access'] = (scores_df['activity_ratio'] + normalize(scores_df['avg_monthly_enrolment'])) / 2
# responsiveness: scaled share of updates in all transactions
scores_df['responsiveness'] = normalize(update_share)
# inclusion: scaled share of enrolments in the 0-5 and 5-17 age columns
scores_df['inclusion'] = normalize(child_enrol_share)
# stability: low volatility and low peak load both score high
scores_df['stability'] = (inverse_normalize(scores_df['monthly_volatility']) + inverse_normalize(scores_df['peak_load_ratio'])) / 2
scores_df['visibility'] = scores_df['activity_ratio']

# Final scores
# DEI is the unweighted mean of the five components above
scores_df['DEI'] = (scores_df['access'] + scores_df['responsiveness'] + scores_df['inclusion'] + scores_df['stability'] + scores_df['visibility']) / 5
scores_df['ASS'] = (inverse_normalize(scores_df['activity_ratio']) + inverse_normalize(scores_df['avg_monthly_enrolment'])) / 2
scores_df['UBS'] = (normalize(scores_df['biometric_burden']) + normalize(scores_df['update_dominant'])) / 2
scores_df['SRS'] = (normalize(scores_df['monthly_volatility']) + normalize(scores_df['zero_month_ratio'])) / 2

print('DEI calculated!')
scores_df[['district', 'DEI', 'ASS', 'UBS', 'SRS']].sort_values('DEI', ascending=False)

DEI calculated!


Unnamed: 0,district,DEI,ASS,UBS,SRS
6,dindigul,0.923018,0.525373,0.707861,0.265421
7,erode,0.88277,0.532295,0.624074,0.336656
10,kanyakumari,0.876555,0.594133,0.589423,0.325753
29,tirunelveli,0.87253,0.401696,0.677869,0.343898
18,perambalur,0.870892,0.668183,0.75,0.25
16,namakkal,0.869398,0.581379,0.66987,0.326433
31,tiruppur,0.868071,0.526612,0.563351,0.336965
13,madurai,0.860751,0.377214,0.606058,0.327866
3,coimbatore,0.849163,0.417775,0.584157,0.297478
12,krishnagiri,0.849147,0.549946,0.603663,0.351007


## 5. Summary & Output

In [9]:
# State-level summary: district count, mean DEI, and the highest- and
# lowest-scoring districts.
best_row = scores_df.loc[scores_df['DEI'].idxmax()]
worst_row = scores_df.loc[scores_df['DEI'].idxmin()]

print('=== Tamil Nadu Summary ===')
print(f'Districts: {len(scores_df)}')
print(f'Avg DEI: {scores_df["DEI"].mean():.4f}')
print(f'Best: {best_row["district"]} ({best_row["DEI"]:.4f})')
print(f'Worst: {worst_row["district"]} ({worst_row["DEI"]:.4f})')

=== Tamil Nadu Summary ===
Districts: 38
Avg DEI: 0.7776
Best: dindigul (0.9230)
Worst: chengalpattu (0.4246)


In [10]:
# Save outputs: the full per-district table, plus a slim file holding only
# the identifying columns and the four final scores.
analysis_path = os.path.join(BASE_PATH, 'tamilnadu_district_analysis.csv')
final_scores_path = os.path.join(BASE_PATH, 'tamilnadu_district_final_scores.csv')
final_score_columns = ['state', 'district', 'DEI', 'ASS', 'UBS', 'SRS']

scores_df.to_csv(analysis_path, index=False)
scores_df[final_score_columns].to_csv(final_scores_path, index=False)
print('✅ Saved!')

✅ Saved!
