# Karnataka State Analysis - Digital Equity Index (DEI)

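This notebook analyzes Aadhaar enrollment and update (demographic and biometric) data for Karnataka's **31 official districts** and computes a district-level Digital Equity Index (DEI) together with the supporting ASS, UBS and SRS scores.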

## Official Karnataka Districts (31)
1. Bagalkote
2. Ballari
3. Belagavi
4. Bengaluru Rural
5. Bengaluru Urban
6. Bidar
7. Chamarajanagar
8. Chikkaballapura
9. Chikkamagaluru
10. Chitradurga
11. Dakshina Kannada
12. Davanagere
13. Dharwad
14. Gadag
15. Hassan
16. Haveri
17. Kalaburagi
18. Kodagu
19. Kolar
20. Koppal
21. Mandya
22. Mysuru
23. Raichur
24. Ramanagara
25. Shivamogga
26. Tumakuru
27. Udupi
28. Uttara Kannada
29. Vijayanagara
30. Vijayapura
31. Yadgir

In [78]:
import numpy as np
import pandas as pd
import os

# Display settings: show up to 200 rows and never truncate columns when a frame is rendered.
pd.options.display.max_rows = 200
pd.options.display.max_columns = None

## 1. Data Loading

In [79]:
# Locate the project folder. The default is the original author's local folder; set the
# KARNATAKA_BASE_PATH environment variable to run the notebook from another machine
# without editing this cell.
BASE_PATH = os.environ.get(
    'KARNATAKA_BASE_PATH',
    r'c:\Users\Admin\OneDrive\Documents\AADHAR Hackathon\karnataka',
)
# Raw input CSVs live in the 'data' subdirectory; outputs are written to BASE_PATH itself.
DATA_PATH = os.path.join(BASE_PATH, 'data')

# One frame per source file: new enrollments, demographic updates, biometric updates.
enrol_df = pd.read_csv(os.path.join(DATA_PATH, 'karnataka_enrollment.csv'))
demo_df = pd.read_csv(os.path.join(DATA_PATH, 'karnataka_demographic.csv'))
bio_df = pd.read_csv(os.path.join(DATA_PATH, 'karnataka_biometric.csv'))

# Row counts as a quick sanity check that all three files loaded.
print(f'Enrollment: {len(enrol_df):,} | Demographic: {len(demo_df):,} | Biometric: {len(bio_df):,}')

Enrollment: 86,772 | Demographic: 181,401 | Biometric: 429,837


In [80]:
# Peek at the first ten distinct raw district labels (alphabetical) to see which variants need cleaning.
print('Raw district names sample:', sorted(enrol_df['district'].unique().tolist())[:10])

Raw district names sample: ['Bagalkot', 'Bagalkot *', 'Ballari', 'Bangalore', 'Bangalore Rural', 'Belagavi', 'Belgaum', 'Bellary', 'Bengaluru', 'Bengaluru Rural']


## 2. Data Cleaning - Comprehensive Mapping

In [81]:
# Official 31 Karnataka Districts (lowercase)
OFFICIAL_DISTRICTS = {
    'bagalkote', 'ballari', 'belagavi', 'bengaluru rural', 'bengaluru urban',
    'bidar', 'chamarajanagar', 'chikkaballapura', 'chikkamagaluru', 'chitradurga',
    'dakshina kannada', 'davanagere', 'dharwad', 'gadag', 'hassan', 'haveri',
    'kalaburagi', 'kodagu', 'kolar', 'koppal', 'mandya', 'mysuru', 'raichur',
    'ramanagara', 'shivamogga', 'tumakuru', 'udupi', 'uttara kannada',
    'vijayanagara', 'vijayapura', 'yadgir'
}

# Comprehensive Mapping: variants -> official 31 district names.
# Keys must be lowercase with single spaces and no trailing asterisk, because that is
# the form clean_district_name() produces before it looks a name up here.
DISTRICT_CLEANUP_MAP = {
    # --- Explicit User Requests ---
    'hasan': 'hassan',
    'davangere': 'davanagere',

    # --- Standard Variations ---
    'bagalkot': 'bagalkote',
    'bellary': 'ballari',
    'belgaum': 'belagavi',

    # Bengaluru Handling
    'bangalore': 'bengaluru urban',
    'bangalore urban': 'bengaluru urban',
    'bengaluru': 'bengaluru urban',
    'bengaluru south': 'bengaluru urban',
    'bengaluru north': 'bengaluru urban',
    'bengaluru central': 'bengaluru urban',
    'bengaluru east': 'bengaluru urban',
    'bengaluru west': 'bengaluru urban',
    'bangalore rural': 'bengaluru rural',

    # Other Renaming/Spellings
    'chamrajnagar': 'chamarajanagar',
    'chamrajanagar': 'chamarajanagar',
    'chamarajanagara': 'chamarajanagar',
    'chikkaballapur': 'chikkaballapura',
    'chikmagalur': 'chikkamagaluru',
    'chickmagalur': 'chikkamagaluru',
    'gulbarga': 'kalaburagi',
    'mysore': 'mysuru',
    'ramanagar': 'ramanagara',
    'shimoga': 'shivamogga',
    'tumkur': 'tumakuru',
    'bijapur': 'vijayapura',
    'bijapur(kar)': 'vijayapura',
}

def clean_district_name(name):
    """Normalize a raw district label to its official lowercase name.

    The label is lowercased, runs of whitespace are collapsed to single spaces,
    trailing asterisk markers (e.g. 'Bagalkot *') are stripped, and known
    spelling variants are translated through DISTRICT_CLEANUP_MAP.

    Args:
        name: Raw district value. Any non-missing value is converted with str().

    Returns:
        The cleaned name: the official name when the variant is known, otherwise
        the normalized input so the caller can report it as unmapped. Returns
        None when the value is missing or empty after cleaning, so such rows can
        be removed with dropna().
    """
    if pd.isna(name):
        return None

    # Lowercase, trim, and collapse internal whitespace ('Bangalore   Rural' -> 'bangalore rural').
    cleaned = ' '.join(str(name).lower().split())

    # Remove any number of trailing asterisks and the spaces around them.
    cleaned = cleaned.rstrip('* ')

    # A label that was blank or only asterisks carries no district information.
    if not cleaned:
        return None

    # Translate known variants; unknown names pass through unchanged.
    return DISTRICT_CLEANUP_MAP.get(cleaned, cleaned)

print(f'Official districts: {len(OFFICIAL_DISTRICTS)}')
print(f'Cleanup mappings: {len(DISTRICT_CLEANUP_MAP)}')

Official districts: 31
Cleanup mappings: 27


In [82]:
# Apply cleaning and VALIDATE (No Blind Drops): every label that cannot be mapped to an
# official district is reported first, and only then excluded from the analysis.
for df in [enrol_df, demo_df, bio_df]:
    # 1. Clean
    df['district_clean'] = df['district'].apply(clean_district_name)

    # 2. Check for unknowns BEFORE dropping (missing labels count as unknown too)
    is_official = df['district_clean'].isin(OFFICIAL_DISTRICTS)
    unknowns = df.loc[~is_official, 'district'].unique()
    if len(unknowns) > 0:
        print(f'⚠️ CRITICAL: The following districts are still NOT mapped to the official list:\n{unknowns}')
        print('Please update DISTRICT_CLEANUP_MAP.')
        print(f'Excluding {int((~is_official).sum()):,} rows that do not belong to an official district.')

    # 3. Apply: blank out non-official names so the dropna below removes them together
    #    with missing labels; only official districts reach the final analysis.
    df['district_clean'] = df['district_clean'].where(is_official)
    df.dropna(subset=['district_clean'], inplace=True)
    df['district'] = df['district_clean']
    df.drop(columns=['district_clean'], inplace=True)

    # 4. Dates (source dates are day-first)
    # NOTE(review): only the month number is kept, so the same calendar month from two
    # different years would be merged downstream — confirm the data covers at most one
    # occurrence of each month.
    df['date'] = pd.to_datetime(df['date'], dayfirst=True)
    df['month'] = df['date'].dt.month

# Verify against the official list itself, not just the count.
all_cleaned = set(enrol_df['district'].unique()) | set(demo_df['district'].unique()) | set(bio_df['district'].unique())

print(f'\nFinal Districts ({len(all_cleaned)}):')
print(sorted(all_cleaned))

missing_districts = OFFICIAL_DISTRICTS - all_cleaned
unexpected_districts = all_cleaned - OFFICIAL_DISTRICTS
if missing_districts or unexpected_districts:
    print(f'⚠️ Warning: Found {len(all_cleaned)} districts, expected 31. Check if some official districts are absent in data.')
    if missing_districts:
        print(f'Official districts absent from the data: {sorted(missing_districts)}')
    if unexpected_districts:
        print(f'Non-official districts still present: {sorted(unexpected_districts)}')
else:
    print('✅ Exactly 31 districts found! No data lost from known districts.')


Final Districts (31):
['bagalkote', 'ballari', 'belagavi', 'bengaluru rural', 'bengaluru urban', 'bidar', 'chamarajanagar', 'chikkaballapura', 'chikkamagaluru', 'chitradurga', 'dakshina kannada', 'davanagere', 'dharwad', 'gadag', 'hassan', 'haveri', 'kalaburagi', 'kodagu', 'kolar', 'koppal', 'mandya', 'mysuru', 'raichur', 'ramanagara', 'shivamogga', 'tumakuru', 'udupi', 'uttara kannada', 'vijayanagara', 'vijayapura', 'yadgir']
✅ Exactly 31 districts found! No data lost from known districts.


## 3. Aggregation & Metrics

In [83]:
# Sum each source to one row per (state, district, month).
group_keys = ['state', 'district', 'month']
enrol_cols = ['age_0_5', 'age_5_17', 'age_18_greater']
demo_cols = ['demo_age_5_17', 'demo_age_17_']
bio_cols = ['bio_age_5_17', 'bio_age_17_']

enrol_agg = enrol_df.groupby(group_keys)[enrol_cols].sum().reset_index()
demo_agg = demo_df.groupby(group_keys)[demo_cols].sum().reset_index()
bio_agg = bio_df.groupby(group_keys)[bio_cols].sum().reset_index()

# Outer joins keep a district-month even if it appears in only one source;
# the counts it lacks from the other sources become 0.
combined_df = pd.merge(enrol_agg, demo_agg, on=group_keys, how='outer')
combined_df = pd.merge(combined_df, bio_agg, on=group_keys, how='outer')
combined_df = combined_df.fillna(0)

# Core metrics: E = enrollments, DU = demographic updates, BU = biometric updates,
# U = all updates, T = all transactions.
combined_df = combined_df.assign(
    E=lambda d: d['age_0_5'] + d['age_5_17'] + d['age_18_greater'],
    DU=lambda d: d['demo_age_5_17'] + d['demo_age_17_'],
    BU=lambda d: d['bio_age_5_17'] + d['bio_age_17_'],
    U=lambda d: d['DU'] + d['BU'],
    T=lambda d: d['E'] + d['U'],
)

print(f'Combined records: {len(combined_df)}')

Combined records: 341


In [84]:
# District-level aggregation: collapse the monthly rows into one row per district.

def _count_active(monthly_totals):
    """Number of months with at least one transaction."""
    return (monthly_totals > 0).sum()

def _coefficient_of_variation(monthly_totals):
    """Population std divided by mean of monthly totals; 0 when the mean is not positive."""
    mean_total = monthly_totals.mean()
    return monthly_totals.std(ddof=0) / mean_total if mean_total > 0 else 0

def _peak_to_mean(monthly_totals):
    """Busiest month relative to the average month; 0 when the mean is not positive."""
    mean_total = monthly_totals.mean()
    return monthly_totals.max() / mean_total if mean_total > 0 else 0

# NOTE(review): total_months counts only the months in which the district has a row in
# combined_df, so a month with no rows at all is not counted as a zero month — confirm
# this is the intended denominator for the ratios below.
district_df = (
    combined_df
    .groupby(['state', 'district'])
    .agg(
        total_months=('month', 'count'),
        active_months=('T', _count_active),
        total_E=('E', 'sum'),
        total_DU=('DU', 'sum'),
        total_BU=('BU', 'sum'),
        total_U=('U', 'sum'),
        total_T=('T', 'sum'),
        avg_monthly_enrolment=('E', 'mean'),
        monthly_volatility=('T', _coefficient_of_variation),
        peak_load_ratio=('T', _peak_to_mean),
        sum_age_0_5=('age_0_5', 'sum'),
        sum_age_5_17=('age_5_17', 'sum'),
    )
    .reset_index()
)

# Derived ratios; shares with a zero denominator (0/0 -> NaN) are reported as 0.
district_df = district_df.assign(
    zero_months=lambda d: d['total_months'] - d['active_months'],
    activity_ratio=lambda d: d['active_months'] / d['total_months'],
    zero_month_ratio=lambda d: d['zero_months'] / d['total_months'],
    biometric_burden=lambda d: (d['total_BU'] / (d['total_BU'] + d['total_DU'])).fillna(0),
    update_dominant=lambda d: np.where(d['total_U'] > d['total_E'], 1, 0),
    enrollment_update_balance=lambda d: (d['total_E'] / (d['total_E'] + d['total_U'])).fillna(0),
)

print(f'Districts computed: {len(district_df)}')

Districts computed: 31


## 4. DEI Score Calculation

In [85]:
def normalize(x):
    """Min-max scale a Series to the range [0, 1].

    Args:
        x: pandas Series of numeric values.

    Returns:
        A Series on the same index with the minimum mapped to 0 and the maximum
        to 1. When every value is identical there is no spread to scale, so
        each entry is set to the neutral value 0.5.
    """
    lower = x.min()
    upper = x.max()
    if upper != lower:
        return (x - lower) / (upper - lower)
    return pd.Series(0.5, index=x.index)

def inverse_normalize(x):
    """Min-max scale a Series and flip it, so the smallest input scores 1 and the largest 0."""
    scaled = normalize(x)
    return 1 - scaled

scores_df = district_df.copy()

# Shares whose denominator can be zero. A district with no transactions (or no
# enrollments) gives 0/0 = NaN, which would pass through normalize() and make its DEI
# NaN, silently excluding it from the ranking and the average. Treat an undefined share
# as 0, the same way biometric_burden and enrollment_update_balance are guarded.
update_share = (scores_df['total_U'] / scores_df['total_T']).fillna(0)
child_enrolment_share = (
    (scores_df['sum_age_0_5'] + scores_df['sum_age_5_17']) / scores_df['total_E']
).fillna(0)

# DEI Components (each in [0, 1])
# access: how often the district is active, blended with its scaled average monthly enrollment
scores_df['access'] = (scores_df['activity_ratio'] + normalize(scores_df['avg_monthly_enrolment'])) / 2
# responsiveness: scaled share of updates in all transactions
scores_df['responsiveness'] = normalize(update_share)
# inclusion: scaled share of enrollments from the 0-5 and 5-17 age bands
scores_df['inclusion'] = normalize(child_enrolment_share)
# stability: low month-to-month volatility and low peak-to-mean load both score high
scores_df['stability'] = (inverse_normalize(scores_df['monthly_volatility']) + inverse_normalize(scores_df['peak_load_ratio'])) / 2
# visibility: share of observed months with any activity
scores_df['visibility'] = scores_df['activity_ratio']

# Final scores
# DEI is the unweighted mean of the five components.
scores_df['DEI'] = (scores_df['access'] + scores_df['responsiveness'] + scores_df['inclusion'] + scores_df['stability'] + scores_df['visibility']) / 5
# ASS rises as activity and average enrollment fall.
scores_df['ASS'] = (inverse_normalize(scores_df['activity_ratio']) + inverse_normalize(scores_df['avg_monthly_enrolment'])) / 2
# UBS rises with the biometric share of updates and when updates outnumber enrollments.
scores_df['UBS'] = (normalize(scores_df['biometric_burden']) + normalize(scores_df['update_dominant'])) / 2
# SRS rises with volatility and with the share of zero-activity months.
scores_df['SRS'] = (normalize(scores_df['monthly_volatility']) + normalize(scores_df['zero_month_ratio'])) / 2

print('DEI calculated!')
scores_df[['district', 'DEI', 'ASS', 'UBS', 'SRS']].sort_values('DEI', ascending=False)

DEI calculated!


Unnamed: 0,district,DEI,ASS,UBS,SRS
11,davanagere,0.889525,0.710827,0.75,0.25
14,hassan,0.796879,0.727087,0.45728,0.506099
12,dharwad,0.790904,0.711882,0.618316,0.453204
15,haveri,0.787646,0.719715,0.427353,0.511733
17,kodagu,0.778471,0.75,0.579139,0.405629
18,kolar,0.775791,0.723338,0.596985,0.531076
20,mandya,0.772731,0.726954,0.610384,0.61169
25,tumakuru,0.760177,0.701947,0.67205,0.561379
16,kalaburagi,0.754818,0.670697,0.65823,0.416671
8,chikkamagaluru,0.754086,0.737742,0.652338,0.604178


## 5. Summary & Output

In [86]:
# Headline numbers: district count, mean DEI, and the highest- and lowest-scoring districts.
dei = scores_df['DEI']
best_district = scores_df.loc[dei.idxmax(), 'district']
worst_district = scores_df.loc[dei.idxmin(), 'district']

print('=== Karnataka Summary ===')
print(f'Districts: {len(scores_df)}')
print(f'Avg DEI: {dei.mean():.4f}')
print(f'Best: {best_district} ({dei.max():.4f})')
print(f'Worst: {worst_district} ({dei.min():.4f})')

=== Karnataka Summary ===
Districts: 31
Avg DEI: 0.7101
Best: davanagere (0.8895)
Worst: bengaluru rural (0.4022)


In [87]:
# Write results next to the 'data' folder (in BASE_PATH), never inside it, so that
# outputs are not mixed with the raw inputs.
analysis_csv = os.path.join(BASE_PATH, 'karnataka_district_analysis.csv')
final_scores_csv = os.path.join(BASE_PATH, 'karnataka_district_final_scores.csv')
final_score_columns = ['state', 'district', 'DEI', 'ASS', 'UBS', 'SRS']

# Full table with every intermediate metric, then a compact table of the final scores.
scores_df.to_csv(analysis_csv, index=False)
scores_df[final_score_columns].to_csv(final_scores_csv, index=False)
print('✅ Saved!')

✅ Saved!
