# Andhra Pradesh State Analysis - Digital Equity Index (DEI)

This notebook performs analysis of Aadhaar enrollment and update data for Andhra Pradesh's **26 official districts**.

## Special Handling
- **Telangana Separation**: Districts belonging to Telangana (e.g., Hyderabad, Rangareddy) found in this dataset are filtered out and saved to `telangna_dist_in_andhra.csv` (the filename is spelled exactly this way in the code) for later aggregation.
- **Normalization**: Variants like `Cuddapah` are mapped to `y.s.r. kadapa`.
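- **Zero Drop**: All valid Andhra districts are mapped; none are dropped. Any district name that is neither an official Andhra district nor a known Telangana district is printed as an "unknown" drop so it can be added to the mapping.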

## Official Andhra Districts (26)
Alluri Sitharama Raju, Anakapalli, Ananthapuramu, Annamayya, Bapatla, Chittoor, Dr. B.R. Ambedkar Konaseema, East Godavari, Eluru, Guntur, Kakinada, Krishna, Kurnool, Nandyal, NTR, Palnadu, Parvathipuram Manyam, Prakasam, Sri Potti Sriramulu Nellore, Sri Sathya Sai, Srikakulam, Tirupati, Visakhapatnam, Vizianagaram, West Godavari, Y.S.R. Kadapa

In [9]:
import numpy as np
import pandas as pd
import os

# Display settings: show up to 200 rows and never truncate columns.
pd.options.display.max_rows = 200
pd.options.display.max_columns = None

## 1. Data Loading

In [10]:
# Root folder holding the `data/` inputs and receiving every output CSV.
# Set the ANDHRA_BASE_PATH environment variable to run the notebook on another
# machine; when it is unset, the original hard-coded location is used.
BASE_PATH = os.environ.get(
    'ANDHRA_BASE_PATH',
    r'c:\Users\Admin\OneDrive\Documents\AADHAR Hackathon\andhra',
)
DATA_PATH = os.path.join(BASE_PATH, 'data')

# Raw inputs: one CSV per record type (enrollment, demographic updates, biometric updates).
enrol_df = pd.read_csv(os.path.join(DATA_PATH, 'andhra_enrollment.csv'))
demo_df = pd.read_csv(os.path.join(DATA_PATH, 'andhra_demographic.csv'))
bio_df = pd.read_csv(os.path.join(DATA_PATH, 'andhra_biometric.csv'))

# Row counts as a quick sanity check on what was loaded.
print(f'Enrollment: {len(enrol_df):,} | Demographic: {len(demo_df):,} | Biometric: {len(bio_df):,}')

Enrollment: 80,374 | Demographic: 231,428 | Biometric: 531,734


## 2. Data Cleaning & Separation

In [11]:
# The 26 official Andhra Pradesh districts, lowercase, in the canonical
# spelling that normalized district names are compared against.
OFFICIAL_ANDHRA_DISTRICTS = {
    'alluri sitharama raju',
    'anakapalli',
    'ananthapuramu',
    'annamayya',
    'bapatla',
    'chittoor',
    'dr. b.r. ambedkar konaseema',
    'east godavari',
    'eluru',
    'guntur',
    'kakinada',
    'krishna',
    'kurnool',
    'nandyal',
    'ntr',
    'palnadu',
    'parvathipuram manyam',
    'prakasam',
    'sri potti sriramulu nellore',
    'sri sathya sai',
    'srikakulam',
    'tirupati',
    'visakhapatnam',
    'vizianagaram',
    'west godavari',
    'y.s.r. kadapa',
}

# Telangana district names (lowercase) that are separated out of this dataset.
# A few spelling variants are listed alongside the canonical names.
TELANGANA_DISTRICTS = {
    'adilabad',
    'hyderabad',
    'karimnagar',
    'khammam',
    'mahabubnagar',
    'medak',
    'nalgonda',
    'nizamabad',
    'rangareddy',
    'warangal',
    # spelling variants
    'k.v.rangareddy',
    'mahabub nagar',
    'karim nagar',
}

# Spelling variants -> canonical lowercase district name.
# Keys are matched after the raw value has been stripped, lowercased and had a
# trailing asterisk removed (see normalize_name).
_ANDHRA_ALIASES = {
    'anantapur': 'ananthapuramu',
    'ananthapur': 'ananthapuramu',
    'cuddapah': 'y.s.r. kadapa',
    'y. s. r': 'y.s.r. kadapa',
    'dr. b. r. ambedkar konaseema': 'dr. b.r. ambedkar konaseema',
    'n. t. r': 'ntr',
    'nellore': 'sri potti sriramulu nellore',
    'spsr nellore': 'sri potti sriramulu nellore',
    'visakhapatanam': 'visakhapatnam',
}

# Telangana variants are collapsed too, so the separation step sees one
# spelling per district.
_TELANGANA_ALIASES = {
    'k.v. rangareddy': 'rangareddy',
    'k.v.rangareddy': 'rangareddy',
    'rangareddi': 'rangareddy',
    'karim nagar': 'karimnagar',
    'mahabub nagar': 'mahabubnagar',
    'mahbubnagar': 'mahabubnagar',
}

DISTRICT_CLEANUP_MAP = {**_ANDHRA_ALIASES, **_TELANGANA_ALIASES}


def normalize_name(name):
    """Return the canonical lowercase form of a raw district name.

    Missing values (anything `pd.isna` reports as missing) yield None.
    Otherwise the value is converted to a string, stripped and lowercased; a
    trailing ' *' and then a trailing '*' are each removed once if present;
    finally the result is looked up in DISTRICT_CLEANUP_MAP and replaced by
    its canonical spelling when a mapping exists.
    """
    if pd.isna(name):
        return None

    text = str(name).strip().lower()

    # Two passes, in this order: first the spaced marker, then a bare asterisk.
    for marker in (' *', '*'):
        if text.endswith(marker):
            text = text[:-len(marker)].strip()

    return DISTRICT_CLEANUP_MAP.get(text, text)

# Telangana rows pulled out of each dataset; combined into one export below.
telangana_data = []

# `df` refers to the same object as enrol_df / demo_df / bio_df, so the
# in-place steps below are what the later cells see.
for df_name, df in [('Enrollment', enrol_df), ('Demographic', demo_df), ('Biometric', bio_df)]:
    # 1. Normalize
    df['district_norm'] = df['district'].apply(normalize_name)

    # 2. Identify Telangana Records
    # The copy keeps the raw `district` spelling next to `district_norm`.
    tg_mask = df['district_norm'].isin(TELANGANA_DISTRICTS)
    tg_df = df[tg_mask].copy()
    tg_df['source_file'] = df_name
    telangana_data.append(tg_df)

    # 3. Filter Main DF to Andhra Only
    # Keep only legitimate Andhra districts
    df['is_andhra'] = df['district_norm'].isin(OFFICIAL_ANDHRA_DISTRICTS)

    # Log drops that are neither Andhra nor Telangana (raw spellings), so
    # unmapped variants are visible rather than silently lost.
    unknown_drops = list(df.loc[~df['is_andhra'] & ~tg_mask, 'district'].unique())

    if len(unknown_drops) > 0:
        print(f'{df_name}: Unknown districts dropped: {unknown_drops}')

    # Apply Filter (in place), then replace the raw name with the normalized
    # one and remove the helper columns.
    df.query('is_andhra == True', inplace=True)
    df['district'] = df['district_norm']
    df.drop(columns=['district_norm', 'is_andhra'], inplace=True)

    # Dates are parsed day-first; `month` is the calendar month number only.
    df['date'] = pd.to_datetime(df['date'], dayfirst=True)
    df['month'] = df['date'].dt.month

# Combine and Save Telangana Data
# `telangana_data` always holds one frame per dataset, so test for actual rows:
# checking the list itself would always be true and would write an empty CSV
# (overwriting an earlier export when this cell is re-run).
tg_parts = [part for part in telangana_data if not part.empty]
if tg_parts:
    all_tg = pd.concat(tg_parts)
    tg_out_path = os.path.join(BASE_PATH, 'telangna_dist_in_andhra.csv')
    all_tg.to_csv(tg_out_path, index=False)
    print(f'Extracted {len(all_tg)} Telangana records to: {tg_out_path}')
    print(f'Telangana districts found: {sorted(all_tg["district"].unique())}')
else:
    print('No Telangana data found.')

# Verify Andhra Data (checked on the enrollment frame)
final_districts = sorted(enrol_df['district'].unique())
print(f'\nFinal Andhra Districts ({len(final_districts)}):')
print(final_districts)
if len(final_districts) != 26:
    print(f'Warning: Expected 26, found {len(final_districts)}')
else:
    print('Exactly 26 Andhra districts retained.')

Extracted 181832 Telangana records to: c:\Users\Admin\OneDrive\Documents\AADHAR Hackathon\andhra\telangna_dist_in_andhra.csv
Telangana districts found: ['Adilabad', 'Hyderabad', 'K.V.Rangareddy', 'K.v. Rangareddy', 'Karim Nagar', 'Karimnagar', 'Khammam', 'Mahabub Nagar', 'Mahabubnagar', 'Mahbubnagar', 'Medak', 'Nalgonda', 'Nizamabad', 'Rangareddi', 'Warangal']

Final Andhra Districts (26):
['alluri sitharama raju', 'anakapalli', 'ananthapuramu', 'annamayya', 'bapatla', 'chittoor', 'dr. b.r. ambedkar konaseema', 'east godavari', 'eluru', 'guntur', 'kakinada', 'krishna', 'kurnool', 'nandyal', 'ntr', 'palnadu', 'parvathipuram manyam', 'prakasam', 'sri potti sriramulu nellore', 'sri sathya sai', 'srikakulam', 'tirupati', 'visakhapatnam', 'vizianagaram', 'west godavari', 'y.s.r. kadapa']
Exactly 26 Andhra districts retained.


## 3. Aggregation & Metrics

In [12]:
# Monthly totals per state/district for each record type.
GROUP_KEYS = ['state', 'district', 'month']


def _monthly_totals(frame, value_cols):
    """Sum `value_cols` for every state/district/month combination in `frame`."""
    return frame.groupby(GROUP_KEYS)[value_cols].sum().reset_index()


enrol_agg = _monthly_totals(enrol_df, ['age_0_5', 'age_5_17', 'age_18_greater'])
demo_agg = _monthly_totals(demo_df, ['demo_age_5_17', 'demo_age_17_'])
bio_agg = _monthly_totals(bio_df, ['bio_age_5_17', 'bio_age_17_'])

# Outer joins keep a district-month that appears in any of the three sources;
# counts missing from a source become 0.
combined_df = pd.merge(enrol_agg, demo_agg, on=GROUP_KEYS, how='outer')
combined_df = pd.merge(combined_df, bio_agg, on=GROUP_KEYS, how='outer')
combined_df = combined_df.fillna(0)

# Core metrics:
#   E  = enrolments, DU = demographic updates, BU = biometric updates,
#   U  = all updates (DU + BU), T = total transactions (E + U)
enrolments = combined_df['age_0_5'] + combined_df['age_5_17'] + combined_df['age_18_greater']
demo_updates = combined_df['demo_age_5_17'] + combined_df['demo_age_17_']
bio_updates = combined_df['bio_age_5_17'] + combined_df['bio_age_17_']

combined_df['E'] = enrolments
combined_df['DU'] = demo_updates
combined_df['BU'] = bio_updates
combined_df['U'] = combined_df['DU'] + combined_df['BU']
combined_df['T'] = combined_df['E'] + combined_df['U']

print(f'Combined records: {len(combined_df)}')

Combined records: 286


In [13]:
# Roll the district-month table up to one row per district.


def _active_count(totals):
    """Number of months whose total transactions are above zero."""
    return (totals > 0).sum()


def _coefficient_of_variation(totals):
    """Population std divided by mean; 0 when the mean is not positive."""
    mean_value = totals.mean()
    return totals.std(ddof=0) / mean_value if mean_value > 0 else 0


def _peak_to_mean(totals):
    """Largest monthly value divided by the mean; 0 when the mean is not positive."""
    mean_value = totals.mean()
    return totals.max() / mean_value if mean_value > 0 else 0


# `total_months` counts the rows present for the district in combined_df, so a
# month with no row at all is not counted as a zero month.
district_df = (
    combined_df
    .groupby(['state', 'district'])
    .agg(
        total_months=('month', 'count'),
        active_months=('T', _active_count),
        total_E=('E', 'sum'),
        total_DU=('DU', 'sum'),
        total_BU=('BU', 'sum'),
        total_U=('U', 'sum'),
        total_T=('T', 'sum'),
        avg_monthly_enrolment=('E', 'mean'),
        monthly_volatility=('T', _coefficient_of_variation),
        peak_load_ratio=('T', _peak_to_mean),
        sum_age_0_5=('age_0_5', 'sum'),
        sum_age_5_17=('age_5_17', 'sum'),
    )
    .reset_index()
)

# Activity ratios
months_seen = district_df['total_months']
district_df['zero_months'] = months_seen - district_df['active_months']
district_df['activity_ratio'] = district_df['active_months'] / months_seen
district_df['zero_month_ratio'] = district_df['zero_months'] / months_seen

# Share of updates that are biometric; a 0/0 result (no updates) becomes 0.
all_updates = district_df['total_BU'] + district_df['total_DU']
district_df['biometric_burden'] = (district_df['total_BU'] / all_updates).fillna(0)

# 1 when updates outnumber enrolments, otherwise 0.
district_df['update_dominant'] = np.where(district_df['total_U'] > district_df['total_E'], 1, 0)

# Share of all transactions that are enrolments; a 0/0 result becomes 0.
all_transactions = district_df['total_E'] + district_df['total_U']
district_df['enrollment_update_balance'] = (district_df['total_E'] / all_transactions).fillna(0)

print(f'Districts computed: {len(district_df)}')

Districts computed: 26


## 4. DEI Score Calculation

In [14]:
def normalize(x):
    """Min-max scale a Series so its smallest value is 0 and its largest is 1.

    When every value is the same (max equals min) each element is set to 0.5
    instead of dividing by a zero range. The input's index is kept.
    """
    lowest = x.min()
    highest = x.max()
    if highest == lowest:
        return pd.Series(0.5, index=x.index)
    value_range = highest - lowest
    return (x - lowest) / value_range

def inverse_normalize(x):
    """Min-max scale a Series, then flip it so the largest value maps to 0."""
    scaled = normalize(x)
    return 1 - scaled

# Work on a copy so district_df keeps only the raw district metrics.
scores_df = district_df.copy()

# Ratios that feed the responsiveness and inclusion components.
update_share = scores_df['total_U'] / scores_df['total_T']
under_18_enrol_share = (scores_df['sum_age_0_5'] + scores_df['sum_age_5_17']) / scores_df['total_E']

# DEI Components
scaled_enrolment = normalize(scores_df['avg_monthly_enrolment'])
scores_df['access'] = (scores_df['activity_ratio'] + scaled_enrolment) / 2
scores_df['responsiveness'] = normalize(update_share)
scores_df['inclusion'] = normalize(under_18_enrol_share)

# Lower volatility and lower peak load both score higher on stability.
calm_score = inverse_normalize(scores_df['monthly_volatility'])
flat_load_score = inverse_normalize(scores_df['peak_load_ratio'])
scores_df['stability'] = (calm_score + flat_load_score) / 2
scores_df['visibility'] = scores_df['activity_ratio']

# Final scores
# DEI is the plain average of the five components.
scores_df['DEI'] = (
    scores_df['access']
    + scores_df['responsiveness']
    + scores_df['inclusion']
    + scores_df['stability']
    + scores_df['visibility']
) / 5

# ASS: high when activity ratio and average monthly enrolment are low.
low_activity = inverse_normalize(scores_df['activity_ratio'])
low_enrolment = inverse_normalize(scores_df['avg_monthly_enrolment'])
scores_df['ASS'] = (low_activity + low_enrolment) / 2

# UBS: high when biometric updates dominate and updates outnumber enrolments.
scores_df['UBS'] = (
    normalize(scores_df['biometric_burden']) + normalize(scores_df['update_dominant'])
) / 2

# SRS: high when monthly volume is volatile and zero months are frequent.
scores_df['SRS'] = (
    normalize(scores_df['monthly_volatility']) + normalize(scores_df['zero_month_ratio'])
) / 2

print('DEI calculated!')
scores_df[['district', 'DEI', 'ASS', 'UBS', 'SRS']].sort_values('DEI', ascending=False)

DEI calculated!


Unnamed: 0,district,DEI,ASS,UBS,SRS
12,kurnool,0.895586,0.25,0.734106,0.25
7,east godavari,0.883789,0.428361,0.732177,0.330141
22,visakhapatnam,0.877066,0.408122,0.75,0.360677
24,west godavari,0.874527,0.514978,0.694767,0.344637
11,krishna,0.872589,0.476511,0.740478,0.259305
20,srikakulam,0.868227,0.58397,0.638117,0.403759
9,guntur,0.863981,0.34429,0.688101,0.324108
5,chittoor,0.852958,0.427925,0.643527,0.363092
23,vizianagaram,0.850974,0.607057,0.696138,0.377012
2,ananthapuramu,0.847769,0.346602,0.707615,0.396899


In [15]:
# Headline figures: district count, mean DEI, and the best/worst districts.
dei = scores_df['DEI']
best_district = scores_df.loc[dei.idxmax(), 'district']
worst_district = scores_df.loc[dei.idxmin(), 'district']

print('=== Andhra Pradesh Summary ===')
print(f'Districts: {len(scores_df)}')
print(f'Avg DEI: {dei.mean():.4f}')
print(f'Best: {best_district} ({dei.max():.4f})')
print(f'Worst: {worst_district} ({dei.min():.4f})')

=== Andhra Pradesh Summary ===
Districts: 26
Avg DEI: 0.7362
Best: kurnool (0.8956)
Worst: tirupati (0.3727)


In [16]:
# Save outputs: the full per-district table, plus a compact file containing
# only the identifying columns and the four final scores.
analysis_path = os.path.join(BASE_PATH, 'andhra_district_analysis.csv')
final_scores_path = os.path.join(BASE_PATH, 'andhra_district_final_scores.csv')
final_score_columns = ['state', 'district', 'DEI', 'ASS', 'UBS', 'SRS']

scores_df.to_csv(analysis_path, index=False)
scores_df[final_score_columns].to_csv(final_scores_path, index=False)
print('Saved!')

Saved!
