# Andaman & Nicobar Islands Analysis - Digital Equity Index (DEI)

This notebook performs analysis of Aadhaar enrollment and update data for **Andaman & Nicobar Islands (UT)**.

## Official Districts (3)
Nicobars, North and Middle Andaman, South Andaman

## Data Sources
Uses the **unified (merged)** datasets, i.e. the `andaman_1` and `andaman_2` sets combined with duplicate rows removed:
- `andaman_enrollment.csv`
- `andaman_demographic.csv`
- `andaman_biometric.csv`

In [1]:
import numpy as np
import pandas as pd
import os

# Display settings: show up to 200 rows and never truncate columns.
pd.options.display.max_rows = 200
pd.options.display.max_columns = None

## 1. Data Loading (Unified)

In [2]:
# Project folder. Set the ANDAMAN_BASE_PATH environment variable to run the notebook
# on another machine; when it is unset the original location is used unchanged.
BASE_PATH = os.environ.get(
    'ANDAMAN_BASE_PATH',
    r'c:\Users\Admin\OneDrive\Documents\AADHAR Hackathon\andaman',
)
DATA_PATH = os.path.join(BASE_PATH, 'data')

# Fail early with an actionable message instead of a bare read_csv error.
if not os.path.isdir(DATA_PATH):
    raise FileNotFoundError(
        f'Data folder not found: {DATA_PATH!r}. '
        'Set ANDAMAN_BASE_PATH to the folder that contains "data".'
    )

# Load the three unified CSV files (enrollment, demographic updates, biometric updates).
enrol_df = pd.read_csv(os.path.join(DATA_PATH, 'andaman_enrollment.csv'))
demo_df = pd.read_csv(os.path.join(DATA_PATH, 'andaman_demographic.csv'))
bio_df = pd.read_csv(os.path.join(DATA_PATH, 'andaman_biometric.csv'))

print(f'Enrollment: {len(enrol_df):,} | Demographic: {len(demo_df):,} | Biometric: {len(bio_df):,}')

Enrollment: 461 | Demographic: 1,822 | Biometric: 5,250


## 2. Data Cleaning

In [3]:
# Canonical (lower-case) names of the 3 official districts.
OFFICIAL_DISTRICTS = {'nicobars', 'north and middle andaman', 'south andaman'}

# Alternative lower-case labels -> canonical district name.
DISTRICT_CLEANUP_MAP = dict(
    nicobar='nicobars',
    andamans='south andaman',
)

def clean_district_name(name):
    """Normalise a raw district label to its canonical lower-case form.

    The label is lower-cased, surrounding whitespace is trimmed, internal runs of
    whitespace are collapsed to a single space, trailing asterisk markers are
    removed, and the result is looked up in ``DISTRICT_CLEANUP_MAP``.

    Parameters
    ----------
    name : any
        Raw district value; non-string values are converted with ``str``.

    Returns
    -------
    str or None
        The cleaned name, or ``None`` when the value is missing or nothing is
        left after cleaning (so such rows can be removed with ``dropna``).
    """
    if pd.isna(name):
        return None

    # split()/join trims both ends and collapses internal whitespace runs,
    # so "North  and Middle Andaman" still matches the official spelling.
    cleaned = ' '.join(str(name).lower().split())

    # Remove any trailing asterisk markers ("name *", "name*", "name **").
    cleaned = cleaned.rstrip('* ')

    # A blank label carries no district information; treat it like a missing value.
    if not cleaned:
        return None

    # Map known alternative labels; anything else is returned as cleaned.
    return DISTRICT_CLEANUP_MAP.get(cleaned, cleaned)

def clean_state_name(name):
    """Return the single canonical state label, whatever the input value is."""
    canonical_state = 'Andaman & Nicobar Islands'
    return canonical_state

# Report how many official districts are configured.
print('Official districts:', len(OFFICIAL_DISTRICTS))

Official districts: 3


In [4]:
# Apply cleaning to every frame, report unmapped districts, then validate the result.
# The frames are modified through the loop variable, so the in-place operations are
# deliberate: rebinding `df` would not update enrol_df / demo_df / bio_df.
for df in [enrol_df, demo_df, bio_df]:
    # 1. Clean District
    df['district_clean'] = df['district'].apply(clean_district_name)

    # 2. Clean State (Ensure uniformity)
    df['state'] = df['state'].apply(clean_state_name)

    # 3. Check for unknowns: raw labels whose cleaned form is not an official district.
    #    These rows are only reported here, not removed.
    unknowns = df.loc[~df['district_clean'].isin(OFFICIAL_DISTRICTS), 'district'].unique()
    if len(unknowns) > 0:
        print(f'⚠️ CRITICAL: The following districts are still NOT mapped to the official list:\n{unknowns}')

    # 4. Apply: drop rows without a usable district and replace the raw column.
    df.dropna(subset=['district_clean'], inplace=True)
    df['district'] = df['district_clean']
    df.drop(columns=['district_clean'], inplace=True)

    # 5. Dates (day-first parsing)
    df['date'] = pd.to_datetime(df['date'], dayfirst=True)
    # NOTE(review): this is the calendar month (1-12) only; if the data spans more
    # than one year the same month of different years is merged — confirm the range.
    df['month'] = df['date'].dt.month

# Verify
all_cleaned = set(enrol_df['district'].unique()) | set(demo_df['district'].unique()) | set(bio_df['district'].unique())
print(f'\nFinal Districts ({len(all_cleaned)}):')
print(sorted(all_cleaned))

# Compare the names, not just the count: three wrong names must not pass the check.
unexpected = all_cleaned - OFFICIAL_DISTRICTS
missing = OFFICIAL_DISTRICTS - all_cleaned
if unexpected or missing:
    print(f'⚠️ Warning: Expected {len(OFFICIAL_DISTRICTS)} districts, found {len(all_cleaned)}.')
    if unexpected:
        print(f'   Unexpected: {sorted(unexpected)}')
    if missing:
        print(f'   Missing: {sorted(missing)}')
else:
    print(f'✅ Exactly {len(OFFICIAL_DISTRICTS)} districts found!')


Final Districts (3):
['nicobars', 'north and middle andaman', 'south andaman']
✅ Exactly 3 districts found!


## 3. Aggregation & Metrics

In [5]:
# Aggregate each dataset to one row per (state, district, month).
KEY_COLUMNS = ['state', 'district', 'month']

enrol_agg = enrol_df.groupby(KEY_COLUMNS)[['age_0_5', 'age_5_17', 'age_18_greater']].sum().reset_index()
demo_agg = demo_df.groupby(KEY_COLUMNS)[['demo_age_5_17', 'demo_age_17_']].sum().reset_index()
bio_agg = bio_df.groupby(KEY_COLUMNS)[['bio_age_5_17', 'bio_age_17_']].sum().reset_index()

observed_df = enrol_agg.merge(demo_agg, on=KEY_COLUMNS, how='outer') \
                       .merge(bio_agg, on=KEY_COLUMNS, how='outer')

# Expand to the full district x month grid. Without this, a district that has no
# records at all in some month simply has no row for it, so that month can never be
# counted as a zero-activity month in the district-level metrics.
# (Months in which no district has any record are still not represented.)
district_keys = observed_df[['state', 'district']].drop_duplicates()
month_keys = pd.DataFrame({'month': sorted(observed_df['month'].unique())})
full_grid = district_keys.merge(month_keys, how='cross')

combined_df = (
    full_grid
    .merge(observed_df, on=KEY_COLUMNS, how='left')
    .fillna(0)  # missing dataset/month combinations mean zero counts
    .sort_values(KEY_COLUMNS)
    .reset_index(drop=True)
)

# Core metrics
# E = enrolments, DU = demographic updates, BU = biometric updates,
# U = all updates, T = total transactions.
combined_df['E'] = combined_df['age_0_5'] + combined_df['age_5_17'] + combined_df['age_18_greater']
combined_df['DU'] = combined_df['demo_age_5_17'] + combined_df['demo_age_17_']
combined_df['BU'] = combined_df['bio_age_5_17'] + combined_df['bio_age_17_']
combined_df['U'] = combined_df['DU'] + combined_df['BU']
combined_df['T'] = combined_df['E'] + combined_df['U']

print(f'Combined records: {len(combined_df)}')

Combined records: 33


In [6]:
# Per-district roll-up of the monthly table.
def _count_positive(values):
    """Number of entries strictly greater than zero."""
    return (values > 0).sum()


def _coefficient_of_variation(values):
    """Population standard deviation divided by the mean (0 when the mean is not positive)."""
    mean_value = values.mean()
    return values.std(ddof=0) / mean_value if mean_value > 0 else 0


def _peak_to_mean(values):
    """Maximum divided by the mean (0 when the mean is not positive)."""
    mean_value = values.mean()
    return values.max() / mean_value if mean_value > 0 else 0


district_df = (
    combined_df
    .groupby(['state', 'district'])
    .agg(
        total_months=('month', 'count'),
        active_months=('T', _count_positive),
        total_E=('E', 'sum'),
        total_DU=('DU', 'sum'),
        total_BU=('BU', 'sum'),
        total_U=('U', 'sum'),
        total_T=('T', 'sum'),
        avg_monthly_enrolment=('E', 'mean'),
        monthly_volatility=('T', _coefficient_of_variation),
        peak_load_ratio=('T', _peak_to_mean),
        sum_age_0_5=('age_0_5', 'sum'),
        sum_age_5_17=('age_5_17', 'sum'),
    )
    .reset_index()
)

# Derived ratios; each entry may use the columns defined before it.
district_df = district_df.assign(
    zero_months=lambda d: d['total_months'] - d['active_months'],
    activity_ratio=lambda d: d['active_months'] / d['total_months'],
    zero_month_ratio=lambda d: d['zero_months'] / d['total_months'],
    # 0/0 (no updates at all) yields NaN, which is reported as 0.
    biometric_burden=lambda d: (d['total_BU'] / (d['total_BU'] + d['total_DU'])).fillna(0),
    update_dominant=lambda d: np.where(d['total_U'] > d['total_E'], 1, 0),
    enrollment_update_balance=lambda d: (d['total_E'] / (d['total_E'] + d['total_U'])).fillna(0),
)

print(f'Districts computed: {len(district_df)}')

Districts computed: 3


## 4. DEI Score Calculation

In [7]:
def normalize(x):
    """Min-max scale a Series to the range [0, 1].

    When every value is identical the scale is undefined, so a Series of 0.5
    (same index as the input) is returned instead.
    """
    lowest = x.min()
    highest = x.max()
    if highest == lowest:
        return pd.Series(0.5, index=x.index)
    value_range = highest - lowest
    return (x - lowest) / value_range

def inverse_normalize(x):
    """Return the complement of ``normalize(x)``, so the largest input maps to 0."""
    scaled = normalize(x)
    return 1 - scaled

scores_df = district_df.copy()

# Ratio inputs. A district with no transactions (or no enrolments) produces 0/0 = NaN,
# which would propagate into DEI; report it as 0, as is already done for
# biometric_burden and enrollment_update_balance.
update_share = (scores_df['total_U'] / scores_df['total_T']).fillna(0)
child_enrolment_share = (
    (scores_df['sum_age_0_5'] + scores_df['sum_age_5_17']) / scores_df['total_E']
).fillna(0)

# DEI Components
# access: activity ratio averaged with normalised average monthly enrolment
scores_df['access'] = (scores_df['activity_ratio'] + normalize(scores_df['avg_monthly_enrolment'])) / 2
# responsiveness: normalised share of updates in all transactions
scores_df['responsiveness'] = normalize(update_share)
# inclusion: normalised share of enrolments in the 0-5 and 5-17 age columns
scores_df['inclusion'] = normalize(child_enrolment_share)
# stability: low volatility and low peak-to-mean ratio score high
scores_df['stability'] = (inverse_normalize(scores_df['monthly_volatility']) + inverse_normalize(scores_df['peak_load_ratio'])) / 2
# visibility: share of months with any activity
scores_df['visibility'] = scores_df['activity_ratio']

# Final scores
# DEI is the unweighted mean of the five components.
scores_df['DEI'] = (scores_df['access'] + scores_df['responsiveness'] + scores_df['inclusion'] + scores_df['stability'] + scores_df['visibility']) / 5
scores_df['ASS'] = (inverse_normalize(scores_df['activity_ratio']) + inverse_normalize(scores_df['avg_monthly_enrolment'])) / 2
scores_df['UBS'] = (normalize(scores_df['biometric_burden']) + normalize(scores_df['update_dominant'])) / 2
scores_df['SRS'] = (normalize(scores_df['monthly_volatility']) + normalize(scores_df['zero_month_ratio'])) / 2

print('DEI calculated!')
scores_df[['district', 'DEI', 'ASS', 'UBS', 'SRS']].sort_values('DEI', ascending=False)

DEI calculated!


Unnamed: 0,district,DEI,ASS,UBS,SRS
2,south andaman,0.819599,0.25,0.344708,0.324271
1,north and middle andaman,0.778896,0.561189,0.75,0.25
0,nicobars,0.4,0.75,0.25,0.75


In [9]:
# Headline figures: district count, mean DEI, and the best / worst district by DEI.
dei_scores = scores_df['DEI']
district_names = scores_df['district']

print('=== Andaman and Nicobar Summary ===')
print(f'Districts: {len(scores_df)}')
print(f'Avg DEI: {dei_scores.mean():.4f}')
for label, locate, extreme in (
    ('Best', dei_scores.idxmax, dei_scores.max),
    ('Worst', dei_scores.idxmin, dei_scores.min),
):
    print(f'{label}: {district_names.loc[locate()]} ({extreme():.4f})')

=== Andaman and Nicobar Summary ===
Districts: 3
Avg DEI: 0.6662
Best: south andaman (0.8196)
Worst: nicobars (0.4000)


In [8]:
# Write the full analysis table and the compact final-score table next to the data folder.
analysis_csv = os.path.join(BASE_PATH, 'andaman_district_analysis.csv')
scores_df.to_csv(analysis_csv, index=False)

final_columns = ['state', 'district', 'DEI', 'ASS', 'UBS', 'SRS']
final_scores_csv = os.path.join(BASE_PATH, 'andaman_district_final_scores.csv')
scores_df[final_columns].to_csv(final_scores_csv, index=False)

print('✅ Saved!')

✅ Saved!
