In [1]:
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [2]:
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', None)

In [None]:
df_enrolment = pd.read_csv('enrolment.csv')
df_demographic = pd.read_csv('demographic.csv')
df_biometric = pd.read_csv('biometric.csv')

print('Enrolment shape:', df_enrolment.shape)
print('Demographic shape:', df_demographic.shape)
print('Biometric shape:', df_biometric.shape)

Enrolment shape: (1965, 7)
Demographic shape: (5151, 6)
Biometric shape: (12749, 6)


In [None]:
for df in [df_enrolment, df_demographic, df_biometric]:
    df['date'] = pd.to_datetime(df['date'], dayfirst=True, errors='coerce')
    df['month'] = df['date'].dt.month

In [5]:
print('Enrolment districts:', df_enrolment['district'].nunique())
print('Demographic districts:', df_demographic['district'].nunique())
print('Biometric districts:', df_biometric['district'].nunique())

unique_districts = sorted(set(df_enrolment['district'].astype(str).unique()) | 
                          set(df_demographic['district'].astype(str).unique()) | 
                          set(df_biometric['district'].astype(str).unique()))

print('\nAll unique districts:')
print(unique_districts)

Enrolment districts: 25
Demographic districts: 25
Biometric districts: 25

All unique districts:
['Anjaw', 'Changlang', 'Dibang Valley', 'East Kameng', 'East Siang', 'Kamle', 'Kra Daadi', 'Kurung Kumey', 'Leparada', 'Lohit', 'Longding', 'Lower Dibang Valley', 'Lower Siang', 'Lower Subansiri', 'Namsai', 'Pakke Kessang', 'Papum Pare', 'Shi-yomi', 'Siang', 'Tawang', 'Tirap', 'Upper Siang', 'Upper Subansiri', 'West Kameng', 'West Siang']


In [None]:
def clean_district(name):
    if pd.isna(name):
        return name
    return str(name).lower().strip()

for df in [df_enrolment, df_demographic, df_biometric]:
    df['district'] = df['district'].apply(clean_district)

canonical_map = {
    'kra daadi': 'kra daadi',
    'kurung kumey': 'kurung kumey',
    'upper subansiri': 'upper subansiri',
    'west siang': 'west siang',
    'east siang': 'east siang',
    'siang': 'siang',
    'upper siang': 'upper siang',
    'lower siang': 'lower siang',
    'lower dibang valley': 'lower dibang valley',
    'dibang valley': 'dibang valley',
    'anjaw': 'anjaw',
    'lohit': 'lohit',
    'namsai': 'namsai',
    'changlang': 'changlang',
    'tirap': 'tirap',
    'longding': 'longding',
    'west kameng': 'west kameng',
    'east kameng': 'east kameng',
    'papum pare': 'papum pare',
    'pakke kessang': 'pakke kessang',
    'leparada': 'leparada',
    'shi yomi': 'shi yomi',
    'shi-yomi': 'shi yomi',
    'kamle': 'kamle',
    'tawang': 'tawang',
    'lower subansiri': 'lower subansiri',
    'itanagar capital complex': 'itanagar capital complex',
    'icc': 'itanagar capital complex'
}

for df in [df_enrolment, df_demographic, df_biometric]:
    df['district'] = df['district'].replace(canonical_map)

print('\nAfter cleanup:')
unique_cleaned = sorted(set(df_enrolment['district'].unique()) | 
                        set(df_demographic['district'].unique()) | 
                        set(df_biometric['district'].unique()))
print(unique_cleaned)
print(f"Total cleaned districts: {len(unique_cleaned)}")


After cleanup:
['anjaw', 'changlang', 'dibang valley', 'east kameng', 'east siang', 'kamle', 'kra daadi', 'kurung kumey', 'leparada', 'lohit', 'longding', 'lower dibang valley', 'lower siang', 'lower subansiri', 'namsai', 'pakke kessang', 'papum pare', 'shi yomi', 'siang', 'tawang', 'tirap', 'upper siang', 'upper subansiri', 'west kameng', 'west siang']
Total cleaned districts: 25


In [None]:
enrol_agg = df_enrolment.groupby(["state", "district", "month"])[["age_0_5", "age_5_17", "age_18_greater"]].sum().reset_index()
demo_agg = df_demographic.groupby(["state", "district", "month"])[["demo_age_5_17", "demo_age_17_"]].sum().reset_index()
bio_agg = df_biometric.groupby(["state", "district", "month"])[["bio_age_5_17", "bio_age_17_"]].sum().reset_index()

combined_df = enrol_agg.merge(demo_agg, on = ["state", "district", "month"], how = "left").merge(bio_agg, on = ["state", "district", "month"], how = "left")
combined_df.fillna(0, inplace=True)

combined_df["E"] = combined_df["age_0_5"] + combined_df["age_5_17"] + combined_df["age_18_greater"]
combined_df["DU"] = combined_df["demo_age_5_17"] + combined_df["demo_age_17_"]
combined_df["BU"] = combined_df["bio_age_5_17"] + combined_df["bio_age_17_"]
combined_df["U"] = combined_df["DU"] + combined_df["BU"]
combined_df["T"] = combined_df["E"] + combined_df["U"]

print("Combined shape:", combined_df.shape)
combined_df.head()

Combined shape: (142, 15)


Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T
0,Arunachal Pradesh,anjaw,6,3,18,0,0.0,0.0,94,137,21,0.0,231,231.0,252.0
1,Arunachal Pradesh,anjaw,9,16,5,0,29.0,150.0,61,65,21,179.0,126,305.0,326.0
2,Arunachal Pradesh,anjaw,10,1,2,0,22.0,52.0,91,79,3,74.0,170,244.0,247.0
3,Arunachal Pradesh,anjaw,11,2,4,3,26.0,135.0,281,83,9,161.0,364,525.0,534.0
4,Arunachal Pradesh,anjaw,12,3,0,0,17.0,116.0,65,64,3,133.0,129,262.0,265.0


In [None]:
district_monthly_counts = combined_df.groupby(["district", "month"]).agg(
    total_months = ("month", "count"), 
    active_months = ("T", lambda x : (x > 0).sum())
).reset_index()

district_monthly_counts["zero_months"] = district_monthly_counts["total_months"] - district_monthly_counts["active_months"]
district_monthly_counts["activity_ratio"] = district_monthly_counts["active_months"] / district_monthly_counts["total_months"]
district_monthly_counts["zero_month_ratio"] = district_monthly_counts["zero_months"] / district_monthly_counts["total_months"]

combined_df = combined_df.merge(district_monthly_counts[["district", "month", "activity_ratio", "zero_month_ratio"]], on = ["district", "month"], how = "left")

In [None]:
district_volume_metrics = combined_df.groupby(["state", "district"]).agg(
    avg_monthly_enrolment = ("E", "mean"),
    monthly_valatility = ("T", lambda x: x.std(ddof=0) / x.mean() if x.mean() > 0 else 0),
    peak_load_ratio = ("T", lambda x: x.max() / x.mean() if x.mean() > 0 else 0)
).reset_index()

combined_df = combined_df.merge(district_volume_metrics, on=["state", "district"], how="left")

In [None]:
district_update_burden = combined_df.groupby(["state", "district"]).agg(
    avg_monthly_enrollments = ("E", "sum"), 
    avg_monthly_demo_updates = ("DU", "sum"), 
    avg_monthly_bio_updates = ("BU", "sum")
).reset_index()
                                                                    
district_update_burden["U"] = district_update_burden["avg_monthly_demo_updates"] + district_update_burden["avg_monthly_bio_updates"]
district_update_burden["biometric_burden"] = district_update_burden["avg_monthly_bio_updates"] / (district_update_burden["avg_monthly_bio_updates"] + district_update_burden["avg_monthly_demo_updates"])
district_update_burden["update_dominant"] = np.where(district_update_burden["U"] > district_update_burden["avg_monthly_enrollments"], 1, 0)
district_update_burden["enrollment_update_balance"] = district_update_burden["avg_monthly_enrollments"] / (district_update_burden["avg_monthly_enrollments"] + district_update_burden["U"])

combined_df = combined_df.merge(district_update_burden[["state", "district", "biometric_burden", "update_dominant", "enrollment_update_balance"]], on=["state", "district"], how="left")

cols_to_drop = [c for c in combined_df.columns if c.endswith('_y')]
if cols_to_drop:
    combined_df.drop(cols_to_drop, axis=1, inplace=True)
    combined_df.rename(columns={c: c.replace('_x', '') for c in combined_df.columns if c.endswith('_x')}, inplace=True)

print("Final columns:", combined_df.columns.tolist())

Final columns: ['state', 'district', 'month', 'age_0_5', 'age_5_17', 'age_18_greater', 'demo_age_5_17', 'demo_age_17_', 'bio_age_5_17', 'bio_age_17_', 'E', 'DU', 'BU', 'U', 'T', 'activity_ratio', 'zero_month_ratio', 'avg_monthly_enrolment', 'monthly_valatility', 'peak_load_ratio', 'biometric_burden', 'update_dominant', 'enrollment_update_balance']


In [None]:
# Collapse to one row per district for final scoring
combined_df = combined_df.groupby(["state", "district"], as_index = False).first()

def normalize(x):
    maxx, minx = x.max(), x.min()
    if maxx == minx:
        return x * 0 + 0.5
    normalized = (x - minx) / (maxx - minx)
    return normalized

def inverse_normalize(x):
    inversed = 1 - normalize(x)
    return inversed

combined_df["access"] = (combined_df["activity_ratio"] + normalize(combined_df["avg_monthly_enrolment"])) / 2
combined_df["responsiveness"] = normalize(combined_df["U"] / (combined_df["E"] + combined_df["U"]))
combined_df["inclusion"] = normalize((combined_df["age_0_5"] + combined_df["age_5_17"]) / combined_df["E"])
combined_df["stability"] = (inverse_normalize(combined_df["monthly_valatility"]) + inverse_normalize(combined_df["peak_load_ratio"])) / 2
combined_df["visibility"] = combined_df["activity_ratio"]

combined_df["DEI"] = (combined_df["access"] + combined_df["responsiveness"] + combined_df["inclusion"] + combined_df["stability"] + combined_df["visibility"]) / 5
combined_df["ASS"] = (inverse_normalize(combined_df["activity_ratio"]) + inverse_normalize(combined_df["avg_monthly_enrolment"])) / 2
combined_df["UBS"] = (normalize(combined_df["biometric_burden"]) + normalize(combined_df["update_dominant"])) / 2
combined_df["SRS"] = (normalize(combined_df["monthly_valatility"]) + normalize(combined_df["zero_month_ratio"])) / 2

combined_df.drop(["access", "responsiveness", "inclusion", "stability", "visibility"], axis=1, inplace=True)
combined_df.head()

Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio,zero_month_ratio,avg_monthly_enrolment,monthly_valatility,peak_load_ratio,biometric_burden,update_dominant,enrollment_update_balance,DEI,ASS,UBS,SRS
0,Arunachal Pradesh,anjaw,6,3,18,0,0.0,0.0,94,137,21,0.0,231,231.0,252.0,1.0,0.0,11.4,0.333579,1.644089,0.650925,1,0.035099,0.820798,0.717158,0.512937,0.334298
1,Arunachal Pradesh,changlang,1,6,2,0,5.0,23.0,35,31,8,28.0,66,94.0,102.0,1.0,0.0,159.333333,0.641314,2.023048,0.620763,1,0.074945,0.859919,0.25,0.4734,0.517389
2,Arunachal Pradesh,dibang valley,1,0,2,0,1.0,2.0,0,1,2,3.0,1,4.0,6.0,1.0,0.0,3.333333,0.765638,2.570162,0.770167,1,0.029542,0.559201,0.742632,0.669239,0.591357
3,Arunachal Pradesh,east kameng,1,1,3,0,3.0,6.0,18,5,4,9.0,23,32.0,36.0,1.0,0.0,44.833333,0.671679,2.160224,0.659064,1,0.050233,0.7563,0.611579,0.523606,0.535456
4,Arunachal Pradesh,east siang,1,0,3,0,0.0,14.0,15,9,3,14.0,24,38.0,41.0,1.0,0.0,38.666667,0.551209,1.598158,0.593694,1,0.044521,0.820912,0.631053,0.437918,0.46378


In [None]:
print(f"Total districts: {len(combined_df)}")
print(f"\nScore summary statistics:")
for col in ['DEI', 'ASS', 'UBS', 'SRS']:
    print(f"  {col}: min={combined_df[col].min():.4f}, max={combined_df[col].max():.4f}, mean={combined_df[col].mean():.4f}")

for col in ['DEI', 'ASS', 'UBS', 'SRS']:
    if combined_df[col].isnull().any():
        print(f"WARNING: NaN values found in {col}")
        combined_df[col].fillna(0, inplace=True)
    
    assert combined_df[col].min() >= -0.01, f"{col} has negative values"
    assert combined_df[col].max() <= 1.01, f"{col} exceeds 1"
print("\n✓ All scores in valid [0, 1] range")

Total districts: 25

Score summary statistics:
  DEI: min=0.5592, max=0.8826, mean=0.7900
  ASS: min=0.2500, max=0.7500, mean=0.6413
  UBS: min=0.2500, max=0.7500, mean=0.5628
  SRS: min=0.2500, max=0.7500, mean=0.4551

✓ All scores in valid [0, 1] range


In [13]:
combined_df.to_csv("arunachalpradesh_district_analysis.csv", index=False)
final_df = combined_df[["state", "district", "DEI", "ASS", "UBS", "SRS"]]
final_df.to_csv("arunachalpradesh_district_final_scores.csv", index=False)
print("✓ Saved arunachalpradesh_district_analysis.csv")
print("✓ Saved arunachalpradesh_district_final_scores.csv")

✓ Saved arunachalpradesh_district_analysis.csv
✓ Saved arunachalpradesh_district_final_scores.csv
