In [None]:
import numpy as np
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt

In [13]:
pd.set_option("display.max_rows", 2000)
pd.set_option("display.max_columns", None)

In [None]:
enrol_df = pd.read_csv(r"D:\UIDAI hackathon\North\Ladakh\enrollment.csv")
demo_df = pd.read_csv(r"D:\UIDAI hackathon\North\Ladakh\demographic.csv")
bio_df = pd.read_csv(r"D:\UIDAI hackathon\North\Ladakh\biometric.csv")

In [15]:
all_dfs = [enrol_df, demo_df, bio_df]

In [16]:
enrol_df.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,01-09-2025,Ladakh,Kargil,194103,8,1,0
1,02-09-2025,Ladakh,Kargil,194105,2,1,0
2,06-09-2025,Ladakh,Kargil,194102,1,0,0
3,04-09-2025,Ladakh,Kargil,194103,1,0,0
4,04-09-2025,Ladakh,Kargil,194105,1,1,0


In [17]:
enrol_df["date"] = pd.to_datetime(enrol_df["date"], dayfirst=True)
demo_df["date"] = pd.to_datetime(demo_df["date"], dayfirst=True)
bio_df["date"] = pd.to_datetime(bio_df["date"], dayfirst=True)

In [18]:
print(enrol_df["district"].nunique(), demo_df["district"].nunique(), bio_df["district"].nunique())

2 2 2


In [19]:
enrol_df["district"].value_counts().sort_index()

district
Kargil    264
Leh        99
Name: count, dtype: int64

In [20]:
bio_df["district"].value_counts().sort_index()

district
Kargil    1302
Leh       1045
Name: count, dtype: int64

In [21]:
demo_df["district"].value_counts().sort_index()

district
Kargil    553
Leh       471
Name: count, dtype: int64

In [22]:
cleanup_map = {}

for df in all_dfs:
    df["district"] = df["district"].replace(cleanup_map)
    
print(enrol_df["district"].nunique(), demo_df["district"].nunique(), bio_df["district"].nunique())

2 2 2


In [23]:
enrol_df["district"] = enrol_df["district"].str.lower()
demo_df["district"] = demo_df["district"].str.lower()
bio_df["district"] = bio_df["district"].str.lower()

In [24]:
print(enrol_df["pincode"].nunique(), demo_df["pincode"].nunique(), bio_df["pincode"].nunique())

13 15 15


In [25]:
# Analyze pincode differences
enrol_pincodes = set(enrol_df["pincode"].dropna())
demo_pincodes = set(demo_df["pincode"].dropna())
bio_pincodes = set(bio_df["pincode"].dropna())

print("Enrollment pincodes:", len(enrol_pincodes))
print("Demographic pincodes:", len(demo_pincodes))
print("Biometric pincodes:", len(bio_pincodes))
print()

# Find differences
only_in_enrol = enrol_pincodes - demo_pincodes - bio_pincodes
only_in_demo = demo_pincodes - enrol_pincodes - bio_pincodes
only_in_bio = bio_pincodes - enrol_pincodes - demo_pincodes
in_all = enrol_pincodes & demo_pincodes & bio_pincodes

print(f"Pincodes only in enrollment: {len(only_in_enrol)}")
print(f"Pincodes only in demographic: {len(only_in_demo)}")
print(f"Pincodes only in biometric: {len(only_in_bio)}")
print(f"Pincodes in all three: {len(in_all)}")
print()

# Check for NULL/NaN values
print("NULL pincodes:")
print(f"Enrollment: {enrol_df["pincode"].isna().sum()}")
print(f"Demographic: {demo_df["pincode"].isna().sum()}")
print(f"Biometric: {bio_df["pincode"].isna().sum()}")
print()

# Show some examples of unique pincodes
print("Sample pincodes only in enrollment:", list(only_in_enrol))
print("Sample pincodes only in demographic:", list(only_in_demo))
print("Sample pincodes only in biometric:", list(only_in_bio))

Enrollment pincodes: 13
Demographic pincodes: 15
Biometric pincodes: 15

Pincodes only in enrollment: 0
Pincodes only in demographic: 0
Pincodes only in biometric: 0
Pincodes in all three: 13

NULL pincodes:
Enrollment: 0
Demographic: 0
Biometric: 0

Sample pincodes only in enrollment: []
Sample pincodes only in demographic: []
Sample pincodes only in biometric: []


In [26]:
for df in all_dfs:
    df["month"] = df["date"].dt.month

In [27]:
enrol_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363 entries, 0 to 362
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            363 non-null    datetime64[ns]
 1   state           363 non-null    object        
 2   district        363 non-null    object        
 3   pincode         363 non-null    int64         
 4   age_0_5         363 non-null    int64         
 5   age_5_17        363 non-null    int64         
 6   age_18_greater  363 non-null    int64         
 7   month           363 non-null    int32         
dtypes: datetime64[ns](1), int32(1), int64(4), object(2)
memory usage: 21.4+ KB


In [28]:
demo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1024 entries, 0 to 1023
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           1024 non-null   datetime64[ns]
 1   state          1024 non-null   object        
 2   district       1024 non-null   object        
 3   pincode        1024 non-null   int64         
 4   demo_age_5_17  1024 non-null   int64         
 5   demo_age_17_   1024 non-null   int64         
 6   month          1024 non-null   int32         
dtypes: datetime64[ns](1), int32(1), int64(3), object(2)
memory usage: 52.1+ KB


In [29]:
bio_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2347 entries, 0 to 2346
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          2347 non-null   datetime64[ns]
 1   state         2347 non-null   object        
 2   district      2347 non-null   object        
 3   pincode       2347 non-null   int64         
 4   bio_age_5_17  2347 non-null   int64         
 5   bio_age_17_   2347 non-null   int64         
 6   month         2347 non-null   int32         
dtypes: datetime64[ns](1), int32(1), int64(3), object(2)
memory usage: 119.3+ KB


In [30]:
enrol_df.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,month
0,2025-09-01,Ladakh,kargil,194103,8,1,0,9
1,2025-09-02,Ladakh,kargil,194105,2,1,0,9
2,2025-09-06,Ladakh,kargil,194102,1,0,0,9
3,2025-09-04,Ladakh,kargil,194103,1,0,0,9
4,2025-09-04,Ladakh,kargil,194105,1,1,0,9


In [31]:
demo_df.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_,month
0,2025-03-01,Ladakh,kargil,194301,89,234,3
1,2025-03-01,Ladakh,kargil,194109,68,133,3
2,2025-09-01,Ladakh,leh,194104,0,1,9
3,2025-09-01,Ladakh,leh,194106,0,3,9
4,2025-09-02,Ladakh,kargil,194103,0,2,9


In [32]:
bio_df.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,month
0,2025-03-01,Ladakh,leh,194106,33,42,3
1,2025-05-01,Ladakh,leh,194101,48,70,5
2,2025-03-01,Ladakh,leh,194101,78,101,3
3,2025-04-01,Ladakh,leh,194101,52,115,4
4,2025-04-01,Ladakh,leh,194401,17,40,4


In [33]:
enrol_agg = enrol_df.groupby(["state", "district", "month"])[["age_0_5", "age_5_17", "age_18_greater"]].sum().reset_index()
demo_agg = demo_df.groupby(["state", "district", "month"])[["demo_age_5_17", "demo_age_17_"]].sum().reset_index()
bio_agg = bio_df.groupby(["state", "district", "month"])[["bio_age_5_17", "bio_age_17_"]].sum().reset_index()

combined_df = enrol_agg.merge(demo_agg, on = ["state", "district", "month"], how = "left").merge(bio_agg, on = ["state", "district", "month"], how = "left")
combined_df.fillna(0, inplace=True)
combined_df.head()

Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_
0,Ladakh,kargil,1,13,4,0,6.0,38.0,18,12
1,Ladakh,kargil,6,18,26,1,0.0,0.0,89,314
2,Ladakh,kargil,9,101,44,0,45.0,204.0,609,230
3,Ladakh,kargil,10,45,37,4,49.0,131.0,315,194
4,Ladakh,kargil,11,100,28,3,143.0,427.0,272,210


In [34]:
combined_df["E"] = combined_df["age_0_5"] + combined_df["age_5_17"] + combined_df["age_18_greater"]
combined_df["DU"] = combined_df["demo_age_5_17"] + combined_df["demo_age_17_"]
combined_df["BU"] = combined_df["bio_age_5_17"] + combined_df["bio_age_17_"]
combined_df["U"] = combined_df["DU"] + combined_df["BU"]
combined_df["T"] = combined_df["E"] + combined_df["U"]
combined_df.head(20)

Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T
0,Ladakh,kargil,1,13,4,0,6.0,38.0,18,12,17,44.0,30,74.0,91.0
1,Ladakh,kargil,6,18,26,1,0.0,0.0,89,314,45,0.0,403,403.0,448.0
2,Ladakh,kargil,9,101,44,0,45.0,204.0,609,230,145,249.0,839,1088.0,1233.0
3,Ladakh,kargil,10,45,37,4,49.0,131.0,315,194,86,180.0,509,689.0,775.0
4,Ladakh,kargil,11,100,28,3,143.0,427.0,272,210,131,570.0,482,1052.0,1183.0
5,Ladakh,kargil,12,126,12,1,208.0,625.0,237,224,139,833.0,461,1294.0,1433.0
6,Ladakh,leh,1,6,1,0,5.0,31.0,8,17,7,36.0,25,61.0,68.0
7,Ladakh,leh,6,6,1,0,0.0,0.0,141,475,7,0.0,616,616.0,623.0
8,Ladakh,leh,9,23,2,5,17.0,190.0,281,197,30,207.0,478,685.0,715.0
9,Ladakh,leh,10,24,7,2,34.0,121.0,368,183,33,155.0,551,706.0,739.0


In [35]:
district_monthly_counts = combined_df.groupby(["district", "month"]).agg(total_months = ("month", "count"), active_months = ("T", lambda x : (x > 0).sum())).reset_index()
district_monthly_counts.head()

Unnamed: 0,district,month,total_months,active_months
0,kargil,1,1,1
1,kargil,6,1,1
2,kargil,9,1,1
3,kargil,10,1,1
4,kargil,11,1,1


In [36]:
district_monthly_counts["zero_months"] = district_monthly_counts["total_months"] - district_monthly_counts["active_months"]
district_monthly_counts["activity_ratio"] = district_monthly_counts["active_months"] / district_monthly_counts["total_months"]
district_monthly_counts["zero_month_ratio"] = district_monthly_counts["zero_months"] / district_monthly_counts["total_months"]

combined_df = combined_df.merge(district_monthly_counts[["district", "month", "activity_ratio", "zero_month_ratio"]], on = ["district", "month"], how = "left")
combined_df.head()

Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio,zero_month_ratio
0,Ladakh,kargil,1,13,4,0,6.0,38.0,18,12,17,44.0,30,74.0,91.0,1.0,0.0
1,Ladakh,kargil,6,18,26,1,0.0,0.0,89,314,45,0.0,403,403.0,448.0,1.0,0.0
2,Ladakh,kargil,9,101,44,0,45.0,204.0,609,230,145,249.0,839,1088.0,1233.0,1.0,0.0
3,Ladakh,kargil,10,45,37,4,49.0,131.0,315,194,86,180.0,509,689.0,775.0,1.0,0.0
4,Ladakh,kargil,11,100,28,3,143.0,427.0,272,210,131,570.0,482,1052.0,1183.0,1.0,0.0


In [37]:
combined_df = combined_df.merge(district_monthly_counts[["district", "month", "activity_ratio", "zero_month_ratio"]], on = ["district", "month"], how = "left")
combined_df.head()

Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio_x,zero_month_ratio_x,activity_ratio_y,zero_month_ratio_y
0,Ladakh,kargil,1,13,4,0,6.0,38.0,18,12,17,44.0,30,74.0,91.0,1.0,0.0,1.0,0.0
1,Ladakh,kargil,6,18,26,1,0.0,0.0,89,314,45,0.0,403,403.0,448.0,1.0,0.0,1.0,0.0
2,Ladakh,kargil,9,101,44,0,45.0,204.0,609,230,145,249.0,839,1088.0,1233.0,1.0,0.0,1.0,0.0
3,Ladakh,kargil,10,45,37,4,49.0,131.0,315,194,86,180.0,509,689.0,775.0,1.0,0.0,1.0,0.0
4,Ladakh,kargil,11,100,28,3,143.0,427.0,272,210,131,570.0,482,1052.0,1183.0,1.0,0.0,1.0,0.0


In [38]:
district_volume_metrics = combined_df.groupby(["state", "district"]).agg(avg_monthly_enrolment = ("E", "mean"),
                                               monthly_valatility = ("T", lambda x: x.std(ddof=0) / x.mean() if x.mean() > 0 else 0),
                                               peak_load_ratio = ("T", lambda x: x.max() / x.mean() if x.mean() > 0 else 0)).reset_index()


combined_df = combined_df.merge(district_volume_metrics, on=["state", "district"], how="left")
combined_df.head()

Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio_x,zero_month_ratio_x,activity_ratio_y,zero_month_ratio_y,avg_monthly_enrolment,monthly_valatility,peak_load_ratio
0,Ladakh,kargil,1,13,4,0,6.0,38.0,18,12,17,44.0,30,74.0,91.0,1.0,0.0,1.0,0.0,93.833333,0.549218,1.665311
1,Ladakh,kargil,6,18,26,1,0.0,0.0,89,314,45,0.0,403,403.0,448.0,1.0,0.0,1.0,0.0,93.833333,0.549218,1.665311
2,Ladakh,kargil,9,101,44,0,45.0,204.0,609,230,145,249.0,839,1088.0,1233.0,1.0,0.0,1.0,0.0,93.833333,0.549218,1.665311
3,Ladakh,kargil,10,45,37,4,49.0,131.0,315,194,86,180.0,509,689.0,775.0,1.0,0.0,1.0,0.0,93.833333,0.549218,1.665311
4,Ladakh,kargil,11,100,28,3,143.0,427.0,272,210,131,570.0,482,1052.0,1183.0,1.0,0.0,1.0,0.0,93.833333,0.549218,1.665311


In [39]:
district_update_burden = combined_df.groupby(["state", "district"]).agg(avg_monthly_enrollments = ("E", "sum"), avg_monthly_demo_updates = ("DU", "sum"), avg_monthly_bio_updates = ("BU", "sum")).reset_index()

district_update_burden["U"] = district_update_burden["avg_monthly_demo_updates"] + district_update_burden["avg_monthly_bio_updates"]
district_update_burden["biometric_burden"] = district_update_burden["avg_monthly_bio_updates"] / (district_update_burden["avg_monthly_bio_updates"] + district_update_burden["avg_monthly_demo_updates"])
district_update_burden["update_dominant"] = np.where(district_update_burden["U"] > district_update_burden["avg_monthly_enrollments"], 1, 0)
district_update_burden["enrollment_update_balance"] = district_update_burden["avg_monthly_enrollments"] / (district_update_burden["avg_monthly_enrollments"] + district_update_burden["U"])

combined_df = combined_df.merge(district_update_burden[["state", "district", "biometric_burden", "update_dominant", "enrollment_update_balance"]], on=["state", "district"], how="left")
combined_df.head()

Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio_x,zero_month_ratio_x,activity_ratio_y,zero_month_ratio_y,avg_monthly_enrolment,monthly_valatility,peak_load_ratio,biometric_burden,update_dominant,enrollment_update_balance
0,Ladakh,kargil,1,13,4,0,6.0,38.0,18,12,17,44.0,30,74.0,91.0,1.0,0.0,1.0,0.0,93.833333,0.549218,1.665311,0.592174,1,0.109045
1,Ladakh,kargil,6,18,26,1,0.0,0.0,89,314,45,0.0,403,403.0,448.0,1.0,0.0,1.0,0.0,93.833333,0.549218,1.665311,0.592174,1,0.109045
2,Ladakh,kargil,9,101,44,0,45.0,204.0,609,230,145,249.0,839,1088.0,1233.0,1.0,0.0,1.0,0.0,93.833333,0.549218,1.665311,0.592174,1,0.109045
3,Ladakh,kargil,10,45,37,4,49.0,131.0,315,194,86,180.0,509,689.0,775.0,1.0,0.0,1.0,0.0,93.833333,0.549218,1.665311,0.592174,1,0.109045
4,Ladakh,kargil,11,100,28,3,143.0,427.0,272,210,131,570.0,482,1052.0,1183.0,1.0,0.0,1.0,0.0,93.833333,0.549218,1.665311,0.592174,1,0.109045


In [40]:
combined_df.drop(["activity_ratio_y", "zero_month_ratio_y"], axis=1, inplace=True)
combined_df.rename(columns={"activity_ratio_x": "activity_ratio", "zero_month_ratio_x": "zero_month_ratio"}, inplace=True)

In [41]:
combined_df = combined_df.groupby(["state", "district"], as_index = False).first()
combined_df.head()

Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio,zero_month_ratio,avg_monthly_enrolment,monthly_valatility,peak_load_ratio,biometric_burden,update_dominant,enrollment_update_balance
0,Ladakh,kargil,1,13,4,0,6.0,38.0,18,12,17,44.0,30,74.0,91.0,1.0,0.0,93.833333,0.549218,1.665311,0.592174,1,0.109045
1,Ladakh,leh,1,6,1,0,5.0,31.0,8,17,7,36.0,25,61.0,68.0,1.0,0.0,24.166667,0.554876,1.745074,0.561244,1,0.029762


In [42]:
def normalize(x):
    maxx, minx = x.max(), x.min()
    if maxx == minx:
        return x * 0 + 0.5
    normalized = (x - minx) / (maxx - minx)
    return normalized

def inverse_normalize(x):
    inversed = 1 - normalize(x)
    return inversed

In [43]:
combined_df["access"] = (combined_df["activity_ratio"] + normalize(combined_df["avg_monthly_enrolment"])) / 2
combined_df["responsiveness"] = normalize(combined_df["U"] / (combined_df["E"] + combined_df["U"]))
combined_df["inclusion"] = normalize((combined_df["age_0_5"] + combined_df["age_5_17"]) / combined_df["E"])
combined_df["stability"] = (inverse_normalize(combined_df["monthly_valatility"]) + inverse_normalize(combined_df["peak_load_ratio"])) / 2
combined_df["visibility"] = combined_df["activity_ratio"]

combined_df["DEI"] = (combined_df["access"] + combined_df["responsiveness"] + combined_df["inclusion"] + combined_df["stability"] + combined_df["visibility"]) / 5
combined_df["ASS"] = (inverse_normalize(combined_df["activity_ratio"]) + inverse_normalize(combined_df["avg_monthly_enrolment"])) / 2
combined_df["UBS"] = (normalize(combined_df["biometric_burden"]) + normalize(combined_df["update_dominant"])) / 2
combined_df["SRS"] = (normalize(combined_df["monthly_valatility"]) + normalize(combined_df["zero_month_ratio"])) / 2

combined_df.head()


Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio,zero_month_ratio,avg_monthly_enrolment,monthly_valatility,peak_load_ratio,biometric_burden,update_dominant,enrollment_update_balance,access,responsiveness,inclusion,stability,visibility,DEI,ASS,UBS,SRS
0,Ladakh,kargil,1,13,4,0,6.0,38.0,18,12,17,44.0,30,74.0,91.0,1.0,0.0,93.833333,0.549218,1.665311,0.592174,1,0.109045,1.0,0.0,0.5,1.0,1.0,0.7,0.25,0.75,0.25
1,Ladakh,leh,1,6,1,0,5.0,31.0,8,17,7,36.0,25,61.0,68.0,1.0,0.0,24.166667,0.554876,1.745074,0.561244,1,0.029762,0.5,1.0,0.5,0.0,1.0,0.6,0.75,0.25,0.75


In [44]:
combined_df.drop(["access", "responsiveness", "inclusion", "stability", "visibility"], axis=1, inplace=True)
combined_df.head()

Unnamed: 0,state,district,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio,zero_month_ratio,avg_monthly_enrolment,monthly_valatility,peak_load_ratio,biometric_burden,update_dominant,enrollment_update_balance,DEI,ASS,UBS,SRS
0,Ladakh,kargil,1,13,4,0,6.0,38.0,18,12,17,44.0,30,74.0,91.0,1.0,0.0,93.833333,0.549218,1.665311,0.592174,1,0.109045,0.7,0.25,0.75,0.25
1,Ladakh,leh,1,6,1,0,5.0,31.0,8,17,7,36.0,25,61.0,68.0,1.0,0.0,24.166667,0.554876,1.745074,0.561244,1,0.029762,0.6,0.75,0.25,0.75


In [None]:
combined_df.to_csv(r"D:\UIDAI hackathon\North\Ladakh\ladakh_district_analysis.csv", index=False)
final_df = combined_df[["state", "district", "DEI", "ASS", "UBS", "SRS"]]
final_df.to_csv(r"D:\UIDAI hackathon\North\Ladakh\ladakh_district_final_scores.csv", index=False)