In [93]:
import os
import numpy as np
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt

In [94]:
pd.set_option("display.max_rows", 2000)
pd.set_option("display.max_columns", None)

In [95]:
base_path = r"D:\UIDAI hackathon\all_states"

enrol_dfs = []
enrol_dir = os.path.join(base_path, "api_data_aadhar_enrolment")
if os.path.exists(enrol_dir):
    for root, dirs, files in os.walk(enrol_dir):
        for file in files:
            if file.endswith(".csv"):
                file_path = os.path.join(root, file)
                enrol_dfs.append(pd.read_csv(file_path))
                print(f"Loaded: {file}")

demo_dfs = []
demo_dir = os.path.join(base_path, "api_data_aadhar_demographic")
if os.path.exists(demo_dir):
    for root, dirs, files in os.walk(demo_dir):
        for file in files:
            if file.endswith(".csv"):
                file_path = os.path.join(root, file)
                demo_dfs.append(pd.read_csv(file_path))
                print(f"Loaded: {file}")

bio_dfs = []
bio_dir = os.path.join(base_path, "api_data_aadhar_biometric")
if os.path.exists(bio_dir):
    for root, dirs, files in os.walk(bio_dir):
        for file in files:
            if file.endswith(".csv"):
                file_path = os.path.join(root, file)
                bio_dfs.append(pd.read_csv(file_path))
                print(f"Loaded: {file}")

# Concatenate all files from each folder
enrol_df = pd.concat(enrol_dfs, ignore_index=True) if enrol_dfs else pd.DataFrame()
demo_df = pd.concat(demo_dfs, ignore_index=True) if demo_dfs else pd.DataFrame()
bio_df = pd.concat(bio_dfs, ignore_index=True) if bio_dfs else pd.DataFrame()


Loaded: api_data_aadhar_enrolment_0_500000.csv
Loaded: api_data_aadhar_enrolment_1000000_1006029.csv
Loaded: api_data_aadhar_enrolment_500000_1000000.csv
Loaded: api_data_aadhar_demographic_0_500000.csv
Loaded: api_data_aadhar_demographic_1000000_1500000.csv
Loaded: api_data_aadhar_demographic_1500000_2000000.csv
Loaded: api_data_aadhar_demographic_2000000_2071700.csv
Loaded: api_data_aadhar_demographic_500000_1000000.csv
Loaded: api_data_aadhar_biometric_0_500000.csv
Loaded: api_data_aadhar_biometric_1000000_1500000.csv
Loaded: api_data_aadhar_biometric_1500000_1861108.csv
Loaded: api_data_aadhar_biometric_500000_1000000.csv


In [96]:
all_dfs = [enrol_df, demo_df, bio_df]

In [97]:
enrol_df.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21


In [98]:
enrol_df["date"] = pd.to_datetime(enrol_df["date"], dayfirst=True)
demo_df["date"] = pd.to_datetime(demo_df["date"], dayfirst=True)
bio_df["date"] = pd.to_datetime(bio_df["date"], dayfirst=True)

In [99]:
print(enrol_df["state"].nunique(), demo_df["state"].nunique(), bio_df["state"].nunique())

55 65 57


In [100]:
enrol_df["state"].value_counts().sort_index()

state
100000                                              22
Andaman & Nicobar Islands                          103
Andaman and Nicobar Islands                        289
Andhra Pradesh                                   65658
Arunachal Pradesh                                 1601
Assam                                            31827
Bihar                                            60567
Chandigarh                                         859
Chhattisgarh                                     18550
Dadra & Nagar Haveli                                24
Dadra and Nagar Haveli                             162
Dadra and Nagar Haveli and Daman and Diu           116
Daman & Diu                                         20
Daman and Diu                                       92
Delhi                                             6804
Goa                                               1527
Gujarat                                          46624
Haryana                                          15997
Hima

In [101]:
bio_df["state"].value_counts().sort_index()

state
Andaman & Nicobar Islands                      549
Andaman and Nicobar Islands                   1298
Andhra Pradesh                              172034
Arunachal Pradesh                             4244
Assam                                        47643
Bihar                                        83398
Chandigarh                                    1656
Chhatisgarh                                      5
Chhattisgarh                                 31992
Dadra & Nagar Haveli                           100
Dadra and Nagar Haveli                         325
Dadra and Nagar Haveli and Daman and Diu       224
Daman & Diu                                    264
Daman and Diu                                  412
Delhi                                         9259
Goa                                           5428
Gujarat                                      89531
Haryana                                      26429
Himachal Pradesh                             30385
Jammu & Kashmir          

In [102]:
demo_df["state"].value_counts().sort_index()

state
100000                                           2
Andaman & Nicobar Islands                      513
Andaman and Nicobar Islands                   1211
Andhra Pradesh                              207687
Arunachal Pradesh                             4726
Assam                                        62834
BALANAGAR                                        2
Bihar                                        97621
Chandigarh                                    2044
Chhatisgarh                                      4
Chhattisgarh                                 35726
Dadra & Nagar Haveli                           100
Dadra and Nagar Haveli                         325
Dadra and Nagar Haveli and Daman and Diu       524
Daman & Diu                                    267
Daman and Diu                                  411
Darbhanga                                        2
Delhi                                        10510
Goa                                           5921
Gujarat                  

In [None]:
cleanup_map = {
    "west bengal": "west bengal", "west bengli": "west bengal", 
    "westbengal": "west bengal", "west bangal": "west bengal",
    "odisha": "odisha", "orissa": "odisha",
    "dadra and nagar haveli": "dadra and nagar haveli and daman and diu",
    "daman and diu": "dadra and nagar haveli and daman and diu",
    "dadra & nagar haveli": "dadra and nagar haveli and daman and diu",
    "daman & diu": "dadra and nagar haveli and daman and diu",
    "the dadra and nagar haveli and daman and diu": "dadra and nagar haveli and daman and diu",
    "andhra pradesh": "andhra pradesh",
    "jammu & kashmir": "jammu and kashmir",
    "jammu and kashmir": "jammu and kashmir",
    "andaman & nicobar islands": "andaman and nicobar islands",
    "andaman and nicobar islands": "andaman and nicobar islands",
    "pondicherry": "puducherry",
    "puducherry": "puducherry",
    "uttaranchal": "uttarakhand",
    "uttarakhand": "uttarakhand",
    "chhatisgarh": "chhattisgarh",
    "tamilnadu": "tamil nadu"
}

valid_states = [
    "andhra pradesh", "arunachal pradesh", "assam", "bihar", "chhattisgarh", 
    "goa", "gujarat", "haryana", "himachal pradesh", "jharkhand", "karnataka", 
    "kerala", "madhya pradesh", "maharashtra", "manipur", "meghalaya", "mizoram", 
    "nagaland", "odisha", "punjab", "rajasthan", "sikkim", "tamil nadu", 
    "telangana", "tripura", "uttar pradesh", "uttarakhand", "west bengal",
    "andaman and nicobar islands", "chandigarh", "dadra and nagar haveli and daman and diu", 
    "delhi", "jammu and kashmir", "ladakh", "lakshadweep", "puducherry"
]

for df in all_dfs:
    df["state"] = df["state"].astype(str).str.lower().str.strip()
    df["state"] = df["state"].replace(cleanup_map)
    df.drop(df[~df["state"].isin(valid_states)].index, inplace=True)

print(enrol_df["state"].nunique()) # This should now show 36

29 29 29


In [104]:
enrol_df["state"] = enrol_df["state"].str.lower()
demo_df["state"] = demo_df["state"].str.lower()
bio_df["state"] = bio_df["state"].str.lower()

In [105]:
print(enrol_df["district"].nunique(), demo_df["district"].nunique(), bio_df["district"].nunique())

789 781 779


In [106]:
for df in all_dfs:
    df["month"] = df["date"].dt.month

In [107]:
enrol_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 792759 entries, 0 to 1006028
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   date            792759 non-null  datetime64[ns]
 1   state           792759 non-null  object        
 2   district        792759 non-null  object        
 3   pincode         792759 non-null  int64         
 4   age_0_5         792759 non-null  int64         
 5   age_5_17        792759 non-null  int64         
 6   age_18_greater  792759 non-null  int64         
 7   month           792759 non-null  int32         
dtypes: datetime64[ns](1), int32(1), int64(4), object(2)
memory usage: 51.4+ MB


In [108]:
demo_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1537554 entries, 0 to 2071699
Data columns (total 7 columns):
 #   Column         Non-Null Count    Dtype         
---  ------         --------------    -----         
 0   date           1537554 non-null  datetime64[ns]
 1   state          1537554 non-null  object        
 2   district       1537554 non-null  object        
 3   pincode        1537554 non-null  int64         
 4   demo_age_5_17  1537554 non-null  int64         
 5   demo_age_17_   1537554 non-null  int64         
 6   month          1537554 non-null  int32         
dtypes: datetime64[ns](1), int32(1), int64(3), object(2)
memory usage: 88.0+ MB


In [109]:
bio_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1408041 entries, 0 to 1861107
Data columns (total 7 columns):
 #   Column        Non-Null Count    Dtype         
---  ------        --------------    -----         
 0   date          1408041 non-null  datetime64[ns]
 1   state         1408041 non-null  object        
 2   district      1408041 non-null  object        
 3   pincode       1408041 non-null  int64         
 4   bio_age_5_17  1408041 non-null  int64         
 5   bio_age_17_   1408041 non-null  int64         
 6   month         1408041 non-null  int32         
dtypes: datetime64[ns](1), int32(1), int64(3), object(2)
memory usage: 80.6+ MB


In [110]:
enrol_df.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,month
0,2025-03-02,meghalaya,East Khasi Hills,793121,11,61,37,3
1,2025-03-09,karnataka,Bengaluru Urban,560043,14,33,39,3
2,2025-03-09,uttar pradesh,Kanpur Nagar,208001,29,82,12,3
3,2025-03-09,uttar pradesh,Aligarh,202133,62,29,15,3
4,2025-03-09,karnataka,Bengaluru Urban,560016,14,16,21,3


In [111]:
demo_df.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_,month
0,2025-03-01,uttar pradesh,Gorakhpur,273213,49,529,3
2,2025-03-01,gujarat,Rajkot,360006,65,765,3
4,2025-03-01,rajasthan,Udaipur,313801,45,785,3
5,2025-03-01,rajasthan,Sikar,332028,28,285,3
6,2025-03-01,karnataka,Tumakuru,572201,88,332,3


In [112]:
bio_df.head()

Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,month
0,2025-03-01,haryana,Mahendragarh,123029,280,577,3
1,2025-03-01,bihar,Madhepura,852121,144,369,3
3,2025-03-01,bihar,Bhojpur,802158,256,980,3
4,2025-03-01,tamil nadu,Madurai,625514,271,815,3
5,2025-03-01,maharashtra,Ratnagiri,416702,155,529,3


In [113]:
enrol_agg = enrol_df.groupby(["state", "month"])[["age_0_5", "age_5_17", "age_18_greater"]].sum().reset_index()
demo_agg = demo_df.groupby(["state", "month"])[["demo_age_5_17", "demo_age_17_"]].sum().reset_index()
bio_agg = bio_df.groupby(["state", "month"])[["bio_age_5_17", "bio_age_17_"]].sum().reset_index()

combined_df = enrol_agg.merge(demo_agg, on = ["state", "month"], how = "left").merge(bio_agg, on = ["state", "month"], how = "left")
combined_df.fillna(0, inplace=True)
combined_df.head()

Unnamed: 0,state,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_
0,arunachal pradesh,7,89,352,43,0.0,0.0,4119,4323
1,arunachal pradesh,9,886,882,6,1902.0,9133.0,4855,2885
2,arunachal pradesh,10,387,473,5,952.0,3883.0,4499,1793
3,arunachal pradesh,11,341,395,97,1135.0,5539.0,8558,2076
4,arunachal pradesh,12,254,134,0,942.0,5148.0,6132,1936


In [114]:
combined_df["E"] = combined_df["age_0_5"] + combined_df["age_5_17"] + combined_df["age_18_greater"]
combined_df["DU"] = combined_df["demo_age_5_17"] + combined_df["demo_age_17_"]
combined_df["BU"] = combined_df["bio_age_5_17"] + combined_df["bio_age_17_"]
combined_df["U"] = combined_df["DU"] + combined_df["BU"]
combined_df["T"] = combined_df["E"] + combined_df["U"]
combined_df.head(20)

Unnamed: 0,state,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T
0,arunachal pradesh,7,89,352,43,0.0,0.0,4119,4323,484,0.0,8442,8442.0,8926.0
1,arunachal pradesh,9,886,882,6,1902.0,9133.0,4855,2885,1774,11035.0,7740,18775.0,20549.0
2,arunachal pradesh,10,387,473,5,952.0,3883.0,4499,1793,865,4835.0,6292,11127.0,11992.0
3,arunachal pradesh,11,341,395,97,1135.0,5539.0,8558,2076,833,6674.0,10634,17308.0,18141.0
4,arunachal pradesh,12,254,134,0,942.0,5148.0,6132,1936,388,6090.0,8068,14158.0,14546.0
5,assam,3,587,1105,800,16692.0,185345.0,59101,33830,2492,202037.0,92931,294968.0,297460.0
6,assam,4,11931,9203,4520,348.0,4007.0,105295,34364,25654,4355.0,139659,144014.0,169668.0
7,assam,5,5170,5609,2418,176.0,1252.0,52863,30978,13197,1428.0,83841,85269.0,98466.0
8,assam,6,4878,3401,1455,195.0,2250.0,39419,32157,9734,2445.0,71576,74021.0,83755.0
9,assam,7,30769,11877,5227,4737.0,39711.0,51489,43683,47873,44448.0,95172,139620.0,187493.0


In [115]:
state_monthly_counts = combined_df.groupby(["state", "month"]).agg(total_months = ("month", "count"), active_months = ("T", lambda x : (x > 0).sum())).reset_index()
state_monthly_counts.head()

Unnamed: 0,state,month,total_months,active_months
0,arunachal pradesh,7,1,1
1,arunachal pradesh,9,1,1
2,arunachal pradesh,10,1,1
3,arunachal pradesh,11,1,1
4,arunachal pradesh,12,1,1


In [116]:
state_monthly_counts["zero_months"] = state_monthly_counts["total_months"] - state_monthly_counts["active_months"]
state_monthly_counts["activity_ratio"] = state_monthly_counts["active_months"] / state_monthly_counts["total_months"]
state_monthly_counts["zero_month_ratio"] = state_monthly_counts["zero_months"] / state_monthly_counts["total_months"]

combined_df = combined_df.merge(state_monthly_counts[["state", "month", "activity_ratio", "zero_month_ratio"]], on = ["state", "month"], how = "left")
combined_df.head()

Unnamed: 0,state,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio,zero_month_ratio
0,arunachal pradesh,7,89,352,43,0.0,0.0,4119,4323,484,0.0,8442,8442.0,8926.0,1.0,0.0
1,arunachal pradesh,9,886,882,6,1902.0,9133.0,4855,2885,1774,11035.0,7740,18775.0,20549.0,1.0,0.0
2,arunachal pradesh,10,387,473,5,952.0,3883.0,4499,1793,865,4835.0,6292,11127.0,11992.0,1.0,0.0
3,arunachal pradesh,11,341,395,97,1135.0,5539.0,8558,2076,833,6674.0,10634,17308.0,18141.0,1.0,0.0
4,arunachal pradesh,12,254,134,0,942.0,5148.0,6132,1936,388,6090.0,8068,14158.0,14546.0,1.0,0.0


In [117]:
combined_df = combined_df.merge(state_monthly_counts[["state", "month", "activity_ratio", "zero_month_ratio"]], on = ["state", "month"], how = "left")
combined_df.head()

Unnamed: 0,state,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio_x,zero_month_ratio_x,activity_ratio_y,zero_month_ratio_y
0,arunachal pradesh,7,89,352,43,0.0,0.0,4119,4323,484,0.0,8442,8442.0,8926.0,1.0,0.0,1.0,0.0
1,arunachal pradesh,9,886,882,6,1902.0,9133.0,4855,2885,1774,11035.0,7740,18775.0,20549.0,1.0,0.0,1.0,0.0
2,arunachal pradesh,10,387,473,5,952.0,3883.0,4499,1793,865,4835.0,6292,11127.0,11992.0,1.0,0.0,1.0,0.0
3,arunachal pradesh,11,341,395,97,1135.0,5539.0,8558,2076,833,6674.0,10634,17308.0,18141.0,1.0,0.0,1.0,0.0
4,arunachal pradesh,12,254,134,0,942.0,5148.0,6132,1936,388,6090.0,8068,14158.0,14546.0,1.0,0.0,1.0,0.0


In [118]:
state_volume_metrics = combined_df.groupby(["state"]).agg(avg_monthly_enrolment = ("E", "mean"),
                                               monthly_valatility = ("T", lambda x: x.std(ddof=0) / x.mean() if x.mean() > 0 else 0),
                                               peak_load_ratio = ("T", lambda x: x.max() / x.mean() if x.mean() > 0 else 0)).reset_index()


combined_df = combined_df.merge(state_volume_metrics, on=["state"], how="left")
combined_df.head()

Unnamed: 0,state,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio_x,zero_month_ratio_x,activity_ratio_y,zero_month_ratio_y,avg_monthly_enrolment,monthly_valatility,peak_load_ratio
0,arunachal pradesh,7,89,352,43,0.0,0.0,4119,4323,484,0.0,8442,8442.0,8926.0,1.0,0.0,1.0,0.0,868.8,0.280714,1.385562
1,arunachal pradesh,9,886,882,6,1902.0,9133.0,4855,2885,1774,11035.0,7740,18775.0,20549.0,1.0,0.0,1.0,0.0,868.8,0.280714,1.385562
2,arunachal pradesh,10,387,473,5,952.0,3883.0,4499,1793,865,4835.0,6292,11127.0,11992.0,1.0,0.0,1.0,0.0,868.8,0.280714,1.385562
3,arunachal pradesh,11,341,395,97,1135.0,5539.0,8558,2076,833,6674.0,10634,17308.0,18141.0,1.0,0.0,1.0,0.0,868.8,0.280714,1.385562
4,arunachal pradesh,12,254,134,0,942.0,5148.0,6132,1936,388,6090.0,8068,14158.0,14546.0,1.0,0.0,1.0,0.0,868.8,0.280714,1.385562


In [119]:
state_update_burden = combined_df.groupby(["state"]).agg(avg_monthly_enrollments = ("E", "sum"), avg_monthly_demo_updates = ("DU", "sum"), avg_monthly_bio_updates = ("BU", "sum")).reset_index()

state_update_burden["U"] = state_update_burden["avg_monthly_demo_updates"] + state_update_burden["avg_monthly_bio_updates"]
state_update_burden["biometric_burden"] = state_update_burden["avg_monthly_bio_updates"] / (state_update_burden["avg_monthly_bio_updates"] + state_update_burden["avg_monthly_demo_updates"])
state_update_burden["update_dominant"] = np.where(state_update_burden["U"] > state_update_burden["avg_monthly_enrollments"], 1, 0)
state_update_burden["enrollment_update_balance"] = state_update_burden["avg_monthly_enrollments"] / (state_update_burden["avg_monthly_enrollments"] + state_update_burden["U"])

combined_df = combined_df.merge(state_update_burden[["state", "biometric_burden", "update_dominant", "enrollment_update_balance"]], on=["state"], how="left")
combined_df.head()

Unnamed: 0,state,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio_x,zero_month_ratio_x,activity_ratio_y,zero_month_ratio_y,avg_monthly_enrolment,monthly_valatility,peak_load_ratio,biometric_burden,update_dominant,enrollment_update_balance
0,arunachal pradesh,7,89,352,43,0.0,0.0,4119,4323,484,0.0,8442,8442.0,8926.0,1.0,0.0,1.0,0.0,868.8,0.280714,1.385562,0.58983,1,0.058581
1,arunachal pradesh,9,886,882,6,1902.0,9133.0,4855,2885,1774,11035.0,7740,18775.0,20549.0,1.0,0.0,1.0,0.0,868.8,0.280714,1.385562,0.58983,1,0.058581
2,arunachal pradesh,10,387,473,5,952.0,3883.0,4499,1793,865,4835.0,6292,11127.0,11992.0,1.0,0.0,1.0,0.0,868.8,0.280714,1.385562,0.58983,1,0.058581
3,arunachal pradesh,11,341,395,97,1135.0,5539.0,8558,2076,833,6674.0,10634,17308.0,18141.0,1.0,0.0,1.0,0.0,868.8,0.280714,1.385562,0.58983,1,0.058581
4,arunachal pradesh,12,254,134,0,942.0,5148.0,6132,1936,388,6090.0,8068,14158.0,14546.0,1.0,0.0,1.0,0.0,868.8,0.280714,1.385562,0.58983,1,0.058581


In [120]:
combined_df.drop(["activity_ratio_y", "zero_month_ratio_y"], axis=1, inplace=True)
combined_df.rename(columns={"activity_ratio_x": "activity_ratio", "zero_month_ratio_x": "zero_month_ratio"}, inplace=True)

In [121]:
combined_df = combined_df.groupby(["state"], as_index = False).first()
combined_df.head()

Unnamed: 0,state,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio,zero_month_ratio,avg_monthly_enrolment,monthly_valatility,peak_load_ratio,biometric_burden,update_dominant,enrollment_update_balance
0,arunachal pradesh,7,89,352,43,0.0,0.0,4119,4323,484,0.0,8442,8442.0,8926.0,1.0,0.0,868.8,0.280714,1.385562,0.58983,1,0.058581
1,assam,3,587,1105,800,16692.0,185345.0,59101,33830,2492,202037.0,92931,294968.0,297460.0,1.0,0.0,25577.444444,0.447755,1.711753,0.492518,1,0.103436
2,bihar,3,516,1392,444,95221.0,991478.0,324179,439330,2352,1086699.0,763509,1850208.0,1852560.0,1.0,0.0,67731.666667,0.43651,1.615366,0.504285,1,0.05906
3,chandigarh,4,86,21,10,3917.0,10968.0,4587,3334,117,14885.0,7921,22806.0,22923.0,1.0,0.0,544.6,0.391069,1.680868,0.362163,1,0.039934
4,chhattisgarh,3,115,46,99,18699.0,282767.0,37255,134137,260,301466.0,171392,472858.0,473118.0,1.0,0.0,11468.777778,0.365082,1.770646,0.56911,1,0.021697


In [122]:
def normalize(x):
    maxx, minx = x.max(), x.min()
    if maxx == minx:
        return x * 0 + 0.5
    normalized = (x - minx) / (maxx - minx)
    return normalized

def inverse_normalize(x):
    inversed = 1 - normalize(x)
    return inversed

In [123]:
combined_df["access"] = (combined_df["activity_ratio"] + normalize(combined_df["avg_monthly_enrolment"])) / 2
combined_df["responsiveness"] = normalize(combined_df["U"] / (combined_df["E"] + combined_df["U"]))
combined_df["inclusion"] = normalize((combined_df["age_0_5"] + combined_df["age_5_17"]) / combined_df["E"])
combined_df["stability"] = (inverse_normalize(combined_df["monthly_valatility"]) + inverse_normalize(combined_df["peak_load_ratio"])) / 2
combined_df["visibility"] = combined_df["activity_ratio"]

combined_df["DEI"] = (combined_df["access"] + combined_df["responsiveness"] + combined_df["inclusion"] + combined_df["stability"] + combined_df["visibility"]) / 5
combined_df["ASS"] = (inverse_normalize(combined_df["activity_ratio"]) + inverse_normalize(combined_df["avg_monthly_enrolment"])) / 2
combined_df["UBS"] = (normalize(combined_df["biometric_burden"]) + normalize(combined_df["update_dominant"])) / 2
combined_df["SRS"] = (normalize(combined_df["monthly_valatility"]) + normalize(combined_df["zero_month_ratio"])) / 2

combined_df.head()


Unnamed: 0,state,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio,zero_month_ratio,avg_monthly_enrolment,monthly_valatility,peak_load_ratio,biometric_burden,update_dominant,enrollment_update_balance,access,responsiveness,inclusion,stability,visibility,DEI,ASS,UBS,SRS
0,arunachal pradesh,7,89,352,43,0.0,0.0,4119,4323,484,0.0,8442,8442.0,8926.0,1.0,0.0,868.8,0.280714,1.385562,0.58983,1,0.058581,0.503648,0.715935,0.822084,0.826649,1.0,0.773663,0.746352,0.600162,0.350767
1,assam,3,587,1105,800,16692.0,185345.0,59101,33830,2492,202037.0,92931,294968.0,297460.0,1.0,0.0,25577.444444,0.447755,1.711753,0.492518,1,0.103436,0.612846,0.956238,0.357116,0.631524,1.0,0.711545,0.637154,0.524034,0.461444
2,bihar,3,516,1392,444,95221.0,991478.0,324179,439330,2352,1086699.0,763509,1850208.0,1852560.0,1.0,0.0,67731.666667,0.43651,1.615366,0.504285,1,0.05906,0.799142,0.993495,0.621961,0.663928,1.0,0.815705,0.450858,0.53324,0.453994
3,chandigarh,4,86,21,10,3917.0,10968.0,4587,3334,117,14885.0,7921,22806.0,22923.0,1.0,0.0,544.6,0.391069,1.680868,0.362163,1,0.039934,0.502216,0.973396,0.828839,0.677079,1.0,0.796306,0.747784,0.422055,0.423885
4,chhattisgarh,3,115,46,99,18699.0,282767.0,37255,134137,260,301466.0,171392,472858.0,473118.0,1.0,0.0,11468.777778,0.365082,1.770646,0.56911,1,0.021697,0.550494,0.997269,0.237478,0.671055,1.0,0.691259,0.699506,0.583953,0.406667


In [124]:
combined_df.drop(["access", "responsiveness", "inclusion", "stability", "visibility"], axis=1, inplace=True)
combined_df.head()

Unnamed: 0,state,month,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,E,DU,BU,U,T,activity_ratio,zero_month_ratio,avg_monthly_enrolment,monthly_valatility,peak_load_ratio,biometric_burden,update_dominant,enrollment_update_balance,DEI,ASS,UBS,SRS
0,arunachal pradesh,7,89,352,43,0.0,0.0,4119,4323,484,0.0,8442,8442.0,8926.0,1.0,0.0,868.8,0.280714,1.385562,0.58983,1,0.058581,0.773663,0.746352,0.600162,0.350767
1,assam,3,587,1105,800,16692.0,185345.0,59101,33830,2492,202037.0,92931,294968.0,297460.0,1.0,0.0,25577.444444,0.447755,1.711753,0.492518,1,0.103436,0.711545,0.637154,0.524034,0.461444
2,bihar,3,516,1392,444,95221.0,991478.0,324179,439330,2352,1086699.0,763509,1850208.0,1852560.0,1.0,0.0,67731.666667,0.43651,1.615366,0.504285,1,0.05906,0.815705,0.450858,0.53324,0.453994
3,chandigarh,4,86,21,10,3917.0,10968.0,4587,3334,117,14885.0,7921,22806.0,22923.0,1.0,0.0,544.6,0.391069,1.680868,0.362163,1,0.039934,0.796306,0.747784,0.422055,0.423885
4,chhattisgarh,3,115,46,99,18699.0,282767.0,37255,134137,260,301466.0,171392,472858.0,473118.0,1.0,0.0,11468.777778,0.365082,1.770646,0.56911,1,0.021697,0.691259,0.699506,0.583953,0.406667


In [125]:
combined_df.to_csv(r"D:\UIDAI hackathon\all_states\all_states_analysis.csv", index=False)
final_df = combined_df[["state", "DEI", "ASS", "UBS", "SRS"]]
final_df.to_csv(r"D:\UIDAI hackathon\all_states\all_states_final_scores.csv", index=False)