In [2]:
import pandas as pd

df = pd.read_csv(r"C:\Users\basav\Downloads\cleaned_biometric.csv")

print(df.shape)
print(df.columns)
df.head()

(1861108, 7)
Index(['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_',
       'total_biometric_enrollment'],
      dtype='object')


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_,total_biometric_enrollment
0,01-03-2025,haryana,Mahendragarh,123029,280,577,857
1,01-03-2025,bihar,Madhepura,852121,144,369,513
2,01-03-2025,jammu and kashmir,Punch,185101,643,1091,1734
3,01-03-2025,bihar,Bhojpur,802158,256,980,1236
4,01-03-2025,tamil nadu,Madurai,625514,271,815,1086


In [4]:
df["date"] = pd.to_datetime(df["date"], dayfirst=True)

In [5]:
df["Year"] = df["date"].dt.year
df["Month"] = df["date"].dt.month

In [6]:
# Convert date column
df["date"] = pd.to_datetime(df["date"], dayfirst=True)

# Extract features
df["Year"] = df["date"].dt.year
df["Month"] = df["date"].dt.month

# Sort for time-series correctness
df = df.sort_values(
    by=["state", "district", "date"]
).reset_index(drop=True)

In [8]:
df = df.sort_values(["state", "district", "date"])

df["rolling_24m_avg"] = (
    df.groupby(["state", "district"])["total_biometric_enrollment"]
      .transform(lambda x: x.rolling(24, min_periods=1).mean())
)

In [10]:
df = df.sort_values(["state", "district", "date"])

df["growth_rate"] = (
    df.groupby(["state", "district"])["total_biometric_enrollment"]
      .pct_change()
      .fillna(0)
)

In [12]:
df["biometric_update_risk"] = (
    df["total_biometric_enrollment"] < df["rolling_24m_avg"]
).astype(int)

In [15]:
state_summary = df.groupby("state")["total_biometric_enrollment"].sum().sort_values()
state_summary


state
lakshadweep                                    4820
ladakh                                         5763
andaman and nicobar islands                   20698
sikkim                                        22820
dadra and nagar haveli and daman and diu      39268
goa                                           68397
puducherry                                    69908
arunachal pradesh                             72394
chandigarh                                    74482
meghalaya                                     87626
nagaland                                     109593
mizoram                                      120329
manipur                                      282587
tripura                                      292155
himachal pradesh                             396234
uttarakhand                                  764765
jammu and kashmir                            791647
assam                                        982722
delhi                                       1304362
kerala

In [18]:
age_summary = df[["bio_age_5_17", "bio_age_17_"]].mean()
age_summary

bio_age_5_17    18.390580
bio_age_17_     19.094131
dtype: float64

In [19]:
month_summary = df.groupby("Month")["total_biometric_enrollment"].mean()
month_summary


Month
3     379.092698
4     400.022173
5     358.391595
6     359.205539
7     441.722766
9      15.964956
10     13.815037
11     15.869589
12     16.007687
Name: total_biometric_enrollment, dtype: float64

In [21]:
from sklearn.ensemble import IsolationForest

features = df[["total_biometric_enrollment","rolling_24m_avg","growth_rate"]]

iso = IsolationForest(contamination=0.02, random_state=42)
df["anomaly"] = iso.fit_predict(features)

df["anomaly"] = df["anomaly"].map({1:0, -1:1})


In [25]:
import numpy as np

df["growth_rate"] = (
    df["growth_rate"]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

In [27]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
# Clean features
df["growth_rate"] = (
    df["growth_rate"]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
    .clip(-1, 1)
)
X = df[["rolling_24m_avg", "growth_rate"]]
y = df["total_biometric_enrollment"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

preds = model.predict(X_test)
print("MAE:", mean_absolute_error(y_test, preds))

MAE: 20.305823200279303


In [31]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [34]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import numpy as np
monthly = df.groupby("date")["total_biometric_enrollment"].sum().values
# Create sequences
X, y = [], []
for i in range(len(monthly)-12):
    X.append(monthly[i:i+12])
    y.append(monthly[i+12])
X, y = np.array(X), np.array(y)
X = X.reshape(X.shape[0], X.shape[1], 1)
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(12,1)),
    LSTM(32),
    Dense(1)
])
model.compile(optimizer="adam", loss="mse")
model.fit(X, y, epochs=20, batch_size=32)

Epoch 1/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 26ms/step - loss: 130264989696.0000
Epoch 2/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - loss: 130264678400.0000 
Epoch 3/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 130264293376.0000
Epoch 4/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - loss: 130264064000.0000
Epoch 5/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - loss: 130263851008.0000 
Epoch 6/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 130263646208.0000
Epoch 7/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 130263449600.0000 
Epoch 8/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - loss: 130263220224.0000
Epoch 9/20
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 130262933504.0000
Epoch 10/20
[1m

<keras.src.callbacks.history.History at 0x27f1ef85e50>