<a href="https://colab.research.google.com/github/itsdakshjain/Data-Hackathon/blob/main/census%20data/Census_Model_predicting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression


In [None]:
def fit_population_model(df_state, value_col):
    """Fit a quadratic-in-year trend to the natural log of one population column.

    Parameters
    ----------
    df_state : pandas.DataFrame
        Rows for a single region. Must contain a ``"Census_Year"`` column and
        the column named by ``value_col``.
    value_col : str
        Name of the numeric column to model.

    Returns
    -------
    LinearRegression
        Estimator fitted on the features ``[year, year ** 2]`` with target
        ``log(value)``.

    Raises
    ------
    ValueError
        If no row has a positive, non-missing value in ``value_col``.
    """
    df_state = df_state.dropna(subset=[value_col])
    # log() is undefined for zero and negative values: they would put -inf/NaN
    # into the target and abort the fit, so treat them like missing entries.
    df_state = df_state[df_state[value_col] > 0]
    if df_state.empty:
        raise ValueError(
            f"no positive, non-missing values in column {value_col!r} to fit"
        )

    years = df_state["Census_Year"].values.reshape(-1, 1)
    years2 = years ** 2
    X = np.hstack([years, years2])

    # Model the log so that predictions are positive after exponentiation.
    y = np.log(df_state[value_col].values)

    model = LinearRegression()
    model.fit(X, y)

    return model


In [None]:
def predict_population(model, years):
    """Evaluate a fitted log-scale trend model and return values on the original scale.

    Parameters
    ----------
    model : object with a ``predict(X)`` method
        Called with a two-column array holding ``year`` and ``year ** 2``.
    years : array-like
        Years to evaluate; reshaped into a single column.

    Returns
    -------
    numpy.ndarray
        ``np.exp`` applied to whatever ``model.predict`` returns.
    """
    year_col = np.array(years).reshape(-1, 1)
    design = np.concatenate((year_col, year_col ** 2), axis=1)
    return np.exp(model.predict(design))


In [None]:
# Load the cleaned census table (path is relative to the working directory).
df = pd.read_csv("census_clean.csv")

# Coerce the population columns to numeric; unparseable cells become NaN.
num_cols = ["Persons", "Males", "Females"]
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")

df["Census_Year"] = df["Census_Year"].astype(int)

# Regions in order of first appearance in the file.
states = df["Region"].unique()

# Fitted trend models, keyed as models[region][column].
models = {}
for state in states:
    df_state = df.loc[df["Region"] == state]
    models[state] = {
        col: fit_population_model(df_state, col) for col in num_cols
    }


In [None]:
# Display the nested mapping of fitted estimators (region -> column -> model).
models

{'INDIA': {'Persons': LinearRegression(),
  'Males': LinearRegression(),
  'Females': LinearRegression()},
 'Jammu & Kashmir': {'Persons': LinearRegression(),
  'Males': LinearRegression(),
  'Females': LinearRegression()},
 'Himachal Pradesh': {'Persons': LinearRegression(),
  'Males': LinearRegression(),
  'Females': LinearRegression()},
 'Punjab': {'Persons': LinearRegression(),
  'Males': LinearRegression(),
  'Females': LinearRegression()},
 'Chandigarh': {'Persons': LinearRegression(),
  'Males': LinearRegression(),
  'Females': LinearRegression()},
 'Uttarakhand': {'Persons': LinearRegression(),
  'Males': LinearRegression(),
  'Females': LinearRegression()},
 'Haryana': {'Persons': LinearRegression(),
  'Males': LinearRegression(),
  'Females': LinearRegression()},
 'NCT OF Delhi': {'Persons': LinearRegression(),
  'Males': LinearRegression(),
  'Females': LinearRegression()},
 'Rajasthan': {'Persons': LinearRegression(),
  'Males': LinearRegression(),
  'Females': LinearRegres

In [None]:
# NOTE(review): nothing is accumulated here — each iteration overwrites `mask`
# and `years`, so after the loop they hold only the last region in `states`.
# Looks like a leftover inspection cell; confirm before relying on it.
for state in states:
    mask = df["Region"] == state
    years = df.loc[mask, "Census_Year"]

In [None]:
# Show the current value of `years` (the census years of one region's rows).
years

420    1901
421    1911
422    1921
423    1931
424    1941
425    1951
426    1961
427    1971
428    1981
429    1991
430    2001
431    2011
Name: Census_Year, dtype: int64

In [None]:
# Fill gaps in the census table with each region's fitted trend model.
for state in states:
    mask = df["Region"] == state
    years = df.loc[mask, "Census_Year"]

    for col in ["Persons", "Males", "Females"]:
        missing = df.loc[mask, col].isna()
        if not missing.any():
            continue  # nothing to impute for this region/column

        preds = predict_population(models[state][col], years[missing])
        # Address the gaps by row label; astype(int) truncates the estimate.
        gap_index = missing.index[missing]
        df.loc[gap_index, col] = preds.astype(int)


In [None]:
# Years to estimate. np.arange excludes the stop value, so this is 2007..2025.
target_years = np.arange(2007, 2026)


In [None]:
# Build one record per (region, year) with the three predicted population columns.
rows = []

for state in states:
    region_models = models[state]
    for year in target_years:
        record = {"Region": state, "Census_Year": year}
        for col in ("Persons", "Males", "Females"):
            # Predict a single year and truncate the estimate to an integer.
            record[col] = int(predict_population(region_models[col], [year])[0])
        rows.append(record)


In [None]:
# Assemble the list of per-year records into a single table.
df_annual = pd.DataFrame(rows)


In [None]:
# Summary statistics of the numeric columns. `describe` has to be called:
# the bare attribute only displays the bound method's repr.
df_annual.describe()

<bound method NDFrame.describe of                         Region  Census_Year     Persons      Males    Females
0                        INDIA         2007  1180401246  609793777  570637982
1                        INDIA         2008  1211539922  625810003  585762351
2                        INDIA         2009  1243746004  642370388  601409908
3                        INDIA         2010  1277060777  659495784  617601081
4                        INDIA         2011  1311527295  677207930  634357182
..                         ...          ...         ...        ...        ...
679  Andaman & Nicobar Islands         2021      974713     505039     458926
680  Andaman & Nicobar Islands         2022     1033260     534266     486649
681  Andaman & Nicobar Islands         2023     1095811     565448     516228
682  Andaman & Nicobar Islands         2024     1162665     598729     547797
683  Andaman & Nicobar Islands         2025     1234146     634264     581501

[684 rows x 5 columns]>

In [None]:
# Write the annual estimates to a CSV in the working directory, omitting the index.
df_annual.to_csv("census_annual_2007_onwards.csv", index=False)
print("✅ Annual population dataset created")


✅ Annual population dataset created


In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.figure()
