In [1]:
import pandas as pd

# Show up to 50 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 50)
import warnings

# NOTE(review): `janitor` is never referenced by name in the visible cells;
# presumably it is imported for its side effect of adding DataFrame methods
# such as `rename_columns` / `remove_columns` / `rename_column` — confirm.
import janitor
from indicate import transliterate

from utilities.utils import get_fulldata

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Columns requested from the loader. "6", "7" and "8" are raw headers that are
# given descriptive names right after loading.
usecols = [
    "name_of_ryot",
    "residence",
    "district",
    "division",
    "caste",
    "mouza",
    "account_no",
    "6",
    "7",
    "8",
]
area_column_names = {"6": "acres", "7": "decimals", "8": "hectare"}

# tt_area_acre = acres + decimals / 100, computed per row.
df = (
    get_fulldata(usecols=usecols)
    .rename_columns(new_column_names=area_column_names)
    .assign(tt_area_acre=lambda frame: frame["acres"] + frame["decimals"] / 100)
)
df.shape

  from pkg_resources import resource_filename
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
2026-01-03 11:22:21.719156: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-01-03 11:22:21.731761: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2026-01-03 11:22:21.827093: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-03 11:22:21.827170: E tensorflow/compiler/xla/stream_executor/cuda/c

(41871025, 11)

In [2]:
# Preview the first three rows of the loaded frame.
df.head(3)

Unnamed: 0,name_of_ryot,residence,caste,district,division,mouza,account_no,acres,decimals,hectare,tt_area_acre
0,कपिलदेव पाठक,"निजग्राम,,",ब्रहामण,वैशाली,वैशाली,चकपिताम्बर,1801010001000002,0.0,66.0,0.0,0.66
1,कपिलदेव पाठक,"निजग्राम,,",ब्रहामण,वैशाली,वैशाली,चकपिताम्बर,1801010001000002,0.0,8.0,0.0,0.08
2,कपिलदेव पाठक,"निजग्राम,,",ब्रहामण,वैशाली,वैशाली,चकपिताम्बर,1801010001000002,0.0,6.0,0.0,0.06


In [3]:
# Number of distinct account numbers, i.e. account holders, in the records.
df["account_no"].nunique()

12127297

## Land area

In [4]:
# Count the rows whose `acres` value is negative.
int((df["acres"] < 0).sum())

0

In [5]:
# Count the rows whose `decimals` value is negative.
int((df["decimals"] < 0).sum())

14

In [6]:
# Count the rows whose derived total area is negative.
int((df["tt_area_acre"] < 0).sum())

14

In [7]:
# Count the rows whose derived total area is exactly zero.
int((df["tt_area_acre"] == 0).sum())

3216631

In [8]:
# Shape of the rows where both area parts are non-negative and the total is positive.
df.query("acres >= 0 and decimals >= 0 and tt_area_acre > 0").shape

(38654380, 11)

In [9]:
# Shape of the rows with a strictly positive total area.
df.loc[df["tt_area_acre"] > 0].shape

(38654380, 11)

In [10]:
# Distinct accounts having at least one row with a strictly positive total area.
df.loc[df["tt_area_acre"] > 0, "account_no"].nunique()

11931739

In [11]:
# From here on, render floats with thousands separators and six decimals.
pd.set_option("display.float_format", "{:,.6f}".format)

# Percentile grid weighted towards the upper tail of the distribution.
tail_percentiles = [0.25, 0.5, 0.75, 0.95, 0.99, 0.995, 0.999, 0.9999]

# Summary statistics for positive-area rows; the account identifier is left out.
(
    df.query("tt_area_acre>0")
    .drop(columns=["account_no"])
    .describe(percentiles=tail_percentiles)
)

Unnamed: 0,acres,decimals,hectare,tt_area_acre
count,38654380.0,38654380.0,38654380.0,38654380.0
mean,0.634432,24.472577,5.936855,0.879158
std,1466.939384,2807.813505,781.182934,1467.210739
min,0.0,0.0,0.0,0.0
25%,0.0,6.0,0.0,0.06
50%,0.0,13.0,0.0,0.14
75%,0.0,29.0,0.0,0.325
95%,1.0,70.0,0.0324,1.13
99%,3.0,92.2,0.61,3.35
99.5%,5.0,96.0,49.0,5.4


In [12]:
# Rows whose total area exceeds 10,000 acres (the literal `100_00` is the integer 10000).
int((df["tt_area_acre"] > 10_000).sum())

91

In [23]:
# Percentage of positive-area rows whose total area exceeds 10,000 acres.
# The numerator is computed from the data instead of being pasted in from an
# earlier cell's output, so the figure cannot go stale when the data changes
# or when cells are run in a different order.
n_rows_over_10k_acres = len(df.query("tt_area_acre > 10_000"))
100 * n_rows_over_10k_acres / len(df.query("tt_area_acre > 0"))

0.00023541963420445496

In [22]:
# 99.98th percentile of total area among rows with a strictly positive area.
df.loc[df["tt_area_acre"] > 0, "tt_area_acre"].quantile(0.9998)

74.0

In [14]:
# Distinct accounts with at least one row above 10,000 acres (`100_00` == 10000).
df.loc[df["tt_area_acre"] > 10_000, "account_no"].nunique()

81

In [24]:
# Percentage of positive-area accounts that have at least one row above
# 10,000 acres. Both counts are derived from the data; the numerator was
# previously a number copied by hand from another cell's output.
positive_area_accounts = df.query("tt_area_acre > 0")["account_no"].nunique()
accounts_over_10k_acres = df.query("tt_area_acre > 10_000")["account_no"].nunique()
100 * (accounts_over_10k_acres / positive_area_accounts)

0.0006788616479123454

### Residences

In [4]:
# Number of distinct raw residence strings.
df["residence"].nunique()

830567

In [5]:
# Print up to ten distinct residence strings found in the first 100 rows, each
# preceded by the result of `transliterate.hindi2english`.
_df = df.head(100).drop_duplicates("residence", ignore_index=True)
# Iterate over the values themselves: looking up row labels 0..9 raises
# KeyError whenever the 100-row sample holds fewer than ten distinct residences.
for residence in _df["residence"].head(10):
    english_translated = transliterate.hindi2english(residence)
    print(f"{english_translated} ({residence})")

jamua (जमुआ,,,)
dahgama (दहगामा,,,)
pothiya (पोठीया,,,)
ramangar (रामनगर,,,)
 (,,,)
pothi (पोठीय,,,)
phulwari (फुलवारी,,,)
phulwari (फुलवाड़ी,,,)
belwari (बेलवारी,,,)
bailvari (बैलवारी,,,)


### Districts

In [6]:
# Print every distinct district value next to its `hindi2english` rendering.
for district in df["district"].unique():
    print(f"{transliterate.hindi2english(district)} ({district})")

araria (अररिया)
araval (अरवल)
aurangabad (औरंगाबाद)
katihar (कटिहार)
kishenganj (किशनगंज)
camor (कैमूर)
khagriya (खगड़िया)
gaya (गया)
gopalganj (गोपालगंज)
jamui (जमुई)
jahanabad (जहानाबाद)
darbhanga (दरभंगा)
navada (नवादा)
nalanda (नालंदा)
patna (पटना)
pashchim champaran (पश्चिम चंपारण)
purniyaaayaaon (पूर्णियॉं)
purvi champaran (पूर्वी चम्पारण)
bucksre (बक्सर)
banka (बांका)
begusaray (बेगुसराए)
bhagalpur (भागलपुर)
bhojpur (भोजपुर)
madhubani (मधुबनी)
madhepura (मधेपुरा)
munger (मुंगेर)
muzfarpur (मुज़फ्फरपुर)
rohtas (रोहतास)
lakhisarai (लखीसराय)
vaishali (वैशाली)
shivahar (शिवहर)
shekhpura (शेखपुरा)
samastipur (समस्तीपुर)
saharsa (सहरसा)
saran (सारण)
sivan (सिवान)
sitamadhi (सीतामढ़ी)
supaul (सुपौल)


### Divisions

In [7]:
# Number of distinct division values.
df["division"].nunique()

505

In [8]:
# Print up to ten distinct division values (in order of first appearance), each
# preceded by the result of `transliterate.hindi2english`.
_df = df.drop_duplicates("division", ignore_index=True)
# Iterate over the values directly, as the district cell does, so the cell
# cannot raise KeyError when fewer than ten distinct divisions are present.
for division in _df["division"].head(10):
    english_translated = transliterate.hindi2english(division)
    print(f"{english_translated} ({division})")

araria (अररिया)
jokihat (जोकीहाट)
palasi (पलासी)
sikati (सिकटी)
kursakanta (कुर्साकांटा)
foribisgangen (फारिबिसगंज)
raniganj (रानीगंज)
narpatganj (नरपतगंज)
bhargama (भरगामा)
araval (अरवल)


### Mouzas

In [9]:
# Number of distinct mouza values.
df["mouza"].nunique()

26052

In [10]:
# Print up to ten distinct mouza values (in order of first appearance), each
# preceded by the result of `transliterate.hindi2english`.
_df = df.drop_duplicates("mouza", ignore_index=True)
# Iterate over the values directly, as the district cell does, so the cell
# cannot raise KeyError when fewer than ten distinct mouzas are present.
for mouza in _df["mouza"].head(10):
    english_translated = transliterate.hindi2english(mouza)
    print(f"{english_translated} ({mouza})")

phulwari (फुलवाड़ी)
kismat jamua (किस्मत जमुआ)
gilhwari (गिलहवाड़ी)
khamgamaga (खमगढ़ा)
jamua (जमुआ)
sharanpur (शरणपुर)
damada jagir (दमड़ा जागीर)
dabhara (दभड़ा)
tarabadi (ताराबाड़ी)
jitwarpur (जितवारपुर)


### Castes

In [11]:
# Record count per raw caste string, most frequent first.
df_castes = (
    df.groupby("caste")
    .size()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)

# Only the 25 most frequent spellings are passed through `hindi2english`.
df_top25_castes = df_castes.head(25).assign(
    caste_eng=lambda top25: top25["caste"].apply(transliterate.hindi2english)
)
print(f"Number of unique castes (from raw/uncleaned record) = {len(df_castes):,}")
df_top25_castes

Number of unique castes (from raw/uncleaned record) = 76,517


Unnamed: 0,caste,count,caste_eng
0,यादव,4500844,yadavas
1,राजपुत,2151764,rajput
2,ब्राह्मण,1293878,brahmins
3,कुर्मी,1225727,kurmi
4,शेख,1039915,shekh
5,तेली,919487,teli
6,मुसलमान,818864,muslims
7,भूमिहार,780578,bhumihar
8,भुमिहार,753912,bhumihar
9,राजपूत,730700,rajput


In [14]:
# Disabled cell: every line below is commented out, so nothing runs.
# NOTE(review): presumably an audible "run finished" signal; `winsound` is a
# Windows-only standard-library module. Confirm it is still wanted, else delete.
# import winsound
# frequency = 2500  # Set Frequency To 2500 Hertz
# duration = 1000  # Set Duration To 1000 ms == 1 second
# winsound.Beep(frequency, duration)