In [1]:
import pandas as pd
pd.set_option('display.max_columns', 50)
import janitor
from utilities.utils import get_fulldata
from indicate import transliterate

# Columns to load. '6', '7' and '8' are the raw area fields that are combined
# into a single acreage figure below.
usecols = [
    'name_of_ryot', 'residence', 'district', 'division', 'caste',
    'mouza', 'account_no',
    '6', '7', '8',
]

# tt_area_acre = '6' + '7'/100 + '8'*2.4711
# NOTE(review): presumably '6' is whole acres, '7' is hundredths of an acre and
# '8' is hectares (2.4711 ~ acres per hectare) -- confirm against the data source.
df = get_fulldata(usecols=usecols).assign(
    tt_area_acre=lambda frame: frame['6'] + frame['7'] / 100 + frame['8'] * 2.4711
)
df.shape

(41871025, 11)

In [2]:
# Peek at the first three records to sanity-check the loaded columns.
df.head(n=3)

Unnamed: 0,name_of_ryot,residence,caste,district,division,mouza,account_no,6,7,8,tt_area_acre
0,मुसमात देवकी ओझा,"जमुआ,,,",,अररिया,अररिया,फुलवाड़ी,701010001000008,8.0,52.5,0.0,8.525
1,म.तजमुल हुसैन,"दहगामा,,,",,अररिया,अररिया,फुलवाड़ी,701010001000003,0.0,56.0,0.0,0.56
2,म.तजमुल हुसैन,"दहगामा,,,",,अररिया,अररिया,फुलवाड़ी,701010001000003,0.0,0.0,0.0,0.0


In [3]:
# Number of distinct account numbers (12,127,297 in the recorded run).
df["account_no"].nunique()

12127297

### Residences

In [4]:
# Number of distinct raw residence strings.
df["residence"].nunique()

830567

In [5]:
# Print up to 10 distinct residences found in the first 100 records, each
# transliterated from Hindi to English and followed by the original string.
# Iterating over the values themselves (instead of looking up labels 0..9)
# avoids a KeyError when the sample holds fewer than 10 distinct residences.
sample_residences = df["residence"].head(100).drop_duplicates().head(10)
for residence in sample_residences:
    residence_en = transliterate.hindi2english(residence)
    print(f"{residence_en} ({residence})")

jamua (जमुआ,,,)
dahgama (दहगामा,,,)
pothiya (पोठीया,,,)
ramangar (रामनगर,,,)
 (,,,)
pothi (पोठीय,,,)
phulwari (फुलवारी,,,)
phulwari (फुलवाड़ी,,,)
belwari (बेलवारी,,,)
bailvari (बैलवारी,,,)


### Districts

In [6]:
# List every distinct district (in order of first appearance) as
# "<English transliteration> (<original Hindi>)".
for district_hi in df["district"].unique():
    district_en = transliterate.hindi2english(district_hi)
    print(f"{district_en} ({district_hi})")

araria (अररिया)
araval (अरवल)
aurangabad (औरंगाबाद)
katihar (कटिहार)
kishenganj (किशनगंज)
camor (कैमूर)
khagriya (खगड़िया)
gaya (गया)
gopalganj (गोपालगंज)
jamui (जमुई)
jahanabad (जहानाबाद)
darbhanga (दरभंगा)
navada (नवादा)
nalanda (नालंदा)
patna (पटना)
pashchim champaran (पश्चिम चंपारण)
purniyaaayaaon (पूर्णियॉं)
purvi champaran (पूर्वी चम्पारण)
bucksre (बक्सर)
banka (बांका)
begusaray (बेगुसराए)
bhagalpur (भागलपुर)
bhojpur (भोजपुर)
madhubani (मधुबनी)
madhepura (मधेपुरा)
munger (मुंगेर)
muzfarpur (मुज़फ्फरपुर)
rohtas (रोहतास)
lakhisarai (लखीसराय)
vaishali (वैशाली)
shivahar (शिवहर)
shekhpura (शेखपुरा)
samastipur (समस्तीपुर)
saharsa (सहरसा)
saran (सारण)
sivan (सिवान)
sitamadhi (सीतामढ़ी)
supaul (सुपौल)


### Divisions

In [7]:
# Number of distinct division names.
df["division"].nunique()

505

In [8]:
# Print up to 10 distinct divisions (in order of first appearance), each
# transliterated from Hindi to English and followed by the original string.
# Only the `division` column is de-duplicated, and the loop walks the values
# directly, so it cannot raise a KeyError when fewer than 10 divisions exist.
for division in df["division"].drop_duplicates().head(10):
    division_en = transliterate.hindi2english(division)
    print(f"{division_en} ({division})")

araria (अररिया)
jokihat (जोकीहाट)
palasi (पलासी)
sikati (सिकटी)
kursakanta (कुर्साकांटा)
foribisgangen (फारिबिसगंज)
raniganj (रानीगंज)
narpatganj (नरपतगंज)
bhargama (भरगामा)
araval (अरवल)


### Mouzas

In [9]:
# Number of distinct mouza names.
df["mouza"].nunique()

26052

In [10]:
# Print up to 10 distinct mouzas (in order of first appearance), each
# transliterated from Hindi to English and followed by the original string.
# Only the `mouza` column is de-duplicated, and the loop walks the values
# directly, so it cannot raise a KeyError when fewer than 10 mouzas exist.
for mouza in df["mouza"].drop_duplicates().head(10):
    mouza_en = transliterate.hindi2english(mouza)
    print(f"{mouza_en} ({mouza})")

phulwari (फुलवाड़ी)
kismat jamua (किस्मत जमुआ)
gilhwari (गिलहवाड़ी)
khamgamaga (खमगढ़ा)
jamua (जमुआ)
sharanpur (शरणपुर)
damada jagir (दमड़ा जागीर)
dabhara (दभड़ा)
tarabadi (ताराबाड़ी)
jitwarpur (जितवारपुर)


### Castes

In [11]:
# Record count per raw caste label, largest first. Labels are uncleaned, so
# spelling variants of the same caste are counted as separate entries.
# Rows with a missing caste are left out (groupby skips NaN keys by default).
df_castes = (
    df
    .groupby('caste').size()
    # pandas-native naming of the size column; no pyjanitor rename needed
    .reset_index(name="count")
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)

# Top 25 castes by record count, with an English transliteration of each label.
df_top25_castes = (
    df_castes
    .head(25)
    .assign(caste_eng=lambda top: top["caste"].apply(transliterate.hindi2english))
)
print(f"Number of unique castes (from raw/uncleaned record) = {len(df_castes):,}")
df_top25_castes

Number of unique castes (from raw/uncleaned record) = 76,517


Unnamed: 0,caste,count,caste_eng
0,यादव,4500844,yadavas
1,राजपुत,2151764,rajput
2,ब्राह्मण,1293878,brahmins
3,कुर्मी,1225727,kurmi
4,शेख,1039915,shekh
5,तेली,919487,teli
6,मुसलमान,818864,muslims
7,भूमिहार,780578,bhumihar
8,भुमिहार,753912,bhumihar
9,राजपूत,730700,rajput


### Zero & negative land areas

In [12]:
# Number of records whose total area is zero or negative
# (2,973,115 in the recorded run).
int((df["tt_area_acre"] <= 0).sum())

2973115

In [13]:
# Number of accounts whose area, summed over all of their records, is zero or
# negative (95,553 in the recorded run).
int(df.groupby("account_no")["tt_area_acre"].sum().le(0).sum())

95553

In [14]:
# import winsound
# frequency = 2500  # Set Frequency To 2500 Hertz
# duration = 1000  # Set Duration To 1000 ms == 1 second
# winsound.Beep(frequency, duration)