In [1]:
import re
import pandas as pd

pd.set_option("display.max_columns", 50)
import janitor
import sidetable
from utilities.utils import get_fulldata

# Raw column names requested from the land-record loader, and the names they
# are given below (paired by position) so they can serve as merge keys.
usecols = ["district", "division", "mouza"]
geo_slugs = ["zilla_district_lr", "anchal_block_lr", "mauja_village_lr"]


# Bihar land record data, reduced to one row per (district, block, village)
df = (
    get_fulldata(usecols=usecols)
    # ----------------------------------------------------------------------
    # Rename the raw columns to the three geo slugs for consistency
    .rename_column("district", "zilla_district_lr")
    .rename_column("division", "anchal_block_lr")
    .rename_column("mouza", "mauja_village_lr")
    # ----------------------------------------------------------------------
    # Trim leading/trailing whitespace so it cannot break the merge keys
    .pipe(
        lambda frame: frame.assign(
            **{slug: frame[slug].str.strip() for slug in geo_slugs}
        )
    )
    # ----------------------------------------------------------------------
    # Keep each (district, block, village) combination once
    .drop_duplicates(subset=geo_slugs)
    .reset_index(drop=True)
)
print(df.shape)
df.head()



(35628, 3)


Unnamed: 0,zilla_district_lr,anchal_block_lr,mauja_village_lr
0,अररिया,अररिया,फुलवाड़ी
1,अररिया,अररिया,किस्मत जमुआ
2,अररिया,अररिया,गिलहवाड़ी
3,अररिया,अररिया,खमगढ़ा
4,अररिया,अररिया,जमुआ


In [2]:
# Number of unique villages per district (Bihar has 38 districts), with
# percentage and cumulative columns added by sidetable
df.stb.freq(cols=["zilla_district_lr"])

Unnamed: 0,zilla_district_lr,count,percent,cumulative_count,cumulative_percent
0,गया,2833,7.951611,2833,7.951611
1,रोहतास,1905,5.346918,4738,13.298529
2,मुज़फ्फरपुर,1737,4.875379,6475,18.173908
3,बांका,1673,4.695745,8148,22.869653
4,भागलपुर,1642,4.608735,9790,27.478388
5,कैमूर,1618,4.541372,11408,32.01976
6,वैशाली,1503,4.218592,12911,36.238352
7,गोपालगंज,1472,4.131582,14383,40.369934
8,जमुई,1452,4.075446,15835,44.44538
9,औरंगाबाद,1439,4.038958,17274,48.484338


In [5]:
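# Disabled on purpose: tabulation of villages by block. There are more than
# 500 blocks, so the table has 505 rows; uncomment the two lines below to
# display all of them.
# pd.set_option("display.max_rows", 505)
# df.stb.freq(["anchal_block_lr"])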

In [6]:
# Get crosswalk data (from Aaditya)
df_xwalk = (
    pd.read_stata("../data/br_lr_census_crosswalk.dta")
    # ======================================================================
    # Remove the codes in brackets (e.g. 'जहानाबाद [001]' --> 'जहानाबाद').
    # The vectorised .str.replace leaves missing block names as NaN, whereas
    # a row-wise re.sub would raise TypeError on the first missing value.
    .assign(
        anchal_block_lr=lambda df: df["anchal_block_lr"].str.replace(
            r"\[.*?\]", "", regex=True
        )
    )
    # Make sure no whitespaces screw up the merge
    .assign(**{col: lambda df, k=col: df[k].str.strip() for col in geo_slugs})
    # ======================================================================
    # Village is not the lowest geographical unit in the crosswalk, so the
    # same (district, block, village) key can repeat. Keep one row per key;
    # with the default keep="first" the remaining columns come from the first
    # occurrence of each key.
    .drop_duplicates(geo_slugs)
    .reset_index(drop=True)
)
print(df_xwalk.shape)
df_xwalk.head()

(40957, 22)


Unnamed: 0,zilla_district_lr,zilla_district_lr_code,anumandal_subdiv_lr,anumandal_subdiv_lr_code,anchal_block_lr,anchal_circle_lr_code,mauja_village_lr,revenue_station_no_lr,total_kasara_lr,total_accounts_lr,statecode_census,statename_census,districtcode_census,districtname_census,subdistrictcode_census,subdistrictname_census,cdblockcode_census,cdblockname_census,grampanchayatcode_census,grampanchayatname_census,villagecode_census,villagename_census
0,जहानाबाद,33,जहानाबाद,1,जहानाबाद,1,अंजनी,407,235,182,10,BIHAR,239,Jehanabad,1536,Jehanabad,524,Jehanabad,3,Mandevigaha,260211,Anjani
1,जहानाबाद,33,जहानाबाद,1,जहानाबाद,1,अंजनीचक,405,27,19,10,BIHAR,239,Jehanabad,1536,Jehanabad,524,Jehanabad,3,Mandevigaha,260209,Anjanichak
2,जहानाबाद,33,जहानाबाद,1,जहानाबाद,1,अदलुचक,381,92,82,10,BIHAR,239,Jehanabad,1536,Jehanabad,524,Jehanabad,10,Pandui,260196,Adluchak
3,जहानाबाद,33,जहानाबाद,1,जहानाबाद,1,अमैन,366,765,614,10,BIHAR,239,Jehanabad,1536,Jehanabad,524,Jehanabad,11,Amain,260182,Amain
4,जहानाबाद,33,जहानाबाद,1,जहानाबाद,1,अरसठ,28,14,7,10,BIHAR,239,Jehanabad,1536,Jehanabad,524,Jehanabad,1,Surungapur Bhavanichak,260142,Arsath


In [7]:
# Outer-join the Bihar land-record villages with the crosswalk on the three
# geo slugs. validate="1:1" makes pandas raise if either side has repeated
# keys, and the merge indicator is relabelled so each source is named.
df_merged = df.merge(
    df_xwalk,
    on=geo_slugs,
    how="outer",
    validate="1:1",
    indicator=True,
).assign(
    _merge=lambda merged: merged["_merge"].map(
        {
            "both": "both",
            "right_only": "Crosswalk only (file from Aaditya)",
            "left_only": "Bihar LR data only",
        }
    )
)
df_merged.head()

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


Unnamed: 0,zilla_district_lr,anchal_block_lr,mauja_village_lr,zilla_district_lr_code,anumandal_subdiv_lr,anumandal_subdiv_lr_code,anchal_circle_lr_code,revenue_station_no_lr,total_kasara_lr,total_accounts_lr,statecode_census,statename_census,districtcode_census,districtname_census,subdistrictcode_census,subdistrictname_census,cdblockcode_census,cdblockname_census,grampanchayatcode_census,grampanchayatname_census,villagecode_census,villagename_census,_merge
0,अररिया,अररिया,फुलवाड़ी,7.0,अररिया,1.0,1.0,168,1147.0,828.0,10,BIHAR,209,Araria,1116,Araria,104,Araria,2,Jamua,221808,Phulbari,both
1,अररिया,अररिया,किस्मत जमुआ,7.0,अररिया,1.0,1.0,169,394.0,309.0,10,BIHAR,209,Araria,1116,Araria,104,Araria,2,Jamua,221809,Kismat Jamua,both
2,अररिया,अररिया,गिलहवाड़ी,7.0,अररिया,1.0,1.0,170,704.0,383.0,10,BIHAR,209,Araria,1116,Araria,104,Araria,5,Kismat Khawaspur,221810,Gelhabari,both
3,अररिया,अररिया,खमगढ़ा,7.0,अररिया,1.0,1.0,171,1450.0,802.0,10,BIHAR,209,Araria,1116,Araria,104,Araria,2,Jamua,221811,Khamgara,both
4,अररिया,अररिया,जमुआ,7.0,अररिया,1.0,1.0,172,2498.0,1829.0,10,BIHAR,209,Araria,1116,Araria,104,Araria,2,Jamua,221812,Jamua,both


In [8]:
# How many keys matched in both tables versus only one of them
df_merged.stb.freq(cols=["_merge"], style=True)

Unnamed: 0,_merge,count,percent,cumulative_count,cumulative_percent
0,both,27720,56.73%,27720,56.73%
1,Crosswalk only (file from Aaditya),13237,27.09%,40957,83.82%
2,Bihar LR data only,7908,16.18%,48865,100.00%


In [9]:
# Rows whose (district, block, village) key exists only in the Bihar land
# record table (the left side of the merge) -- about 7.9k of them
df_merged.loc[df_merged["_merge"] == "Bihar LR data only"]

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  common = np.find_common_type([values.dtype, comps_array.dtype], [])


Unnamed: 0,zilla_district_lr,anchal_block_lr,mauja_village_lr,zilla_district_lr_code,anumandal_subdiv_lr,anumandal_subdiv_lr_code,anchal_circle_lr_code,revenue_station_no_lr,total_kasara_lr,total_accounts_lr,statecode_census,statename_census,districtcode_census,districtname_census,subdistrictcode_census,subdistrictname_census,cdblockcode_census,cdblockname_census,grampanchayatcode_census,grampanchayatname_census,villagecode_census,villagename_census,_merge
1421,औरंगाबाद,कुटुम्‍बा,खैरा नोनिया बिगहा,,,,,,,,,,,,,,,,,,,,Bihar LR data only
3978,कैमूर,भभुआ,करमी चक,,,,,,,,,,,,,,,,,,,,Bihar LR data only
3980,कैमूर,भभुआ,खैरा खूर्द,,,,,,,,,,,,,,,,,,,,Bihar LR data only
3987,कैमूर,भभुआ,औरंईया देव,,,,,,,,,,,,,,,,,,,,Bihar LR data only
6021,गया,नगर,नगरपालिका वार्ड न0-2,,,,,,,,,,,,,,,,,,,,Bihar LR data only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33508,समस्तीपुर,हसनपुर,नरपतपुर नगर उर्फ चन्‍द्रपुर,,,,,,,,,,,,,,,,,,,,Bihar LR data only
33661,समस्तीपुर,सिंघिया,खराजी लक्षमनियॉ,,,,,,,,,,,,,,,,,,,,Bihar LR data only
33963,सारण,छपरा,बाजित पुर,,,,,,,,,,,,,,,,,,,,Bihar LR data only
34024,सारण,बनियापुर,छतवॉं खुर्द,,,,,,,,,,,,,,,,,,,,Bihar LR data only


In [10]:
# Exact-match check: is this complete village name present in the crosswalk?
"नगरपालीका वार्ड0 10" in set(df_xwalk["mauja_village_lr"])

False

In [11]:
# Substring probe: does any crosswalk village name contain this fragment?
# Membership in .unique() only matches a complete village name, so a
# shortened name could never hit; .str.contains does a literal
# (regex=False) substring search and treats missing names as non-matches.
bool(
    df_xwalk["mauja_village_lr"]
    .str.contains("नगरपालीका वार्ड0", regex=False, na=False)
    .any()
)

False

In [12]:
# Substring probe: does any crosswalk village name contain this fragment?
# Membership in .unique() only matches a complete village name, so a
# shortened name could never hit; .str.contains does a literal
# (regex=False) substring search and treats missing names as non-matches.
bool(
    df_xwalk["mauja_village_lr"]
    .str.contains("नगरपालीका वार्ड", regex=False, na=False)
    .any()
)

False