In [119]:
import re
import pandas as pd
pd.set_option('display.max_columns', 50)
import janitor
from utilities.utils import get_fulldata

usecols = ["district", "division", "mouza"]
geo_slugs = ['zilla_district_lr', 'anchal_block_lr', 'mauja_village_lr']


# Get Bihar land record data
df = (get_fulldata(**{'usecols':usecols})
      .rename_column("district", "zilla_district_lr")
      .rename_column("division", "anchal_block_lr")
      .rename_column("mouza", "mauja_village_lr")
      # Make sure no whitespaces screw up the merge
      .assign(
          **{col: lambda df, k=col: df[k].str.strip() for col in geo_slugs}
      )  
      .drop_duplicates(geo_slugs)
      .reset_index(drop=True)
     )
df.shape
df

Unnamed: 0,zilla_district_lr,anchal_block_lr,mauja_village_lr
0,वैशाली,वैशाली,चकपिताम्बर
1,वैशाली,वैशाली,सिमरा
2,वैशाली,वैशाली,फुलाढ़
3,वैशाली,वैशाली,मंसूरपुर
4,वैशाली,वैशाली,चकना मारूफ उर्फ़ चकनथुआ
...,...,...,...
35623,मुज़फ्फरपुर,सरैया,बैघा टोला
35624,मुज़फ्फरपुर,सरैया,आनन्दपुर गंगोलिया
35625,मुज़फ्फरपुर,सरैया,विशुनपुर दुवियाही
35626,मुज़फ्फरपुर,सरैया,शादीक पुर


In [115]:
# Get crosswalk data (from Aaditya)
df_xwalk = (
    pd.read_stata("../data/br_lr_census_crosswalk.dta")
    .assign(anchal_block_lr=lambda df: df["anchal_block_lr"].apply(lambda x: re.sub(r"\[.*?\]", "", x)))
#     .assign(anchal_block_lr=lambda df: df["anchal_block_lr"].str.strip())
#       Make sure no whitespaces screw up the merge
      .assign(
          **{col: lambda df, k=col: df[k].str.strip() for col in geo_slugs}
      )    
    # village is not the lowest geographical unit
    .drop_duplicates(geo_slugs)
    .reset_index(drop=True)
)
df_xwalk.shape
df_xwalk

Unnamed: 0,zilla_district_lr,zilla_district_lr_code,anumandal_subdiv_lr,anumandal_subdiv_lr_code,anchal_block_lr,anchal_circle_lr_code,mauja_village_lr,revenue_station_no_lr,total_kasara_lr,total_accounts_lr,statecode_census,statename_census,districtcode_census,districtname_census,subdistrictcode_census,subdistrictname_census,cdblockcode_census,cdblockname_census,grampanchayatcode_census,grampanchayatname_census,villagecode_census,villagename_census
0,जहानाबाद,33,जहानाबाद,1,जहानाबाद,1,अंजनी,407,235,182,10,BIHAR,239,Jehanabad,01536,Jehanabad,0524,Jehanabad,0003,Mandevigaha,260211,Anjani
1,जहानाबाद,33,जहानाबाद,1,जहानाबाद,1,अंजनीचक,405,27,19,10,BIHAR,239,Jehanabad,01536,Jehanabad,0524,Jehanabad,0003,Mandevigaha,260209,Anjanichak
2,जहानाबाद,33,जहानाबाद,1,जहानाबाद,1,अदलुचक,381,92,82,10,BIHAR,239,Jehanabad,01536,Jehanabad,0524,Jehanabad,0010,Pandui,260196,Adluchak
3,जहानाबाद,33,जहानाबाद,1,जहानाबाद,1,अमैन,366,765,614,10,BIHAR,239,Jehanabad,01536,Jehanabad,0524,Jehanabad,0011,Amain,260182,Amain
4,जहानाबाद,33,जहानाबाद,1,जहानाबाद,1,अरसठ,28,14,7,10,BIHAR,239,Jehanabad,01536,Jehanabad,0524,Jehanabad,0001,Surungapur Bhavanichak,260142,Arsath
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40952,सहरसा,12,सिमरी बख्तियारपुर,2,बनमा ईटहारी,10,शमसुद्दीनपुर,94,1681,266,10,BIHAR,214,Saharsa,01180,Banma Itahri,0168,Banma Itahri,0005,MAHARAS,226694,Shamsuddinpur
40953,सहरसा,12,सिमरी बख्तियारपुर,2,बनमा ईटहारी,10,सुगमा,132,790,145,10,BIHAR,214,Saharsa,01180,Banma Itahri,0168,Banma Itahri,0002,SAHURIA,226705,Sugma
40954,सहरसा,12,सिमरी बख्तियारपुर,2,बनमा ईटहारी,10,सर्वेला,89,0,0,10,BIHAR,214,Saharsa,01180,Banma Itahri,0168,Banma Itahri,0006,SARBELA,226690,Sarbela
40955,सहरसा,12,सिमरी बख्तियारपुर,2,बनमा ईटहारी,10,सहुरिया,130,0,0,10,BIHAR,214,Saharsa,01180,Banma Itahri,0168,Banma Itahri,"0001, 0002","ITAHRI, SAHURIA",226704,Sahuria


In [122]:
# Attempt merge between Bihar data and the crosswalk
df_merged = (
    df.merge(df_xwalk, how="left", on=geo_slugs, validate="1:1", indicator=True)
)
df_merge

Unnamed: 0,zilla_district_lr,anchal_block_lr,mauja_village_lr,zilla_district_lr_code,anumandal_subdiv_lr,anumandal_subdiv_lr_code,anchal_circle_lr_code,revenue_station_no_lr,total_kasara_lr,total_accounts_lr,statecode_census,statename_census,districtcode_census,districtname_census,subdistrictcode_census,subdistrictname_census,cdblockcode_census,cdblockname_census,grampanchayatcode_census,grampanchayatname_census,villagecode_census,villagename_census,_merge
0,वैशाली,वैशाली,चकपिताम्बर,18.0,हाजीपुर,1.0,1.0,1,472.0,107.0,10,BIHAR,220,Vaishali,01268,Vaishali,0256,Vaishali,1,Fuladh,234576,ChakPitamber,both
1,वैशाली,वैशाली,सिमरा,18.0,हाजीपुर,1.0,1.0,2,1577.0,171.0,10,BIHAR,220,Vaishali,01268,Vaishali,0256,Vaishali,2,Chakalhadad,234577,Simra,both
2,वैशाली,वैशाली,फुलाढ़,18.0,हाजीपुर,1.0,1.0,3,3331.0,404.0,10,BIHAR,220,Vaishali,01268,Vaishali,0256,Vaishali,1,Fuladh,234578,Phular,both
3,वैशाली,वैशाली,मंसूरपुर,18.0,हाजीपुर,1.0,1.0,4,1677.0,300.0,10,BIHAR,220,Vaishali,01268,Vaishali,0256,Vaishali,1,Fuladh,234579,Mansurpur,both
4,वैशाली,वैशाली,चकना मारूफ उर्फ़ चकनथुआ,18.0,हाजीपुर,1.0,1.0,5,834.0,94.0,10,BIHAR,220,Vaishali,01268,Vaishali,0256,Vaishali,1,Fuladh,234580,Chakna Maruf Chak Nathua,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35623,मुज़फ्फरपुर,सरैया,बैघा टोला,,,,,,,,,,,,,,,,,,,,left_only
35624,मुज़फ्फरपुर,सरैया,आनन्दपुर गंगोलिया,,,,,,,,,,,,,,,,,,,,left_only
35625,मुज़फ्फरपुर,सरैया,विशुनपुर दुवियाही,,,,,,,,,,,,,,,,,,,,left_only
35626,मुज़फ्फरपुर,सरैया,शादीक पुर,,,,,,,,,,,,,,,,,,,,left_only


In [126]:
df_merged["_merge"].describe()

count     35628
unique        2
top        both
freq      27720
Name: _merge, dtype: object

In [128]:
# No (district, block, village) tuple appear only in the crosswalk table (right table)
df_merged.query("_merge=='right_only'")

Unnamed: 0,zilla_district_lr,anchal_block_lr,mauja_village_lr,zilla_district_lr_code,anumandal_subdiv_lr,anumandal_subdiv_lr_code,anchal_circle_lr_code,revenue_station_no_lr,total_kasara_lr,total_accounts_lr,statecode_census,statename_census,districtcode_census,districtname_census,subdistrictcode_census,subdistrictname_census,cdblockcode_census,cdblockname_census,grampanchayatcode_census,grampanchayatname_census,villagecode_census,villagename_census,_merge


In [127]:
# 7.9k (district, block, village) tuples appear only in the Bihar land record table (left table)
df_merged.query("_merge=='left_only'")

Unnamed: 0,zilla_district_lr,anchal_block_lr,mauja_village_lr,zilla_district_lr_code,anumandal_subdiv_lr,anumandal_subdiv_lr_code,anchal_circle_lr_code,revenue_station_no_lr,total_kasara_lr,total_accounts_lr,statecode_census,statename_census,districtcode_census,districtname_census,subdistrictcode_census,subdistrictname_census,cdblockcode_census,cdblockname_census,grampanchayatcode_census,grampanchayatname_census,villagecode_census,villagename_census,_merge
200,वैशाली,हाजीपुर,सहलादपुर मितवार चक,,,,,,,,,,,,,,,,,,,,left_only
440,वैशाली,हाजीपुर,चकरविया उर्फ रविया चक,,,,,,,,,,,,,,,,,,,,left_only
1901,गया,नगर,नगरपालिका वार्ड न0-2,,,,,,,,,,,,,,,,,,,,left_only
1911,गया,नगर,नगरपालीका वार्ड0 10,,,,,,,,,,,,,,,,,,,,left_only
2362,गया,अतरी,वनबा गोसाईमठ,,,,,,,,,,,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35623,मुज़फ्फरपुर,सरैया,बैघा टोला,,,,,,,,,,,,,,,,,,,,left_only
35624,मुज़फ्फरपुर,सरैया,आनन्दपुर गंगोलिया,,,,,,,,,,,,,,,,,,,,left_only
35625,मुज़फ्फरपुर,सरैया,विशुनपुर दुवियाही,,,,,,,,,,,,,,,,,,,,left_only
35626,मुज़फ्फरपुर,सरैया,शादीक पुर,,,,,,,,,,,,,,,,,,,,left_only


In [132]:
"नगरपालीका वार्ड0 10" in df_xwalk.mauja_village_lr.unique()

False

In [133]:
"नगरपालीका वार्ड" in df_xwalk.mauja_village_lr.unique()

False