In [2]:
import geopandas as gpd
import numpy as np
import pandas as pd
import janitor
import json
from tqdm.notebook import tqdm

from utilities.utils import get_fulldata
from indicate import transliterate

## SHRUG

In [3]:
# Load the SHRUG state polygons and look up the `pc11_state_id` of Bihar;
# the cells below use it to restrict the lower-level layers to that state.
gdf_state = gpd.read_file("geometries_shrug-v1.5.samosa-open-polygons-gpkg/state.gpkg")
gdf_state.info()
bihar_state_id = gdf_state.set_index("state_name")["pc11_state_id"].loc["Bihar"]
gdf_state

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   pc11_state_id  35 non-null     object  
 1   state_name     35 non-null     object  
 2   geometry       35 non-null     geometry
dtypes: geometry(1), object(2)
memory usage: 968.0+ bytes


Unnamed: 0,pc11_state_id,state_name,geometry
0,1,Jammu and Kashmir,"POLYGON ((77.95837 35.48178, 77.96405 35.48433..."
1,2,Himachal Pradesh,"POLYGON ((76.80943 33.23872, 76.81593 33.23535..."
2,3,Punjab,"POLYGON ((75.83876 32.52156, 75.83898 32.52128..."
3,4,Chandigarh,"POLYGON ((76.79191 30.77115, 76.79229 30.77118..."
4,5,Uttarakhand,"POLYGON ((79.22439 31.34099, 79.22624 31.33888..."
5,6,Haryana,"POLYGON ((76.84307 30.88633, 76.84365 30.88618..."
6,7,NCT Of Delhi,"POLYGON ((77.07688 28.88184, 77.07801 28.88149..."
7,8,Rajasthan,"POLYGON ((73.90898 30.05334, 73.90437 30.05000..."
8,9,Uttar Pradesh,"MULTIPOLYGON (((79.36095 25.13890, 79.36217 25..."
9,10,Bihar,"MULTIPOLYGON (((84.51370 24.25774, 84.51182 24..."


In [4]:
# Bihar rows of the SHRUG district layer, with district names lower-cased
# so they can be compared against the transliterated land-record names.
gdf_districts = (
    gpd.read_file("geometries_shrug-v1.5.samosa-open-polygons-gpkg/district.gpkg")
    .loc[lambda frame: frame["pc11_state_id"] == bihar_state_id]
    .reset_index(drop=True)
    .assign(district_name=lambda frame: frame["district_name"].str.lower())
)
gdf_districts.info()
gdf_districts

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 38 entries, 304 to 341
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   pc11_state_id     38 non-null     object  
 1   pc11_district_id  38 non-null     object  
 2   district_name     38 non-null     object  
 3   geometry          38 non-null     geometry
dtypes: geometry(1), object(3)
memory usage: 1.5+ KB


Unnamed: 0,pc11_state_id,pc11_district_id,district_name,geometry
304,10,203,pashchim champaran,"POLYGON ((84.12469 27.51105, 84.12628 27.51085..."
305,10,204,purba champaran,"POLYGON ((84.83041 27.01919, 84.83128 27.01916..."
306,10,205,sheohar,"POLYGON ((85.32738 26.65375, 85.32803 26.65304..."
307,10,206,sitamarhi,"POLYGON ((85.66200 26.84586, 85.66625 26.84437..."
308,10,207,madhubani,"POLYGON ((86.06059 26.65650, 86.06645 26.65505..."
309,10,208,supaul,"POLYGON ((87.05293 26.55341, 87.05334 26.55334..."
310,10,209,araria,"POLYGON ((87.06975 26.56275, 87.06950 26.55288..."
311,10,210,kishanganj,"MULTIPOLYGON (((87.84553 25.98449, 87.84566 25..."
312,10,211,purnia,"POLYGON ((87.71802 26.11298, 87.71835 26.11297..."
313,10,212,katihar,"POLYGON ((87.82502 25.88259, 87.82553 25.88247..."


In [5]:
# Divisions?
gdf_subdistricts = (
    gpd.read_file("geometries_shrug-v1.5.samosa-open-polygons-gpkg/subdistrict.gpkg")
    .query(f"pc11_state_id=='{bihar_state_id}'")
    .assign(subdistrict_name=lambda df: df["subdistrict_name"].str.lower())
    .reset_index(drop=True)
)
gdf_subdistricts.info()
gdf_subdistricts.head(5)

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 534 entries, 2124 to 2657
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   pc11_state_id        534 non-null    object  
 1   pc11_district_id     534 non-null    object  
 2   pc11_subdistrict_id  534 non-null    object  
 3   subdistrict_name     534 non-null    object  
 4   geometry             534 non-null    geometry
dtypes: geometry(1), object(4)
memory usage: 25.0+ KB


Unnamed: 0,pc11_state_id,pc11_district_id,pc11_subdistrict_id,subdistrict_name,geometry
2124,10,203,1013,sidhaw,"POLYGON ((83.94735 27.44297, 83.94897 27.44205..."
2125,10,203,1014,ramnagar,"POLYGON ((84.12469 27.51105, 84.12628 27.51085..."
2126,10,203,1015,gaunaha,"POLYGON ((84.38576 27.37299, 84.39084 27.36994..."
2127,10,203,1016,mainatanr,"POLYGON ((84.65097 27.29101, 84.65572 27.28294..."
2128,10,203,1017,narkatiaganj,"POLYGON ((84.58513 27.19071, 84.58565 27.19002..."


In [60]:
# Bihar rows of the SHRUG village layer, with town/village names lower-cased
# (missing names stay missing).
gdf_villages = (
    gpd.read_file("geometries_shrug-v1.5.samosa-open-polygons-gpkg/village.gpkg")
    .loc[lambda frame: frame["pc11_state_id"] == bihar_state_id]
    .reset_index(drop=True)
    .assign(town_village_name=lambda frame: frame["town_village_name"].str.lower())
)
gdf_villages.info()
gdf_villages.head(5)

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 45107 entries, 0 to 45106
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   pc11_state_id         45107 non-null  object  
 1   pc11_district_id      45107 non-null  object  
 2   pc11_subdistrict_id   45107 non-null  object  
 3   pc11_town_village_id  45107 non-null  object  
 4   town_village_name     45072 non-null  object  
 5   geometry              45107 non-null  geometry
dtypes: geometry(1), object(5)
memory usage: 2.1+ MB


Unnamed: 0,pc11_state_id,pc11_district_id,pc11_subdistrict_id,pc11_town_village_id,town_village_name,geometry
0,10,203,1013,215989,kalapani,"POLYGON ((83.95195 27.44241, 83.95057 27.44029..."
1,10,203,1013,215990,bhaisalotan,"POLYGON ((83.94735 27.44297, 83.94446 27.44172..."
2,10,203,1013,215991,tharhi,"POLYGON ((83.86745 27.43616, 83.87093 27.43540..."
3,10,203,1013,215992,pipra,"POLYGON ((83.90969 27.40805, 83.90759 27.40606..."
4,10,203,1013,215993,kotaraha,"POLYGON ((83.91060 27.39674, 83.90982 27.39623..."


## Divisions (subdistricts?): Land records

* Reference notebook (basic summary of the Bihar land records): https://github.com/soodoku/land/blob/main/scripts/00_summary_basic_bihar_land_record.ipynb

In [39]:
# Unique Hindi division names from the land records plus an English
# transliteration, cached in a CSV so the transliteration pass does not run on
# every execution of the notebook.
fp_divisions = "divisions.csv"
try:
    df_divisions = pd.read_csv(fp_divisions)
except FileNotFoundError:
    # No cache yet: rebuild it from the land records instead of failing.
    df_divisions = (
        get_fulldata(usecols=["division"])
        .drop_duplicates(ignore_index=True)
        # Build the column in one pass as strings; writing strings cell by cell
        # into a NaN-initialised (float64) column is deprecated in pandas.
        .assign(
            eng=lambda df: [
                transliterate.hindi2english(name) for name in tqdm(df["division"])
            ]
        )
    )
    df_divisions.to_csv(fp_divisions, index=False)
    # Re-read so a freshly built frame is parsed exactly like a cache hit.
    df_divisions = pd.read_csv(fp_divisions)

df_divisions.head(3)

Unnamed: 0,division,eng
0,वैशाली,vaishali
1,पटेढ़ी बेलसर,patedhi beluser
2,लालगंज,lalganj


In [40]:
# Rows whose transliteration is shared with at least one other row
# (rows with a missing transliteration show up here as well).
df_divisions.loc[df_divisions["eng"].duplicated(keep=False)]

Unnamed: 0,division,eng
83,संग्रमपुर,sangrampur
106,बाराहाट,barahat
177,बरहट,barahat
268,सा0 कमाल,
276,चे0 बरियारपुर,
366,बगहा -1,
450,संग्रामपुर,sangrampur


In [41]:
# Try merge with SHRUG
df_divisions = (
    df_divisions
    .dropna(subset=["eng"])
    .drop_duplicates("eng")
    .merge(gdf_subdistricts, how="outer", left_on="eng", right_on="subdistrict_name", validate="1:m", indicator=True)
)
df_divisions

Unnamed: 0,division,eng,pc11_state_id,pc11_district_id,pc11_subdistrict_id,subdistrict_name,geometry,_merge
0,वैशाली,vaishali,10,220,01268,vaishali,"POLYGON ((85.17616 26.00890, 85.17655 26.00858...",both
1,पटेढ़ी बेलसर,patedhi beluser,,,,,,left_only
2,लालगंज,lalganj,10,220,01270,lalganj,"POLYGON ((85.27043 25.91443, 85.26983 25.91342...",both
3,भगवानपुर,bhagwanpur,10,220,01271,bhagwanpur,"POLYGON ((85.28554 25.93287, 85.28608 25.93210...",both
4,भगवानपुर,bhagwanpur,10,222,01308,bhagwanpur,"POLYGON ((85.97766 25.62317, 85.97795 25.62281...",both
...,...,...,...,...,...,...,...,...
811,,,10,239,01537,kako,"POLYGON ((85.09374 25.28850, 85.09469 25.28781...",right_only
812,,,10,239,01538,modanganj,"POLYGON ((85.14576 25.31181, 85.14654 25.31180...",right_only
813,,,10,239,01540,makhdumpur,"POLYGON ((85.07797 25.15016, 85.07815 25.14854...",right_only
814,,,10,240,01542,arwal,"POLYGON ((84.70671 25.31483, 84.70909 25.31016...",right_only


In [46]:
# Divisions whose transliterated name matched a SHRUG subdistrict name.
df_divisions.loc[df_divisions["_merge"] == "both"]

Unnamed: 0,division,eng,pc11_state_id,pc11_district_id,pc11_subdistrict_id,subdistrict_name,geometry,_merge
0,वैशाली,vaishali,10,220,01268,vaishali,"POLYGON ((85.17616 26.00890, 85.17655 26.00858...",both
2,लालगंज,lalganj,10,220,01270,lalganj,"POLYGON ((85.27043 25.91443, 85.26983 25.91342...",both
3,भगवानपुर,bhagwanpur,10,220,01271,bhagwanpur,"POLYGON ((85.28554 25.93287, 85.28608 25.93210...",both
4,भगवानपुर,bhagwanpur,10,222,01308,bhagwanpur,"POLYGON ((85.97766 25.62317, 85.97795 25.62281...",both
5,भगवानपुर,bhagwanpur,10,233,01455,bhagwanpur,"POLYGON ((83.62819 25.01638, 83.62923 25.01636...",both
...,...,...,...,...,...,...,...,...
503,औराई,aurai,10,216,01207,aurai,"POLYGON ((85.58092 26.39322, 85.58173 26.39088...",both
504,कटरा,katra,10,216,01208,katra,"POLYGON ((85.65221 26.32562, 85.65257 26.32386...",both
505,काँटी,kanti,10,216,01204,kanti,"MULTIPOLYGON (((85.38662 26.14974, 85.38624 26...",both
508,साहेबगंज,sahebganj,10,216,01199,sahebganj,"POLYGON ((85.01314 26.32546, 85.01443 26.32402...",both


In [47]:
# Land-record divisions with no SHRUG subdistrict of the same name.
df_divisions.loc[df_divisions["_merge"] == "left_only"]

Unnamed: 0,division,eng,pc11_state_id,pc11_district_id,pc11_subdistrict_id,subdistrict_name,geometry,_merge
1,पटेढ़ी बेलसर,patedhi beluser,,,,,,left_only
7,विदुपुर,vidupur,,,,,,left_only
8,राधोपुर,radhopur,,,,,,left_only
12,चेहरा कलॉ,chehra kalau,,,,,,left_only
13,राजापाकर,rajapakar,,,,,,left_only
...,...,...,...,...,...,...,...,...
500,बंदरा,bandara,,,,,,left_only
501,बोचहाँ,bochahan,,,,,,left_only
506,पारु,paru,,,,,,left_only
507,मोतीपुर,motipur,,,,,,left_only


In [48]:
# SHRUG subdistricts that no land-record division name matched.
df_divisions.loc[df_divisions["_merge"] == "right_only"]

Unnamed: 0,division,eng,pc11_state_id,pc11_district_id,pc11_subdistrict_id,subdistrict_name,geometry,_merge
511,,,10,203,01013,sidhaw,"POLYGON ((83.94735 27.44297, 83.94897 27.44205...",right_only
512,,,10,203,01014,ramnagar,"POLYGON ((84.12469 27.51105, 84.12628 27.51085...",right_only
513,,,10,203,01016,mainatanr,"POLYGON ((84.65097 27.29101, 84.65572 27.28294...",right_only
514,,,10,203,01017,narkatiaganj,"POLYGON ((84.58513 27.19071, 84.58565 27.19002...",right_only
515,,,10,203,01018,lauriya,"POLYGON ((84.37649 27.06930, 84.37713 27.06922...",right_only
...,...,...,...,...,...,...,...,...
811,,,10,239,01537,kako,"POLYGON ((85.09374 25.28850, 85.09469 25.28781...",right_only
812,,,10,239,01538,modanganj,"POLYGON ((85.14576 25.31181, 85.14654 25.31180...",right_only
813,,,10,239,01540,makhdumpur,"POLYGON ((85.07797 25.15016, 85.07815 25.14854...",right_only
814,,,10,240,01542,arwal,"POLYGON ((84.70671 25.31483, 84.70909 25.31016...",right_only


## Districts: Land records

In [55]:
# Unique Hindi district names from the land records plus an English
# transliteration, cached in a CSV so the transliteration pass does not run on
# every execution of the notebook.
fp_districts = "districts.csv"
try:
    df_districts = pd.read_csv(fp_districts)
except FileNotFoundError:
    # No cache yet: rebuild it from the land records instead of failing.
    df_districts = (
        get_fulldata(usecols=["district"])
        .drop_duplicates(ignore_index=True)
        # Build the column in one pass as strings; writing strings cell by cell
        # into a NaN-initialised (float64) column is deprecated in pandas.
        .assign(
            eng=lambda df: [
                transliterate.hindi2english(name) for name in tqdm(df["district"])
            ]
        )
    )
    df_districts.to_csv(fp_districts, index=False)
    # Re-read so a freshly built frame is parsed exactly like a cache hit.
    df_districts = pd.read_csv(fp_districts)

df_districts.head(3)

0it [00:00, ?it/s]

Unnamed: 0,district,eng
0,वैशाली,vaishali
1,गया,gaya
2,रोहतास,rohtas


In [56]:
# Try merge with SHRUG
df_districts = (
    df_districts
#     .dropna(subset=["eng"])
#     .drop_duplicates("eng")
    .merge(gdf_districts, how="outer", left_on="eng", right_on="district_name", validate="1:1", indicator=True)
)
df_districts

Unnamed: 0,district,eng,pc11_state_id,pc11_district_id,district_name,geometry,_merge
0,वैशाली,vaishali,10.0,220.0,vaishali,"POLYGON ((85.25131 26.00914, 85.25170 26.00909...",both
1,गया,gaya,10.0,236.0,gaya,"MULTIPOLYGON (((84.51472 24.25770, 84.51427 24...",both
2,रोहतास,rohtas,10.0,234.0,rohtas,"POLYGON ((84.05529 25.37326, 84.05681 25.37270...",both
3,समस्तीपुर,samastipur,10.0,221.0,samastipur,"POLYGON ((85.71753 26.08220, 85.71798 26.08212...",both
4,मुंगेर,munger,10.0,226.0,munger,"POLYGON ((86.56461 25.49128, 86.56803 25.49122...",both
5,बक्सर,bucksre,,,,,left_only
6,अररिया,araria,10.0,209.0,araria,"POLYGON ((87.06975 26.56275, 87.06950 26.55288...",both
7,बांका,banka,10.0,225.0,banka,"POLYGON ((86.73864 25.12136, 86.73858 25.12072...",both
8,सुपौल,supaul,10.0,208.0,supaul,"POLYGON ((87.05293 26.55341, 87.05334 26.55334...",both
9,शिवहर,shivahar,,,,,left_only


In [58]:
# Order by merge status so rows with the same match outcome are grouped together.
df_districts.sort_values(by="_merge")

Unnamed: 0,district,eng,pc11_state_id,pc11_district_id,district_name,geometry,_merge
26,पूर्णियॉं,purniyaaayaaon,,,,,left_only
25,खगड़िया,khagriya,,,,,left_only
24,सीतामढ़ी,sitamadhi,,,,,left_only
33,कैमूर,camor,,,,,left_only
34,पूर्वी चम्पारण,purvi champaran,,,,,left_only
21,किशनगंज,kishenganj,,,,,left_only
20,अरवल,araval,,,,,left_only
19,बेगुसराए,begusaray,,,,,left_only
37,मुज़फ्फरपुर,muzfarpur,,,,,left_only
16,नवादा,navada,,,,,left_only


## Residence (villages?): Land records

In [62]:
fp_residences = "residences.csv"
# NOTE(review): this cell was copied from the district cell and still reads the
# "district" column, so residences.csv ends up holding district names (the output
# below matches the district output). Presumably a residence/village column was
# intended -- confirm its name in the land-record data and update `source_col`.
source_col = "district"
if True:
    # Unique Hindi names plus an English transliteration.
    df_residences = (
        get_fulldata(usecols=[source_col])
        .drop_duplicates(ignore_index=True)
        # Build the column in one pass as strings; writing strings cell by cell
        # into a NaN-initialised (float64) column is deprecated in pandas.
        .assign(
            eng=lambda df: [
                transliterate.hindi2english(name) for name in tqdm(df[source_col])
            ]
        )
    )
    df_residences.to_csv(fp_residences, index=False)

df_residences = pd.read_csv(fp_residences)
df_residences.head(3)

0it [00:00, ?it/s]

Unnamed: 0,district,eng
0,वैशाली,vaishali
1,गया,gaya
2,रोहतास,rohtas
