In [1]:
import numpy as np
import pandas as pd
import pickle
import geopandas as gpd

Names of STATES and DISTRICTS from NFHS-5
-----

In [2]:
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): unpickling can execute arbitrary code, so these files
    # must come from a trusted source.
    with open(path, 'rb') as handle:
        # The protocol version used is detected automatically, so we do not
        # have to specify it.
        return pickle.load(handle)


states = _read_pickle('../DATA/states.pickle')
# `districts` is used below as a dict: state name -> list of district names.
districts = _read_pickle('../DATA/districts.pickle')
indicators = _read_pickle('../DATA/indicators.pickle')


In [3]:
print('STATES : # DISTRICTS (from NFHS-5 database)')
print('===========================================')

# Visit the states in sorted order. np.sort is kept (instead of sorted) so
# the keys are NumPy strings, exactly as in the default-fill below.
state_names = np.sort(list(districts.keys()))

count = 0
for rank, name in enumerate(state_names, start=1):
    # A state/UT stored with an empty district list is treated as a single
    # district named after the state itself. This updates `districts` in
    # place, and later cells rely on the filled-in entry.
    if len(districts[name]) == 0:
        districts[name] = [name]
    n_districts = len(districts[name])
    print(rank, ':', name, ':', n_districts)
    count += n_districts

print(f'Total number of districts = {count}')

STATES : # DISTRICTS (from NFHS-5 database)
1 : Andaman_Nicobar_Islands : 3
2 : Andhra_Pradesh : 13
3 : Arunachal_Pradesh : 20
4 : Assam : 33
5 : Bihar : 38
6 : Chandigarh : 1
7 : Chhattisgarh : 27
8 : Dadra_Nagar_Haveli_Daman_Diu : 3
9 : Goa : 2
10 : Gujarat : 33
11 : Haryana : 22
12 : Himachal_Pradesh : 12
13 : Jammu_Kashmir : 20
14 : Jharkhand : 24
15 : Karnataka : 30
16 : Kerala : 14
17 : Ladakh : 2
18 : Lakshadweep : 1
19 : Madhya_Pradesh : 51
20 : Maharashtra : 36
21 : Manipur : 9
22 : Meghalaya : 11
23 : Mizoram : 8
24 : NCT_Delhi : 11
25 : Nagaland : 11
26 : Odisha : 30
27 : Puducherry : 4
28 : Punjab : 22
29 : Rajasthan : 33
30 : Sikkim : 4
31 : Tamil_Nadu : 32
32 : Telangana : 31
33 : Tripura : 8
34 : Uttar_Pradesh : 75
35 : Uttarakhand : 13
36 : West_Bengal : 20
Total number of districts = 707


Names of STATES and DISTRICTS from district-level shapefiles
-----

In [4]:
# Load the district-level shapefile into a GeoDataFrame.
districts_shp=gpd.read_file("../DATA/DistShapeFiles/output.shp")

In [5]:
# Show the attribute columns shipped with the shapefile.
districts_shp.columns.values

array(['objectid', 'statecode', 'statename', 'state_ut', 'distcode',
       'distname', 'distarea', 'totalpopul', 'totalhh', 'totpopmale',
       'totpopfema', 'st_areasha', 'st_lengths', 'geometry'], dtype=object)

In [6]:
# Sorted, de-duplicated state/UT names as spelled in the shapefile, for
# comparison against the NFHS-5 spellings.
np.unique(districts_shp['statename'].tolist())

array(['Andaman & Nicobar Islands', 'Andhra Pradesh', 'Arunachal Pradesh',
       'Assam', 'Bihar', 'Chandigarh', 'Chhatisgarh',
       'Dadra & Nagar Haveli', 'Daman & Diu', 'Goa', 'Gujarat', 'Haryana',
       'Himachal Pradesh', 'Jammu & Kashmir', 'Jharkhand', 'Karnataka',
       'Kerala', 'Ladakh', 'Lakshadweep', 'Madhya Pradesh', 'Maharashtra',
       'Manipur', 'Meghalaya', 'Mizoram', 'NCT of Delhi', 'Nagaland',
       'Orissa', 'Pondicherry', 'Punjab', 'Rajasthan', 'Sikkim',
       'Tamil Nadu', 'Telangana', 'Tripura', 'Uttar Pradesh',
       'Uttarakhand', 'West Bengal'], dtype='<U25')

In [7]:
# Based on above cell, rename states to match names used in NFHS-5.

# Shapefile spellings whose NFHS-5 form is purely mechanical: drop any '&'
# and join the remaining words with underscores.
_underscore_joined_states = [
    'Andaman & Nicobar Islands',
    'Andhra Pradesh',
    'Arunachal Pradesh',
    'Himachal Pradesh',
    'Jammu & Kashmir',
    'Madhya Pradesh',
    'Tamil Nadu',
    'Uttar Pradesh',
    'West Bengal',
]
rename_state = {
    shp_name: '_'.join(shp_name.replace('&', ' ').split())
    for shp_name in _underscore_joined_states
}

# Spellings that differ in more than spacing, plus the two union territories
# that NFHS-5 reports as a single combined entry.
rename_state.update({
    'Chhatisgarh': 'Chhattisgarh',
    'Dadra & Nagar Haveli': 'Dadra_Nagar_Haveli_Daman_Diu',
    'Daman & Diu': 'Dadra_Nagar_Haveli_Daman_Diu',
    'NCT of Delhi': 'NCT_Delhi',
    'Orissa': 'Odisha',
    'Pondicherry': 'Puducherry',
})

In [8]:
# Rename state names using dictionary rename_state; names that are not keys
# of the dictionary are left as they are.
shp_state_names = districts_shp['statename']
districts_shp['statename'] = shp_state_names.replace(to_replace=rename_state)

In [9]:
# For each state in the shapefile:
#   * print every shapefile district name and whether the same name exists
#     in the NFHS-5 list for that state,
#   * count the shapefile districts and how many have no NFHS-5 match,
#   * print the NFHS-5 district list for side-by-side comparison.

for st in np.unique(districts_shp['statename']):
    nfhs5_names = districts[st]

    # Shapefile districts belonging to this state, in file order.
    in_state = districts_shp['statename'] == st
    shp_names = list(districts_shp.loc[in_state, 'distname'])

    for name in shp_names:
        print(st, ':', name, ':', name in nfhs5_names)

    count = len(shp_names)
    count_false = sum(name not in nfhs5_names for name in shp_names)
    print(f'Num districts in shapefile = {count}')
    print(f'Num false = {count_false}')
    print('NFHS-5 districts for ', st, ':', nfhs5_names, len(nfhs5_names))
    print('\n')

Andaman_Nicobar_Islands : North  & Middle Andaman : False
Andaman_Nicobar_Islands : South Andaman : True
Andaman_Nicobar_Islands : Nicobars : False
Num districts in shapefile = 3
Num false = 2
NFHS-5 districts for  Andaman_Nicobar_Islands : ['Nicobar', 'North & Middle Andaman', 'South Andaman'] 3


Andhra_Pradesh : Prakasam : True
Andhra_Pradesh : Krishna : True
Andhra_Pradesh : Kurnool : True
Andhra_Pradesh : Guntur : True
Andhra_Pradesh : Vizianagaram : True
Andhra_Pradesh : Anantapur : True
Andhra_Pradesh : West Godavari : True
Andhra_Pradesh : Kadapa(YSR) : False
Andhra_Pradesh : East Godavari : True
Andhra_Pradesh : Visakhapatnam : True
Andhra_Pradesh : Sri Potti Sriramulu Nellore : True
Andhra_Pradesh : Chittoor : True
Andhra_Pradesh : Srikakulam : True
Num districts in shapefile = 13
Num false = 1
NFHS-5 districts for  Andhra_Pradesh : ['Anantapur', 'Chittoor', 'East Godavari', 'Guntur', 'Krishna', 'Kurnool', 'Prakasam', 'Sri Potti Sriramulu Nellore', 'Srikakulam', 'Visakhapatna

Jharkhand : Dumka : True
Num districts in shapefile = 24
Num false = 0
NFHS-5 districts for  Jharkhand : ['Bokaro', 'Chatra', 'Deoghar', 'Dhanbad', 'Dumka', 'Garhwa', 'Giridih', 'Godda', 'Gumla', 'Hazaribagh', 'Jamtara', 'Khunti', 'Kodarma', 'Latehar', 'Lohardaga', 'Pakur', 'Palamu', 'Pashchimi Singhbhum', 'Purbi Singhbhum', 'Ramgarh', 'Ranchi', 'Sahibganj', 'Saraikela-Kharsawan', 'Simdega'] 24


Karnataka : Mandya : True
Karnataka : Mysore : True
Karnataka : Chikkaballapura : True
Karnataka : Yadgir : True
Karnataka : Chikmagalur : True
Karnataka : Chitradurga : True
Karnataka : Haveri : True
Karnataka : Dakshina Kannada : True
Karnataka : Raichur : True
Karnataka : Kolar : True
Karnataka : Bijapur : True
Karnataka : Uttara Kannada : True
Karnataka : Davanagere : True
Karnataka : Dharwad : True
Karnataka : Bidar : True
Karnataka : Chamarajanagar : True
Karnataka : Gulbarga : True
Karnataka : Gadag : True
Karnataka : Udupi : True
Karnataka : Bagalkot : True
Karnataka : Hassan : True
Ka

Nagaland : Wokha : True
Nagaland : Zunheboto : True
Nagaland : Kiphire : True
Nagaland : Dimapur : True
Nagaland : Peren : True
Nagaland : Mon : True
Nagaland : Tuensang : True
Nagaland : Mokokchung : True
Nagaland : Longleng : True
Nagaland : Phek : True
Num districts in shapefile = 11
Num false = 0
NFHS-5 districts for  Nagaland : ['Dimapur', 'Kiphire', 'Kohima', 'Longleng', 'Mokokchung', 'Mon', 'Peren', 'Phek', 'Tuensang', 'Wokha', 'Zunheboto'] 11


Odisha : Anugul : True
Odisha : Koraput : True
Odisha : Nabarangapur : True
Odisha : Bargarh : True
Odisha : Balangir : True
Odisha : Subarnapur : True
Odisha : Puri : True
Odisha : Debagarh : True
Odisha : Jagatsinghapur : True
Odisha : Kandhamal : True
Odisha : Khordha : True
Odisha : Mayurbhanj : True
Odisha : Malkangiri : True
Odisha : Baleshwar : True
Odisha : Sambalpur : True
Odisha : Gajapati : True
Odisha : Dhenkanal : True
Odisha : Nayagarh : True
Odisha : Cuttack : True
Odisha : Jajapur : True
Odisha : Kalahandi : True
Odisha :

West_Bengal : Jalpaiguri : True
West_Bengal : Uttar Dinajpur : True
West_Bengal : Purba Medinipur : True
West_Bengal : Haora : True
West_Bengal : North Twenty Four Parganas : True
West_Bengal : Birbhum : True
Num districts in shapefile = 19
Num false = 2
NFHS-5 districts for  West_Bengal : ['Bankura', 'Birbhum', 'Dakshin Dinajpur', 'Darjeeling', 'Haora', 'Hugli', 'Jalpaiguri', 'Koch Bihar', 'Kolkata', 'Maldah', 'Murshidabad', 'Nadia', 'North Twenty Four Parganas', 'Paschim Barddhaman', 'Paschim Medinipur', 'Purba Barddhaman', 'Purba Medinipur', 'Puruliya', 'South Twenty Four Parganas', 'Uttar Dinajpur'] 20




In [10]:
# Based on output of above cell, correct district names in shapefile.
#
# Corrections are grouped by the state/UT tag they were found under. The tags
# are for readers only: the flattened mapping below is keyed by the shapefile
# district name alone (shapefile spelling -> NFHS-5 spelling).
_distname_fixes_by_state = {
    'AP': {
        'Kadapa(YSR)': 'Y.S.R.',
    },
    'AR': {
        'Papum Pare': 'Papumpare',
    },
    'CT': {
        'Bemetra': 'Bemetara',
        'Gariaband': 'Gariyaband',
        # Same district according to Wikipedia.
        'Dakshin Bastar Dantewada': 'Dantewada',
        'Kondagaon': 'Kodagaon',
    },
    'GJ': {
        'Sabar Kantha': 'Sabarkantha',
        'Banas Kantha': 'Banaskantha',
        'Dohad': 'Dahod',
        'Ahmadabad': 'Ahmedabad',
        'Batod': 'Botad',
        'Panch Mahals': 'Panchmahal',
    },
    'HP': {
        'Lahul & Spiti': 'Lahul and Spiti',
    },
    'MP': {
        'Narsimhapur': 'Narsinghpur',
    },
    'MH': {
        'Ahmadnagar': 'Ahmednagar',
        'Buldana': 'Buldhana',
    },
    'Manipur': {
        'Thoubal': 'Toubal',
    },
    'PJ': {
        'Shahid Bhagat Singh Nagar': 'SBS Nagar',
        'Gurdaspur': 'Gurudaspur',
    },
    'TG': {
        'Warangal (R)': 'Warangal Rural',
        'Jangaon': 'Jangoan',
        'Bhadradri': 'Bhadradri Kothagudem',
        'Hydrabad': 'Hyderabad',
        'Rangareddy': 'Ranga Reddy',
        'Komaram Bheem': 'Komaram Bheem Asifabad',
        'Jagtial': 'Jagitial',
        'Warangal (U)': 'Warangal Urban',
        'Jayashankar': 'Jayashankar Bhupalapally',
        'Jogulamba': 'Jogulamba Gadwal',
        'Yadadri': 'Yadadri Bhuvanagiri',
        'Medchal': 'Medchal-Malkajgiri',
    },
    'TR': {
        'Unokoti': 'Unakoti',
        'Sipahijula': 'Sepahijala',
    },
    'UP': {
        'Sant Kabir Nagar': 'Sant Kabeer Nagar',
        'Bara Banki': 'Barabanki',
        'Mahrajganj': 'Maharajganj',
        'Samli': 'Shamli',
        'Shrawasti': 'Shravasti',
        'Allahabad': 'Prayagraj',
    },
    'UT': {
        'Garhwal': 'Pauri Garhwal',
        'Hardwar': 'Haridwar',
        'Udham Singh Nagar': 'Udam Singh Nagar',
    },
    'WB': {
        # 'Barddhaman' is intentionally not mapped: the shapefile's
        # Barddhaman is split into 'Purba Barddhaman' and
        # 'Paschim Barddhaman' in NFHS-5, so there is no one-to-one rename.
        'Darjiling': 'Darjeeling',
    },
    'AN': {
        # The shapefile spelling has two spaces before the ampersand.
        'North  & Middle Andaman': 'North & Middle Andaman',
        'Nicobars': 'Nicobar',
    },
}

correct_distnames = {
    shp_name: nfhs5_name
    for fixes in _distname_fixes_by_state.values()
    for shp_name, nfhs5_name in fixes.items()
}

In [11]:
# Apply the spelling corrections to the shapefile district names. The lookup
# is by district name only, so a listed name is rewritten in whichever state
# it occurs.
shp_dist_names = districts_shp['distname']
districts_shp['distname'] = shp_dist_names.replace(to_replace=correct_distnames)

In [12]:
# Group the (corrected) shapefile district names by state:
# D_shp maps state name -> list of district names in that state.
D_shp = {}
for state_name, dist_name in zip(districts_shp['statename'], districts_shp['distname']):
    D_shp.setdefault(state_name, []).append(dist_name)

# Column headers for the comparison table.
srn = 'No.'
state = 'State'
dcount_nfhs5 = '# districts (from NFHS5)'
dcount_shp = '# districts (from shp)'
print(f'{srn:>4} | {state:>29} | {dcount_shp:>24} | {dcount_nfhs5:>24} | diff')
print('='*99)

# One row per shapefile state, in sorted order. Every list in D_shp holds at
# least one name by construction, so no empty-list fallback is needed here.
# `diff` is (NFHS-5 count) - (shapefile count); `count` accumulates the
# shapefile total.
count = 0
for idx, k in enumerate(sorted(D_shp), start=1):
    n_shp = len(D_shp[k])
    n_nfhs5 = len(districts[k])
    diff = n_nfhs5 - n_shp
    print(f'{idx:>4} | {k:>29} | {n_shp:>24} | {n_nfhs5:>24} | {diff:>3}')
    count += n_shp

print(f'Total number of districts = {count}')

 No. |                         State |   # districts (from shp) | # districts (from NFHS5) | diff
   1 |       Andaman_Nicobar_Islands |                        3 |                        3 |   0
   2 |                Andhra_Pradesh |                       13 |                       13 |   0
   3 |             Arunachal_Pradesh |                       16 |                       20 |   4
   4 |                         Assam |                       27 |                       33 |   6
   5 |                         Bihar |                       38 |                       38 |   0
   6 |                    Chandigarh |                        1 |                        1 |   0
   7 |                  Chhattisgarh |                       27 |                       27 |   0
   8 |  Dadra_Nagar_Haveli_Daman_Diu |                        3 |                        3 |   0
   9 |                           Goa |                        2 |                        2 |   0
  10 |                       

In [13]:
# Composite join key for the shapefile rows: the district name immediately
# followed by the state name, with no separator. The NFHS-5 table builds its
# key the same way (District + State), so the two must stay in step.
shp_district = districts_shp['distname']
shp_state = districts_shp['statename']
districts_shp['statedist'] = shp_district + shp_state

In [14]:
# Summarise the shapefile frame (row count, dtypes, non-null counts) after
# adding the 'statedist' key.
districts_shp.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 693 entries, 0 to 692
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   objectid    693 non-null    int64   
 1   statecode   693 non-null    object  
 2   statename   693 non-null    object  
 3   state_ut    693 non-null    object  
 4   distcode    693 non-null    object  
 5   distname    693 non-null    object  
 6   distarea    629 non-null    float64 
 7   totalpopul  660 non-null    float64 
 8   totalhh     660 non-null    float64 
 9   totpopmale  660 non-null    float64 
 10  totpopfema  660 non-null    float64 
 11  st_areasha  693 non-null    float64 
 12  st_lengths  693 non-null    float64 
 13  geometry    693 non-null    geometry
 14  statedist   693 non-null    object  
dtypes: float64(7), geometry(1), int64(1), object(6)
memory usage: 81.3+ KB


## Merge the NFHS-5 DataFrame (`df_nfhs5`, read from `NFHS5_imputed.csv`) into the shapefile GeoDataFrame (`districts_shp`) on the shared key `statedist` (district name + state name)

In [15]:
# Read the imputed NFHS-5 table, using the first CSV column as the index.
df_nfhs5 = pd.read_csv('../DATA/NFHS5_imputed.csv', index_col = 0)
# Display the frame as the cell output.
df_nfhs5

Unnamed: 0,State,District,Q1_NFHS5,Q2_NFHS5,Q3_NFHS5,Q4_NFHS5,Q5_NFHS5,Q6_NFHS5,Q7_NFHS5,Q8_NFHS5,...,TF_Q95,TF_Q96,TF_Q97,TF_Q98,TF_Q99,TF_Q100,TF_Q101,TF_Q102,TF_Q103,TF_Q104
0,Andhra_Pradesh,Anantapur,59.5,24.3,1047.0,881.0,94.3,87.2,99.6,98.8,...,YES,YES,YES,YES,YES,YES,YES,YES,YES,YES
1,Andhra_Pradesh,Chittoor,65.6,22.6,1024.0,1019.0,94.7,74.3,99.7,98.5,...,YES,YES,YES,YES,YES,YES,YES,YES,YES,YES
2,Andhra_Pradesh,East Godavari,75.4,20.5,995.0,882.0,93.0,68.2,98.8,97.9,...,YES,YES,YES,YES,YES,YES,YES,YES,YES,YES
3,Andhra_Pradesh,Guntur,64.9,22.4,1055.0,941.0,92.3,82.5,99.2,99.3,...,YES,YES,YES,YES,YES,YES,YES,YES,YES,YES
4,Andhra_Pradesh,Krishna,74.0,20.4,1064.0,1139.0,96.4,86.3,99.6,94.4,...,YES,YES,YES,YES,YES,YES,YES,YES,YES,YES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
700,Ladakh,Leh (Ladakh),66.3,21.0,967.0,949.0,98.7,77.7,99.8,87.1,...,YES,YES,YES,YES,YES,YES,YES,YES,YES,YES
701,Puducherry,Karaikal,86.3,21.9,1132.0,884.0,99.1,95.9,99.6,99.9,...,YES,YES,YES,YES,YES,YES,YES,YES,YES,YES
702,Puducherry,Mahe,99.2,16.8,1164.0,1202.0,99.4,96.8,99.7,98.0,...,YES,YES,YES,YES,YES,YES,YES,YES,YES,YES
703,Puducherry,Puducherry,83.8,20.1,1106.0,963.0,99.6,91.3,100.0,100.0,...,YES,YES,YES,YES,YES,YES,YES,YES,YES,YES


In [16]:
# Composite join key for the NFHS-5 rows: district name immediately followed
# by state name, with no separator. This matches how the shapefile's
# 'statedist' column is built (distname + statename).
nfhs5_district = df_nfhs5['District']
nfhs5_state = df_nfhs5['State']
df_nfhs5['statedist'] = nfhs5_district + nfhs5_state

In [17]:
# Summarise the NFHS-5 frame after adding the 'statedist' key.
df_nfhs5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 705 entries, 0 to 704
Columns: 193 entries, State to statedist
dtypes: float64(95), object(98)
memory usage: 1.0+ MB


In [18]:
# Inner join on the composite key: only districts present in BOTH the
# shapefile and the NFHS-5 table are kept. Unmatched rows from either side
# are dropped here and are listed by the checks further down.
districts_new = districts_shp.merge(
    df_nfhs5,
    how='inner',
    on='statedist',
)

In [19]:
# Summarise the merged frame; its row count shows how many districts matched.
districts_new.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 689 entries, 0 to 688
Columns: 207 entries, objectid to TF_Q104
dtypes: float64(102), geometry(1), int64(1), object(103)
memory usage: 1.1+ MB


In [20]:
# List every column of the merged frame (shapefile attributes followed by
# the NFHS-5 columns).
list(districts_new.columns)

['objectid',
 'statecode',
 'statename',
 'state_ut',
 'distcode',
 'distname',
 'distarea',
 'totalpopul',
 'totalhh',
 'totpopmale',
 'totpopfema',
 'st_areasha',
 'st_lengths',
 'geometry',
 'statedist',
 'State',
 'District',
 'Q1_NFHS5',
 'Q2_NFHS5',
 'Q3_NFHS5',
 'Q4_NFHS5',
 'Q5_NFHS5',
 'Q6_NFHS5',
 'Q7_NFHS5',
 'Q8_NFHS5',
 'Q9_NFHS5',
 'Q10_NFHS5',
 'Q11_NFHS5',
 'Q12_NFHS5',
 'Q13_NFHS5',
 'Q14_NFHS5',
 'Q15_NFHS5',
 'Q16_NFHS5',
 'Q17_NFHS5',
 'Q18_NFHS5',
 'Q19_NFHS5',
 'Q20_NFHS5',
 'Q21_NFHS5',
 'Q22_NFHS5',
 'Q23_NFHS5',
 'Q24_NFHS5',
 'Q25_NFHS5',
 'Q26_NFHS5',
 'Q27_NFHS5',
 'Q28_NFHS5',
 'Q29_NFHS5',
 'Q30_NFHS5',
 'Q31_NFHS5',
 'Q32_NFHS5',
 'Q33_NFHS5',
 'Q34_NFHS5',
 'Q35_NFHS5',
 'Q36_NFHS5',
 'Q37_NFHS5',
 'Q38_NFHS5',
 'Q39_NFHS5',
 'Q41_NFHS5',
 'Q42_NFHS5',
 'Q43_NFHS5',
 'Q44_NFHS5',
 'Q45_NFHS5',
 'Q46_NFHS5',
 'Q48_NFHS5',
 'Q49_NFHS5',
 'Q50_NFHS5',
 'Q51_NFHS5',
 'Q52_NFHS5',
 'Q53_NFHS5',
 'Q54_NFHS5',
 'Q55_NFHS5',
 'Q56_NFHS5',
 'Q57_NFHS5',
 'Q58_NFH

In [21]:
# Sanity check on the merge: in every merged row the shapefile's state and
# district names must equal the NFHS-5 ones. The comparison is done on whole
# columns instead of materialising each row with .iloc several times.
# Prints the positional row number of every offending row, and nothing when
# the merge is consistent.
state_differs = districts_new['statename'] != districts_new['State']
district_differs = districts_new['distname'] != districts_new['District']
for i in np.flatnonzero((state_differs | district_differs).to_numpy()):
    print(i, 'District mismatched!')


In [22]:
print('Districts in NFHS-5 that are not listed in the shapefile:')

# Build the shapefile key lookup once. Previously the whole column was
# converted to a list and scanned linearly for every NFHS-5 row.
# Assumes the key columns hold plain strings (no missing values) -- TODO confirm.
shp_keys = set(districts_shp['statedist'])

count = 1
for key, state_name, dist_name in zip(df_nfhs5['statedist'],
                                      df_nfhs5['State'],
                                      df_nfhs5['District']):
    if key not in shp_keys:
        print(f"{count:>3} {state_name:>18} {dist_name:>25}")
        count += 1


Districts in NFHS-5 that are not listed in the shapefile:
  1  Arunachal_Pradesh                 Kra Daadi
  2  Arunachal_Pradesh                  Longding
  3  Arunachal_Pradesh                    Namsai
  4  Arunachal_Pradesh                     Siang
  5              Assam                 Biswanath
  6              Assam                 Charaideo
  7              Assam                     Hojai
  8              Assam                    Majuli
  9              Assam   South Salmara Mancachar
 10              Assam        West Karbi Anglong
 11            Haryana             Charkhi Dadri
 12     Madhya_Pradesh                Agar Malwa
 13        West_Bengal        Paschim Barddhaman
 14        West_Bengal          Purba Barddhaman
 15          NCT_Delhi                  Shahdara
 16          NCT_Delhi                South East


In [23]:
print('Districts in the shapefile that are not listed in NFHS-5:')

# Build the NFHS-5 key lookup once. Previously the whole column was converted
# to a list and scanned linearly for every shapefile row.
# Assumes the key columns hold plain strings (no missing values) -- TODO confirm.
nfhs5_keys = set(df_nfhs5['statedist'])

count = 1
for key, state_name, dist_name in zip(districts_shp['statedist'],
                                      districts_shp['statename'],
                                      districts_shp['distname']):
    if key not in nfhs5_keys:
        print(f"{count:>3} {state_name:>18} {dist_name:>25}")
        count += 1


Districts in the shapefile that are not listed in NFHS-5:
  1      Jammu_Kashmir        DATA NOT AVAILABLE
  2        Lakshadweep               Lakshadweep
  3        West_Bengal                Barddhaman
  4         Chandigarh                Chandigarh


Write Merged DF To Shapefile
----

In [24]:
# Write the merged frame (geometry + NFHS-5 columns) to disk.
# NOTE(review): the .shp format is understood to limit field names to 10
# characters -- confirm no merged column name is truncated on write.
districts_new.to_file("../DATA/NFHS5_districts.shp")