In [1]:
import pandas as pd
import os
import unicodedata

In [2]:
def sanitize_string(input_str):
    """Turn a label into a lowercase, underscore-separated identifier.

    The text is NFKD-normalized and its combining marks are dropped (which
    strips accents), lowercased, and then spaces, slashes and parentheses
    become underscores while ampersands are removed. Runs of underscores of
    any length are collapsed to a single underscore and leading/trailing
    underscores are trimmed.

    Args:
        input_str: The text to sanitize.

    Returns:
        The sanitized string. It is empty when the input is empty or
        consists only of characters that are removed or turned into
        separators.
    """
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    s = ''.join(c for c in nfkd_form if not unicodedata.combining(c)).lower()
    for separator in (' ', '/', '(', ')'):
        s = s.replace(separator, '_')
    s = s.replace('&', '')
    # Splitting on '_' and dropping empty pieces collapses underscore runs of
    # any length and trims both ends in one step; unlike indexing s[0]/s[-1],
    # it is also safe when nothing is left of the string.
    return '_'.join(piece for piece in s.split('_') if piece)

In [4]:
# Load the raw census CSV, then normalize both the column labels and the
# values of the 'name' column with sanitize_string.
raw_census_path = os.path.join('india_census', 'raw_india_district_census_2011.csv')
df = pd.read_csv(raw_census_path)
df = df.rename(columns=sanitize_string)
df['name'] = df['name'].map(sanitize_string)

In [5]:
# Display the frame after its column labels and 'name' values were sanitized.
df

Unnamed: 0,state,district,subdistt,town_village,ward,eb,level,name,tru,no_hh,...,marg_al_0_3_f,marg_hh_0_3_p,marg_hh_0_3_m,marg_hh_0_3_f,marg_ot_0_3_p,marg_ot_0_3_m,marg_ot_0_3_f,non_work_p,non_work_m,non_work_f
0,0,0,0,0,0,0,India,india,Total,249501663,...,5820403,1420348,491424,928924,5518450,3082613,2435837,728966109,291330383,437635726
1,0,0,0,0,0,0,India,india,Rural,168612897,...,5640822,1154975,386077,768898,3803400,1972348,1831052,485005760,200944045,284061715
2,0,0,0,0,0,0,India,india,Urban,80888766,...,179581,265373,105347,160026,1715050,1110265,604785,243960349,90386338,153574011
3,1,0,0,0,0,0,STATE,jammu_kashmir,Total,2119718,...,44570,26585,6780,19805,108457,56880,51577,8218589,3445572,4773017
4,1,0,0,0,0,0,STATE,jammu_kashmir,Rural,1553433,...,40936,21963,5757,16206,89232,45625,43607,5994979,2562471,3432508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,35,639,0,0,0,0,DISTRICT,north_middle_andaman,Rural,25475,...,119,30,21,9,1047,654,393,65311,23633,41678
2024,35,639,0,0,0,0,DISTRICT,north_middle_andaman,Urban,724,...,1,0,0,0,70,43,27,1707,616,1091
2025,35,640,0,0,0,0,DISTRICT,south_andaman,Total,59064,...,63,48,24,24,1311,933,378,141311,49720,91591
2026,35,640,0,0,0,0,DISTRICT,south_andaman,Rural,23767,...,62,42,20,22,777,550,227,57596,19971,37625


In [6]:
# Build the state id -> state name lookup from the STATE-level rows,
# save it as CSV, and display it.
is_state_row = df['level'] == 'STATE'
state_lookup = (
    df.loc[is_state_row, ['state', 'name']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'state': 'state_id', 'name': 'state_name'})
)
state_lookup.to_csv('state_id_lookup.csv', index=False)
state_lookup

Unnamed: 0,state_id,state_name
0,1,jammu_kashmir
1,2,himachal_pradesh
2,3,punjab
3,4,chandigarh
4,5,uttarakhand
5,6,haryana
6,7,nct_of_delhi
7,8,rajasthan
8,9,uttar_pradesh
9,10,bihar


In [7]:
# Build the district id -> district name lookup from the DISTRICT-level rows,
# save it as CSV, and display it.
is_district_row = df['level'] == 'DISTRICT'
district_lookup = (
    df.loc[is_district_row, ['district', 'name']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'district': 'district_id', 'name': 'district_name'})
)
district_lookup.to_csv('district_id_lookup.csv', index=False)
district_lookup

Unnamed: 0,district_id,district_name
0,1,kupwara
1,2,badgam
2,3,leh_ladakh
3,4,kargil
4,5,punch
...,...,...
635,636,mahe
636,637,karaikal
637,638,nicobars
638,639,north_middle_andaman


In [None]:
# Population and household counts per district, taken from the 'Total' rows
# at DISTRICT level, with the state name attached from state_lookup.
total_district_rows = df[(df['level'] == 'DISTRICT') & (df['tru'] == 'Total')]
renamed_columns = {
    'state': 'state_id',
    'district': 'district_id',
    'name': 'district_name',
    'tot_p': 'population',
    'no_hh': 'households',
}
output_columns = [
    'state_id', 'state_name',
    'district_id', 'district_name',
    'population', 'households',
]
district_pop = (
    total_district_rows[list(renamed_columns)]
    .rename(columns=renamed_columns)
    .merge(state_lookup, on='state_id')
)
district_pop = district_pop[output_columns]
district_pop.to_csv('india_district_pop_2011.csv', index=False)
district_pop