In [68]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px

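The two datasets used in this notebook can be downloaded from the links below; the notebook reads them from the `datasets/` folder, one subfolder per dataset: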

[Indian Census Data with Geospatial Indexing](https://www.kaggle.com/datasets/sirpunch/indian-census-data-with-geospatial-indexing)

[India Census 2011](https://www.kaggle.com/datasets/danofer/india-census)

In [45]:
# District-wise centroid table (State, District, Latitude, Longitude).
centroids_csv = "datasets/Indian Census Data with Geospatial Indexing/district wise centroids.csv"
latlong = pd.read_csv(centroids_csv)
latlong.head()

Unnamed: 0,State,District,Latitude,Longitude
0,Andaman and Nicobar,Andaman Islands,12.382571,92.822911
1,Andaman and Nicobar,Nicobar Islands,7.835291,93.511601
2,Andhra Pradesh,Adilabad,19.284514,78.813212
3,Andhra Pradesh,Anantapur,14.312066,77.460158
4,Andhra Pradesh,Chittoor,13.331093,78.927639


In [46]:
# Number of rows in the centroid table.
len(latlong)

594

In [47]:
# Missing-value count per column (isna is the same check as isnull).
latlong.isna().sum()

State        0
District     0
Latitude     0
Longitude    0
dtype: int64

In [48]:
# How many rows are exact repeats of an earlier row.
repeated_rows = latlong.duplicated()
repeated_rows.sum()

np.int64(0)

In [49]:
# District-level table from the India Census 2011 dataset.
census_csv = "datasets/India Census 2011/india-districts-census-2011.csv"
census = pd.read_csv(census_csv)
census

Unnamed: 0,District code,State name,District name,Population,Male,Female,Literate,Male_Literate,Female_Literate,SC,...,Power_Parity_Rs_90000_150000,Power_Parity_Rs_45000_150000,Power_Parity_Rs_150000_240000,Power_Parity_Rs_240000_330000,Power_Parity_Rs_150000_330000,Power_Parity_Rs_330000_425000,Power_Parity_Rs_425000_545000,Power_Parity_Rs_330000_545000,Power_Parity_Above_Rs_545000,Total_Power_Parity
0,1,JAMMU AND KASHMIR,Kupwara,870354,474190,396164,439654,282823,156831,1048,...,94,588,71,101,172,74,10,84,15,1119
1,2,JAMMU AND KASHMIR,Badgam,753745,398041,355704,335649,207741,127908,368,...,126,562,72,89,161,96,28,124,18,1066
2,3,JAMMU AND KASHMIR,Leh(Ladakh),133487,78971,54516,93770,62834,30936,488,...,46,122,15,22,37,20,14,34,17,242
3,4,JAMMU AND KASHMIR,Kargil,140802,77785,63017,86236,56301,29935,18,...,27,114,12,18,30,19,3,22,7,214
4,5,JAMMU AND KASHMIR,Punch,476835,251899,224936,261724,163333,98391,556,...,78,346,35,50,85,59,8,67,12,629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
635,636,PONDICHERRY,Mahe,41816,19143,22673,36470,16610,19860,144,...,2316,4309,1370,838,2208,576,978,1554,1446,10027
636,637,PONDICHERRY,Karaikal,200222,97809,102413,154916,79903,75013,35348,...,1063,2408,665,340,1005,246,483,729,341,4890
637,638,ANDAMAN AND NICOBAR ISLANDS,Nicobars,36842,20727,16115,25332,15397,9935,0,...,685,1895,212,134,346,70,120,190,84,3151
638,639,ANDAMAN AND NICOBAR ISLANDS,North AND Middle Andaman,105597,54861,50736,78683,43186,35497,0,...,685,1895,212,134,346,70,120,190,84,3151


In [50]:
# Dump every census column name to a text file, one name per line, so the
# full list can be inspected outside the (truncated) notebook display.
col_list = census.columns

with open('census_columns.txt', 'w') as out:
    out.writelines(f"{name}\n" for name in col_list)

In [51]:
# Keep only the census fields used in the rest of the notebook.
# ('Housholds_with_Electric_Lighting' is spelled this way in the dataset header.)
cols = [
    'District code',
    'District name',
    'Population',
    'Male',
    'Female',
    'Literate',
    'Households_with_Internet',
    'Housholds_with_Electric_Lighting',
]
census = census.loc[:, cols]

In [52]:
# Attach the centroid coordinates to the census figures.
# Inner join (the merge default): a district is kept only when its name
# appears in both tables.
# NOTE(review): the join key is the district name alone, so districts that
# share a name across different states would be cross-matched — TODO confirm
# the names are unique, or join on state as well.
joined = pd.merge(latlong, census, left_on='District', right_on='District name')
merged_df = joined.drop(columns='District name')
merged_df

Unnamed: 0,State,District,Latitude,Longitude,District code,Population,Male,Female,Literate,Households_with_Internet,Housholds_with_Electric_Lighting
0,Andhra Pradesh,Adilabad,19.284514,78.813212,532,2741239,1369597,1371642,1483347,5512,566108
1,Andhra Pradesh,Anantapur,14.312066,77.460158,553,4081148,2064495,2016653,2310960,8606,897970
2,Andhra Pradesh,Chittoor,13.331093,78.927639,554,4174064,2090204,2083860,2667878,13763,944555
3,Andhra Pradesh,East Godavari,16.782718,82.243207,545,5154296,2569688,2584608,3288577,22411,1297273
4,Andhra Pradesh,Guntur,15.884926,80.586576,548,4887813,2440521,2447292,2960441,19383,1186028
...,...,...,...,...,...,...,...,...,...,...,...
512,West Bengal,Maldah,25.080268,88.171917,332,3988845,2051541,1937304,2086432,5357,297716
513,West Bengal,Murshidabad,24.259507,88.168169,333,7103807,3627564,3476243,4055834,7951,541169
514,West Bengal,Nadia,23.564110,88.582930,336,5167600,2653768,2513832,3480555,12787,623729
515,West Bengal,Puruliya,23.254192,86.396853,340,2930115,1496996,1433119,1624905,3551,189889


In [53]:
# Derived indicators, both expressed per hundred:
#   sex_ratio     -> females per 100 males
#   literacy_rate -> literate persons as a percentage of the total population
merged_df['sex_ratio'] = merged_df['Female'].div(merged_df['Male']).mul(100)
merged_df['literacy_rate'] = merged_df['Literate'].div(merged_df['Population']).mul(100)
merged_df.head()

Unnamed: 0,State,District,Latitude,Longitude,District code,Population,Male,Female,Literate,Households_with_Internet,Housholds_with_Electric_Lighting,sex_ratio,literacy_rate
0,Andhra Pradesh,Adilabad,19.284514,78.813212,532,2741239,1369597,1371642,1483347,5512,566108,100.149314,54.112283
1,Andhra Pradesh,Anantapur,14.312066,77.460158,553,4081148,2064495,2016653,2310960,8606,897970,97.682629,56.625244
2,Andhra Pradesh,Chittoor,13.331093,78.927639,554,4174064,2090204,2083860,2667878,13763,944555,99.696489,63.915599
3,Andhra Pradesh,East Godavari,16.782718,82.243207,545,5154296,2569688,2584608,3288577,22411,1297273,100.580615,63.802642
4,Andhra Pradesh,Guntur,15.884926,80.586576,548,4887813,2440521,2447292,2960441,19383,1186028,100.277441,60.567804


In [54]:
# The Male / Female / Literate counts are not used after the sex_ratio and
# literacy_rate columns have been computed, so remove them.
# NOTE: this cell is not re-runnable on its own — a second run raises
# KeyError because the columns are already gone.
raw_count_cols = ['Male', 'Female', 'Literate']
merged_df = merged_df.drop(raw_count_cols, axis='columns')
merged_df.head()

Unnamed: 0,State,District,Latitude,Longitude,District code,Population,Households_with_Internet,Housholds_with_Electric_Lighting,sex_ratio,literacy_rate
0,Andhra Pradesh,Adilabad,19.284514,78.813212,532,2741239,5512,566108,100.149314,54.112283
1,Andhra Pradesh,Anantapur,14.312066,77.460158,553,4081148,8606,897970,97.682629,56.625244
2,Andhra Pradesh,Chittoor,13.331093,78.927639,554,4174064,13763,944555,99.696489,63.915599
3,Andhra Pradesh,East Godavari,16.782718,82.243207,545,5154296,22411,1297273,100.580615,63.802642
4,Andhra Pradesh,Guntur,15.884926,80.586576,548,4887813,19383,1186028,100.277441,60.567804


In [57]:
# export the current merged_df to a csv
# to_csv does not create missing parent directories, and the output folder is
# not one of the two input dataset folders this notebook reads from, so make
# sure it exists before writing (no-op when it is already there).
clean_csv_path = Path('datasets/India Census 2011 Clean/india_census_2011_clean.csv')
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)
# NOTE(review): the default index=True also writes the row index as an unnamed
# first column — confirm whether readers of this file expect that column.
merged_df.to_csv(clean_csv_path)

In [59]:
# 'India Overall' first, then every distinct State value in merged_df in
# order of first appearance.
states_list = ['India Overall', *merged_df['State'].unique()]
states_list

['India Overall',
 'Andhra Pradesh',
 'Arunachal Pradesh',
 'Assam',
 'Bihar',
 'Chandigarh',
 'Chhattisgarh',
 'Daman and Diu',
 'Goa',
 'Gujarat',
 'Haryana',
 'Himachal Pradesh',
 'Jammu and Kashmir',
 'Jharkhand',
 'Karnataka',
 'Kerala',
 'Madhya Pradesh',
 'Maharashtra',
 'Manipur',
 'Meghalaya',
 'Mizoram',
 'Nagaland',
 'Orissa',
 'Puducherry',
 'Punjab',
 'Rajasthan',
 'Sikkim',
 'Tamil Nadu',
 'Tripura',
 'Uttar Pradesh',
 'Uttaranchal',
 'West Bengal']

In [67]:
# Columns from position 5 onward (everything after State, District, Latitude,
# Longitude and District code), in sorted order. Relies on the current column
# order of merged_df.
sorted(merged_df.columns[5:].tolist())

['Households_with_Internet',
 'Housholds_with_Electric_Lighting',
 'Population',
 'literacy_rate',
 'sex_ratio']

In [74]:
# Plot one marker per row of merged_df at its Latitude / Longitude on a
# tiled base map.
map_options = dict(
    lat='Latitude',
    lon='Longitude',
    zoom=3,
    map_style="carto-positron",
)
fig = px.scatter_map(merged_df, **map_options)
fig.show()