# Scrape 2011 Indian Census Data

# Population enumeration
[link](http://censusindia.gov.in/pca/pcadata/pca.html)

In [1]:
from bs4 import BeautifulSoup
import os
import requests

In [2]:
def make_soup(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` may block indefinitely.

    Returns
    -------
    BeautifulSoup
        The response content parsed with the ``html.parser`` backend.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on error responses rather than parsing an error page,
    # which would only surface later as a confusing count mismatch.
    page.raise_for_status()
    return BeautifulSoup(page.content, 'html.parser')

In [3]:
# Folder the downloaded spreadsheets are written to. Built from the current
# user's home directory instead of a hard-coded absolute path so the notebook
# can run on another machine.
output_root = os.path.join(os.path.expanduser("~"), "Downloads", "population")
# Directory on the census site that the scraped pages are addressed under.
root_url = "http://censusindia.gov.in/pca/pcadata"
# Top-level page that the province names and links are read from.
top_url = "{}/{}".format(root_url, "pca.html")
n_provinces = 35 # manually counted these

In [4]:
# Fetch and parse the top-level page at top_url; the following cells read the
# province names (<td> cells) and province links (<a href> tags) from it.
top_soup = make_soup(top_url)

In [5]:
# Province names are the text of the <td> cells on the top page. The slice
# drops the first 12 and the last 5 cells.
# NOTE(review): presumably those are header/footer cells -- confirm against
# the live page if the assertion below starts failing.
province_cells = top_soup.findAll("td")
# A list comprehension (rather than slicing the result of map) also works
# where map returns an iterator.
provinces = [cell.get_text() for cell in province_cells][12:-5]
assert len(provinces) == n_provinces, (
    "expected {} provinces, found {}".format(n_provinces, len(provinces)))

In [6]:
# Collect the address of every province page linked from the top page.
# Each href is appended to the directory part of top_url (everything before
# its final "/").
base_url = top_url.rsplit("/", 1)[0]
province_urls = []
for anchor in top_soup.findAll("a", href=True):
    province_urls.append(base_url + "/%s" % anchor['href'])
# show a small sample as a sanity check
print(province_urls[0:4])
assert len(province_urls) == n_provinces

[u'http://censusindia.gov.in/pca/pcadata/Houselisting-housing-Andaman.html', u'http://censusindia.gov.in/pca/pcadata/Houselisting-housing-HARYANA.html', u'http://censusindia.gov.in/pca/pcadata/Houselisting-housing-NAGALAND.html', u'http://censusindia.gov.in/pca/pcadata/Houselisting-housing-AP.html']


In [7]:
# Maps province name -> number of links found on that province's page.
# Filled in by the download loop and summed at the end to make sure we
# download the right number of files.
n_subprovinces = {}

In [9]:
# Download every sub-province spreadsheet, province by province.

# make sure the target folder exists before writing into it
if not os.path.isdir(output_root):
    os.makedirs(output_root)

# loop over provinces (names and URLs are parallel lists)
for province, province_url in zip(provinces, province_urls):
    # load this province's site, e.g. http://censusindia.gov.in/pca/pcadata/Houselisting-housing-Andaman.html
    soup = make_soup(province_url)
    # get links to each subprovince and count them
    subprovince_links = soup.findAll("a", href=True)
    n_subprovinces[province] = len(subprovince_links)

    # get the URL of the target excel sheet
    subprovince_urls = ["{}/{}".format(root_url, x["href"]) for x in subprovince_links]

    # download each excel sheet and write to file, one at a time so that only
    # a single response is held in memory
    for j, url in enumerate(subprovince_urls):
        response = requests.get(url, timeout=60)
        # do not save an HTTP error page under an .xlsx name
        response.raise_for_status()
        address = "{root}/{province}_{index}.xlsx".format(**{
                "root": output_root,
                "province": province.replace(" ", "_"),
                "index": j
                })
        print(address)
        # the context manager closes the file even if the write fails
        with open(address, 'wb') as output:
            output.write(response.content)

/Users/timothysweetser/Downloads/population/Andaman_and_Nicobar_Islands_UT_0.xlsx
/Users/timothysweetser/Downloads/population/Andaman_and_Nicobar_Islands_UT_1.xlsx
/Users/timothysweetser/Downloads/population/Andaman_and_Nicobar_Islands_UT_2.xlsx
/Users/timothysweetser/Downloads/population/Haryana_0.xlsx
/Users/timothysweetser/Downloads/population/Haryana_1.xlsx
/Users/timothysweetser/Downloads/population/Haryana_2.xlsx
/Users/timothysweetser/Downloads/population/Haryana_3.xlsx
/Users/timothysweetser/Downloads/population/Haryana_4.xlsx
/Users/timothysweetser/Downloads/population/Haryana_5.xlsx
/Users/timothysweetser/Downloads/population/Haryana_6.xlsx
/Users/timothysweetser/Downloads/population/Haryana_7.xlsx
/Users/timothysweetser/Downloads/population/Haryana_8.xlsx
/Users/timothysweetser/Downloads/population/Haryana_9.xlsx
/Users/timothysweetser/Downloads/population/Haryana_10.xlsx
/Users/timothysweetser/Downloads/population/Haryana_11.xlsx
/Users/timothysweetser/Downloads/population/

Make sure we downloaded the right number of files

In [19]:
# Count the spreadsheets in the output folder. endswith() only matches the
# file extension, whereas a substring test would also count names that merely
# contain ".xlsx" somewhere.
# NOTE(review): files left over from an earlier run are counted too.
n_files_on_disk = len([x for x in os.listdir(output_root) if x.endswith(".xlsx")])
# Total number of sub-province links seen while scraping; sum() yields 0 for
# an empty dict instead of raising.
n_files_on_web = sum(n_subprovinces.values())
assert n_files_on_disk == n_files_on_web, (
    "{} .xlsx files on disk but {} links found on the web".format(
        n_files_on_disk, n_files_on_web))