# Scrape 2011 Indian Census Data

# Houselisting
[Example source page (Gujarat)](http://www.censusindia.gov.in/2011census/HLO/HL_PCA/Houselisting-housing-Gujarat.html)

In [1]:
from bs4 import BeautifulSoup
import os
import requests

In [2]:
def make_soup(url, timeout=60):
    """Download ``url`` and parse the response body as HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (passed straight to
        ``requests.get``). Without it a stalled connection blocks forever.

    Returns
    -------
    BeautifulSoup
        The page content parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.exceptions.RequestException
        On connection problems or when ``timeout`` expires.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on an error status rather than parsing an error page as
    # though it were the real document.
    page.raise_for_status()
    return BeautifulSoup(page.content, 'html.parser')

In [3]:
# Directory the downloaded spreadsheets are written to. Built from the current
# user's home directory so the notebook is not tied to one machine.
output_root = os.path.join(os.path.expanduser("~"), "Downloads", "houselisting")
# Create the directory (and any missing parents) only when it is absent, so
# re-running this cell does not raise "File exists".
if not os.path.isdir(output_root):
    os.makedirs(output_root)

# Folder on the census site that holds the houselisting pages and files.
root_url = "http://www.censusindia.gov.in/2011census/HLO/HL_PCA"
# Index page that links to every province's page.
top_url = "{}/{}".format(root_url, "Houselisting-housing-HLPCA.html")
n_provinces = 35 # manually counted these

In [4]:
# fetch and parse the top-level index page; the province names and the links
# to each province's page are both read from this parsed document
top_soup = make_soup(top_url)

In [6]:
# Province names are the text of the <td> cells on the index page, with the
# first 12 and last 5 cells dropped.
# NOTE(review): the slice bounds are hard-coded; presumably they skip
# non-province layout cells -- confirm against the live page.
# Names are kept exactly as scraped (some carry leading/trailing spaces).
provinces = [cell.get_text() for cell in top_soup.findAll("td")][12:-5]
assert len(provinces) == n_provinces
print(provinces)

[u'Andaman and Nicobar Islands UT', u'Haryana', u'Nagaland', u'Andhra Pradesh ', u'Himachal Pradesh', u'Odisha', u'Arunachal Pradesh', u'Jammu & Kashmir', u'Puducherry UT', u'Assam', u'Jharkhand', u' Punjab', u' Bihar', u'Karnataka', u'Rajasthan ', u' Chandigarh UT', u'Kerala', u'Sikkim', u' Chhattisgarh', u'Lakshadweep UT', u'Tamil Nadu', u'Dadra & Nagar Haveli UT', u'Madhya Pradesh ', u'Tripura', u'Daman & Diu UT ', u'Maharashtra ', u'Uttar Pradesh', u'NCT of Delhi', u'Manipur', u'Uttarakhand', u' Goa', u'Meghalaya', u' West Bengal', u'Gujarat', u'Mizoram']


In [7]:
# extract links to each province's site
# The hrefs on the index page are relative, so they are resolved against the
# folder that contains the index page (top_url minus its last path segment).
base_url = top_url.rsplit("/", 1)[0]
province_hrefs = [anchor["href"] for anchor in top_soup.findAll("a", href=True)]
province_urls = [base_url + "/%s" % href for href in province_hrefs]
print(province_urls[0:4])
# every province name found earlier must have exactly one link
assert len(province_urls) == n_provinces

[u'http://www.censusindia.gov.in/2011census/HLO/HL_PCA/Houselisting-housing-Andaman.html', u'http://www.censusindia.gov.in/2011census/HLO/HL_PCA/Houselisting-housing-HARYANA.html', u'http://www.censusindia.gov.in/2011census/HLO/HL_PCA/Houselisting-housing-NAGALAND.html', u'http://www.censusindia.gov.in/2011census/HLO/HL_PCA/Houselisting-housing-AP.html']


In [9]:
# Per-province tally of subprovince links, keyed by province name. It is
# filled in while downloading and used afterwards to confirm that the
# expected number of files was written.
n_subprovinces = dict()

In [10]:
# loop over provinces (names and URLs are parallel lists of equal length)
for province, province_url in zip(provinces, province_urls):
    print(province)
    # load this province's site, e.g. http://censusindia.gov.in/pca/pcadata/Houselisting-housing-Andaman.html
    soup = make_soup(province_url)
    # get links to each subprovince and count them
    subprovince_links = soup.findAll("a", href=True)
    n_subprovinces[province] = len(subprovince_links)

    # download each excel sheet and write it to file straight away, so only
    # one response is held in memory at a time
    for index, link in enumerate(subprovince_links):
        # the hrefs are relative to the houselisting folder on the site
        sheet_url = "{}/{}".format(root_url, link["href"])
        # timeout: do not hang forever on a stalled connection
        response = requests.get(sheet_url, timeout=60)
        # stop on 4xx/5xx rather than saving an error page as a spreadsheet
        response.raise_for_status()

        # file name: <province with spaces as underscores>_<link position>.xlsx
        address = "{root}/{province}_{index}.xlsx".format(
            root=output_root,
            province=province.replace(" ", "_"),
            index=index
        )
        # `with` closes the handle even if the write raises
        with open(address, 'wb') as output:
            output.write(response.content)

Andaman and Nicobar Islands UT
Haryana
Nagaland
Andhra Pradesh 
Himachal Pradesh
Odisha
Arunachal Pradesh
Jammu & Kashmir
Puducherry UT
Assam
Jharkhand
 Punjab
 Bihar
Karnataka
Rajasthan 
 Chandigarh UT
Kerala
Sikkim
 Chhattisgarh
Lakshadweep UT
Tamil Nadu
Dadra & Nagar Haveli UT
Madhya Pradesh 
Tripura
Daman & Diu UT 
Maharashtra 
Uttar Pradesh
NCT of Delhi
Manipur
Uttarakhand
 Goa
Meghalaya
 West Bengal
Gujarat
Mizoram


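Make sure we downloaded the right number of files: the count of `.xlsx` files on disk should equal the total number of subprovince links found on the province pages.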

In [11]:
# number of spreadsheets actually present in the output directory
n_files_on_disk = len([name for name in os.listdir(output_root) if name.endswith(".xlsx")])
# number of subprovince links counted across all province pages
# (sum() also handles an empty dict, giving 0)
n_files_on_web = sum(n_subprovinces.values())
assert n_files_on_disk == n_files_on_web, (
    "found {} .xlsx files on disk but counted {} links on the web".format(
        n_files_on_disk, n_files_on_web))