In [6]:
import csv
import urllib
import urllib.error
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup

In [7]:
# constants
# Page URL template; the '#####' placeholder is swapped for a zip code before fetching.
baseUrl = 'https://texas.hometownlocator.com/zip-codes/data,zipcode,#####.cfm'

# Labels that are looked up as plain <span> text.
type1Fields = ['Total Population', 'Population Density']

# Labels that are looked up as <span class="indent"> text; each one yields a
# current value and a projected value.
type2Fields = ['Population']

In [9]:
# load the datasets
housing_df = pd.read_csv('data/raw_austin_housing_data_w_crime.csv', low_memory=False)

# get list of zipcodes as digit strings, in order of first appearance.
# Missing zips are dropped before the int conversion: numpy's NaN-to-int cast is
# undefined (it typically yields a huge negative number rather than an error),
# which would otherwise end up in the list as a bogus zip code to scrape.
# The loop variable is named `code` so the builtin `zip` is not shadowed.
zipCodes = [ str(int(code)) for code in housing_df['zip'].dropna().unique() ]

In [10]:
# iterate through zip codes and extract data

def _extract_field(soup, label, hops, strip_char, css_class=None):
    """Return the cleaned text of the node `hops` siblings after a labelled span's parent.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page to search.
    label : str
        Exact text of the <span> that labels the wanted value.
    hops : int
        Number of `find_next_sibling()` steps to take from the span's parent.
    strip_char : str
        Character removed from the extracted text.
    css_class : str, optional
        When given, only spans carrying this class are matched.

    Returns
    -------
    str
        The extracted text with `strip_char` removed, or 'err' when the span
        or any node along the sibling chain is missing.
    """
    find_kwargs = { 'string': label }
    if css_class is not None:
        find_kwargs['attrs'] = { 'class': css_class }

    span = soup.find('span', **find_kwargs)
    if span is None:
        return 'err'

    node = span.find_parent()
    for _ in range(hops):
        if node is None:
            return 'err'
        node = node.find_next_sibling()
    # a missing sibling is reported as 'err' instead of raising AttributeError
    if node is None:
        return 'err'
    return node.get_text().replace(strip_char, '')


# header row: zip, type1 fields, type2 fields (current), type2 fields (projected)
zipStats = [ ['Zip'] + type1Fields + type2Fields + [(field + '_proj') for field in type2Fields] ]
fieldCount = len(zipStats[0]) - 1

for count, zipCode in enumerate(zipCodes, start=1):
    print('\rprocessing {} of {}.'.format(count, len(zipCodes)), end="\r")

    # load the page
    url = baseUrl.replace('#####', zipCode)
    try:
        with urllib.request.urlopen(url) as page:
            soup = BeautifulSoup(page, 'html.parser')
    except urllib.error.URLError:
        # HTTPError is a subclass of URLError, so this also covers 404s etc.
        # One unreachable page should not abort the whole run: record the zip
        # with every field marked 'err', matching the per-field convention.
        zipStats.append([ zipCode ] + ['err'] * fieldCount)
        continue

    zipData = [ zipCode ]
    # parse type1 data fields (value is the next sibling; thousands separators removed)
    zipData += [ _extract_field(soup, field, 1, ',') for field in type1Fields ]
    # parse type2 data fields current (first sibling; percent sign removed)
    zipData += [ _extract_field(soup, field, 1, '%', css_class='indent') for field in type2Fields ]
    # parse type2 data fields projected (second sibling; percent sign removed)
    zipData += [ _extract_field(soup, field, 2, '%', css_class='indent') for field in type2Fields ]

    zipStats.append(zipData)

processing 105 of 105.

In [11]:
# purge the bad zips
# Keep only the zip and its population density. A row is dropped when either the
# density itself or the last scraped field is 'err'; checking only the last field
# let rows with a missing density through with the literal 'err' as their value.
# The density column is located by header name rather than a hard-coded index.
densityCol = zipStats[0].index('Population Density')
zipDataReduced = [
    [ data[0], data[densityCol] ]
    for data in zipStats
    if data[densityCol] != 'err' and data[-1] != 'err'
]

# write to file (the first row is the header)
with open('./data/pop_density.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(zipDataReduced)

In [15]:
# build the population density column
# zip -> scraped density text (the header row is skipped)
popDensities = { data[0]: data[1] for data in zipDataReduced[1:] }

# .copy() makes the filtered frame independent of the original, so adding a
# column below does not trigger pandas' SettingWithCopyWarning.
housing_df = housing_df[housing_df['zip'].notnull()].copy()

# Normalise each zip to the same digit-string form used as dictionary keys.
zipKeys = housing_df['zip'].map(lambda x: str(int(x)))

# Look each zip up once; zips without a scraped density map to NaN. The scraped
# values are text, so convert to numbers (unparseable text also becomes NaN) and
# use -1 as the "no density available" sentinel, giving a purely numeric column
# instead of a mix of strings and the integer -1.
housing_df['Population Density'] = pd.to_numeric(zipKeys.map(popDensities), errors='coerce').fillna(-1)

In [18]:
# write to file
# Saves the housing data, now including the 'Population Density' column.
# NOTE(review): to_csv writes the DataFrame index by default, so the output gets
# an unnamed leading column — confirm downstream readers expect it.
housing_df.to_csv(path_or_buf='./data/raw_austin_housing_data_w_crime_density.csv')