# Project: API and Web Data Scraping

In [1]:
import quandl # package used to communicate with Quandl API
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [2]:
# Listing page the zip codes are scraped from. To target a different area,
# change the 'st' and 'county' query parameters at the end of the address.
url = (
    'https://www.bestplaces.net/find/zip.aspx'
    '?st=fl&county=12086'
)

In [3]:
# Download the zip-code listing page. The timeout keeps the notebook from
# hanging on an unresponsive server, and raise_for_status() stops on an HTTP
# error instead of handing an error page to the parser.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content

In [4]:
# parse the downloaded page with the lxml parser
soup = BeautifulSoup(html, features='lxml')

In [5]:
# collect the text of every <u> element whose text starts with '3'; each kept
# entry holds a zip code together with its neighborhood name
chowder = []
for tag in soup.find_all('u'):
    label = tag.text
    if label.startswith('3'):
        chowder.append(label)

In [6]:
# split zips from neighborhoods and store into separate lists
# Both patterns are raw strings so that regex escapes such as \d and \( are
# not read as (invalid) Python string escapes.
zip_pattern = r'(\d*) '              # a run of digits followed by a space
neighborhood_pattern = r'\((.+?)\)'  # the text between a pair of parentheses
# one list of matches per scraped label
zipCodes = [re.findall(zip_pattern, label) for label in chowder]
neighborhoods = [re.findall(neighborhood_pattern, label) for label in chowder]

In [7]:
# flatten lists
def flatten(l):
    '''merges a list of lists into one list, keeping only items longer than one character'''
    kept = []
    for sublist in l:
        for item in sublist:
            if len(item) > 1:
                kept.append(item)
    return kept
# Collapse both per-label match lists into flat lists of strings.
# NOTE(review): the two lists are filtered independently, so they only stay
# aligned if every label yields exactly one zip code and one neighborhood.
zipCodes, neighborhoods = flatten(zipCodes), flatten(neighborhoods)

In [24]:
# lookup table from zip code to its neighborhood name, kept for later reference
ZIP_HOOD = {code: hood for code, hood in zip(zipCodes, neighborhoods)}

In [9]:
# root of the population pages; each page address is built as
# <base_url><neighborhood>/<zip code>
base_url = 'https://www.bestplaces.net/people/zip-code/florida/'
url_list = []
for hood, code in zip(neighborhoods, zipCodes):
    url_list.append('{}{}/{}'.format(base_url, hood, code))

In [10]:
# function scrapes a link and outputs any tables from html
def tableScrapeAndShape(link):
    '''takes a url and scrapes the first html table on the page into a dataframe

    link: url of the page to scrape.

    returns a dataframe built from the first table found on the page: the
    table's first row is used as the header, the 'PEOPLE' column becomes the
    index, the header row is dropped, and the result is transposed.

    raises requests.HTTPError when the server answers with an error status;
    pandas raises ValueError when no table can be parsed from the page.
    '''
    from io import StringIO  # local import keeps the function self-contained

    # time out rather than hang a pool worker, and fail loudly on 4xx/5xx
    response = requests.get(link, timeout=30)
    response.raise_for_status()
    dataSoup = BeautifulSoup(response.content, 'lxml').find_all('table')
    # wrap the markup in a file-like object: passing literal html text to
    # read_html is deprecated in recent pandas releases
    table = pd.read_html(StringIO(str(dataSoup)))[0]
    # the first row carries the real column labels
    table.columns = table.iloc[0]
    table.set_index('PEOPLE', inplace=True)
    # the header row is now labelled 'PEOPLE' in the index; remove it
    table = table.reindex(table.index.drop('PEOPLE'))
    table.index.name = None
    table = table.transpose()
    table.index.name = None
    return table

In [13]:
# generates table containing general information on miami-dade county neighborhoods
import multiprocessing

# The context manager terminates the pool even when a worker raises, so no
# worker processes are left behind after a failed scrape.
with multiprocessing.Pool() as pool:
    scraped_tables = pool.map(tableScrapeAndShape, url_list)
# sort=True keeps the column labels sorted, as in the output recorded below,
# and states the choice explicitly instead of relying on a default that pandas
# warned would change.
bigTable = pd.concat(scraped_tables, sort=True)


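FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  This is separate from the ipykernel package so we can avoid doing imports until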


In [15]:
# The United States reference row is scraped along with every page, so the
# concatenated table holds many copies of it. Keep only the first occurrence
# of each set of identical rows.
bigTable = bigTable.drop_duplicates(keep='first')

In [16]:
# examine table

bigTable.head()

Unnamed: 0,Asian,Black,Divorced,FAMILY,Family Households,Female Population,"Hawaiian, Pacific Islander",Hispanic,Household Size,Households,...,Population - 2000,Population - 2010,RACE,Single Population,"Single, no children","Single, w/children",Two or More Races,Water Area,White,Widowed
"Aventura, Florida",0.81%,3.56%,15.38%,,7684,51.88%,0.00%,36.36%,2.13,14872,...,20799,30840,,49.50%,10.27%,8.04%,1.10%,1,57.95%,8.66%
United States,5.29%,12.29%,10.85%,,78298703,50.77%,0.16%,17.60%,2.63,118825921,...,285036114,308745538,,49.80%,10.27%,16.35%,2.32%,264837,61.46%,5.83%
"Coral Gables, Florida",4.23%,4.65%,3.93%,,2800,50.47%,0.05%,46.78%,2.78,4018,...,13210,14995,,60.44%,6.29%,6.64%,1.71%,0,42.29%,3.69%
"Coral Gables, Florida",1.17%,0.30%,12.97%,,9930,52.95%,0.04%,79.23%,2.61,16251,...,34045,37456,,49.84%,14.86%,11.82%,0.75%,0,18.40%,6.96%
"Coral Terrace, Florida",1.16%,0.99%,14.57%,,10646,53.06%,0.00%,83.84%,3.22,14260,...,44142,43788,,51.61%,17.46%,13.48%,0.84%,1,13.08%,7.25%


In [22]:
# count the null entries in each column
null_cols = bigTable.isna().sum()

# columns where more than a quarter of the rows are null are marked for removal
threshold = 0.25 * len(bigTable)
mostly_null = null_cols[null_cols > threshold]
drop_cols = mostly_null.index.tolist()

In [23]:
# remove the columns listed in drop_cols, then preview the first ten rows
bigTable = bigTable.drop(columns=drop_cols)
bigTable.head(10)

Unnamed: 0,Asian,Black,Divorced,Family Households,Female Population,"Hawaiian, Pacific Islander",Hispanic,Household Size,Households,Land Area,...,Population - 1990,Population - 2000,Population - 2010,Single Population,"Single, no children","Single, w/children",Two or More Races,Water Area,White,Widowed
"Aventura, Florida",0.81%,3.56%,15.38%,7684,51.88%,0.00%,36.36%,2.13,14872,3,...,15498,20799,30840,49.50%,10.27%,8.04%,1.10%,1,57.95%,8.66%
United States,5.29%,12.29%,10.85%,78298703,50.77%,0.16%,17.60%,2.63,118825921,3531905,...,251960433,285036114,308745538,49.80%,10.27%,16.35%,2.32%,264837,61.46%,5.83%
"Coral Gables, Florida",4.23%,4.65%,3.93%,2800,50.47%,0.05%,46.78%,2.78,4018,3,...,14704,13210,14995,60.44%,6.29%,6.64%,1.71%,0,42.29%,3.69%
"Coral Gables, Florida",1.17%,0.30%,12.97%,9930,52.95%,0.04%,79.23%,2.61,16251,5,...,33349,34045,37456,49.84%,14.86%,11.82%,0.75%,0,18.40%,6.96%
"Coral Terrace, Florida",1.16%,0.99%,14.57%,10646,53.06%,0.00%,83.84%,3.22,14260,7,...,43251,44142,43788,51.61%,17.46%,13.48%,0.84%,1,13.08%,7.25%
"Country Club, Florida",1.00%,6.42%,12.36%,15547,51.33%,0.00%,84.45%,3.14,21053,6,...,30229,49279,63544,50.34%,14.26%,21.29%,0.13%,1,7.72%,3.95%
"Cutler Bay, Florida",1.37%,18.66%,15.97%,5732,48.35%,0.00%,60.55%,3.5,7763,6,...,16777,20280,23828,55.14%,13.42%,20.45%,0.56%,0,18.57%,3.37%
"Cutler Bay, Florida",2.83%,19.23%,14.41%,3817,51.34%,0.12%,58.65%,3.5,4478,2,...,1919,4820,11593,52.07%,14.57%,28.24%,1.91%,0,14.82%,2.75%
"Doral, Florida",2.80%,1.83%,11.47%,12197,52.88%,0.00%,83.30%,3.46,14287,56,...,5013,15272,39489,43.74%,9.52%,16.98%,0.10%,8,11.96%,3.04%
"Florida City, Florida",0.00%,31.80%,13.61%,3905,46.29%,0.00%,54.35%,4.0,4812,280,...,12141,15402,18613,64.79%,13.39%,35.42%,0.56%,24,13.05%,3.87%


In [37]:
# strip the ', Florida' suffix from the row labels so they can be matched
# against bare neighborhood names later
bigTable = bigTable.rename(lambda label: label.replace(', Florida', ''), axis='index')
bigTable.head(5)

Unnamed: 0,Asian,Black,Divorced,Family Households,Female Population,"Hawaiian, Pacific Islander",Hispanic,Household Size,Households,Land Area,...,Population - 1990,Population - 2000,Population - 2010,Single Population,"Single, no children","Single, w/children",Two or More Races,Water Area,White,Widowed
Aventura,0.81%,3.56%,15.38%,7684,51.88%,0.00%,36.36%,2.13,14872,3,...,15498,20799,30840,49.50%,10.27%,8.04%,1.10%,1,57.95%,8.66%
United States,5.29%,12.29%,10.85%,78298703,50.77%,0.16%,17.60%,2.63,118825921,3531905,...,251960433,285036114,308745538,49.80%,10.27%,16.35%,2.32%,264837,61.46%,5.83%
Coral Gables,4.23%,4.65%,3.93%,2800,50.47%,0.05%,46.78%,2.78,4018,3,...,14704,13210,14995,60.44%,6.29%,6.64%,1.71%,0,42.29%,3.69%
Coral Gables,1.17%,0.30%,12.97%,9930,52.95%,0.04%,79.23%,2.61,16251,5,...,33349,34045,37456,49.84%,14.86%,11.82%,0.75%,0,18.40%,6.96%
Coral Terrace,1.16%,0.99%,14.57%,10646,53.06%,0.00%,83.84%,3.22,14260,7,...,43251,44142,43788,51.61%,17.46%,13.48%,0.84%,1,13.08%,7.25%


## Zillow Home Value Index for Miami-Dade County
### Price of home to rent ratio

In [40]:
# helper that fetches the price-to-rent ratio series for a single zip code

def pprZipCode(zipCode):
    '''uses zip code to obtain price to rent ratio using quandl and zillow API

    zipCode: zip code as a string; it is spliced into the quandl dataset code
    'ZILLOW/Z<zipCode>_PRRAH'.

    returns the transposed annual series as a dataframe whose 'Value' column
    has been renamed to the neighborhood stored for the zip code in ZIP_HOOD
    (presumably a single row per zip code -- confirm against the quandl
    response), or None when the series could not be retrieved or reshaped.
    '''
    import logging  # local import: only used on the failure path

    try:
        temp = quandl.get('ZILLOW/Z' + zipCode + '_PRRAH', collapse='annual', order='desc')
        temp = temp.rename(columns={'Value': ZIP_HOOD.get(zipCode)})
        temp.index.name = None
        temp = temp.transpose()
        temp.index.name = None
        return temp
    except Exception as error:
        # Best effort: a zip code without usable data is skipped, but the
        # failure is reported instead of vanishing. Catching Exception rather
        # than everything lets KeyboardInterrupt/SystemExit through.
        logging.getLogger(__name__).warning(
            'no price to rent data for zip code %s: %s', zipCode, error)
        return None

In [74]:
# access Quandl api for price of home to rent ratios for each zip code
# The context manager terminates the pool even when a worker raises.
with multiprocessing.Pool() as pool:
    ratio_tables = pool.map(pprZipCode, zipCodes)
# pprZipCode returns None for zip codes it could not fetch; pd.concat drops
# None entries. sort=True keeps the date columns in ascending order, as in the
# output recorded below, instead of relying on a changing pandas default.
pprTable = pd.concat(ratio_tables, sort=True)

In [75]:
pprTable.head()

Unnamed: 0,2010-12-31 00:00:00,2011-12-31 00:00:00,2012-12-31 00:00:00,2013-12-31 00:00:00,2014-12-31 00:00:00,2015-12-31 00:00:00,2016-12-31 00:00:00,2017-12-31 00:00:00,2018-12-31 00:00:00,2019-12-31 00:00:00
Aventura,11.47,11.87,12.55,13.24,13.23,12.62,12.87,12.67,12.27,7.36
Coral Gables,12.86,13.5,14.02,15.03,15.29,14.69,15.12,15.18,15.52,2.82
Coral Gables,12.34,12.16,12.93,14.21,14.23,13.62,14.19,14.64,14.92,4.88
Coral Terrace,9.78,8.99,9.79,10.56,11.05,11.93,12.83,13.46,13.92,7.35
Country Club,6.93,6.82,7.33,7.87,9.08,9.44,10.08,11.21,11.46,10.41


In [76]:
# reduce each timestamp column label to its year
colnames = list(pprTable.columns.year)

In [77]:
# relabel the columns with the plain year numbers computed in colnames
pprTable.columns = colnames
# alternative kept for reference: nest the years under a single 'Year' level
#pprTable.columns = pd.MultiIndex.from_product([['Year'], colnames])

In [78]:
pprTable.head()

Unnamed: 0,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Aventura,11.47,11.87,12.55,13.24,13.23,12.62,12.87,12.67,12.27,7.36
Coral Gables,12.86,13.5,14.02,15.03,15.29,14.69,15.12,15.18,15.52,2.82
Coral Gables,12.34,12.16,12.93,14.21,14.23,13.62,14.19,14.64,14.92,4.88
Coral Terrace,9.78,8.99,9.79,10.56,11.05,11.93,12.83,13.46,13.92,7.35
Country Club,6.93,6.82,7.33,7.87,9.08,9.44,10.08,11.21,11.46,10.41


In [109]:
# Attach the price-to-rent columns to the population table by matching row
# labels (neighborhood names). A label that occurs more than once on both
# sides yields one row per pairing, which is why repeated rows show up in the
# preview of the merged table.
results = bigTable.join(pprTable, how='left')

In [110]:
results.head()

Unnamed: 0,Asian,Black,Divorced,Family Households,Female Population,"Hawaiian, Pacific Islander",Hispanic,Household Size,Households,Land Area,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Aventura,0.81%,3.56%,15.38%,7684,51.88%,0.00%,36.36%,2.13,14872,3,...,11.47,11.87,12.55,13.24,13.23,12.62,12.87,12.67,12.27,7.36
Coral Gables,4.23%,4.65%,3.93%,2800,50.47%,0.05%,46.78%,2.78,4018,3,...,12.86,13.5,14.02,15.03,15.29,14.69,15.12,15.18,15.52,2.82
Coral Gables,4.23%,4.65%,3.93%,2800,50.47%,0.05%,46.78%,2.78,4018,3,...,12.34,12.16,12.93,14.21,14.23,13.62,14.19,14.64,14.92,4.88
Coral Gables,1.17%,0.30%,12.97%,9930,52.95%,0.04%,79.23%,2.61,16251,5,...,12.86,13.5,14.02,15.03,15.29,14.69,15.12,15.18,15.52,2.82
Coral Gables,1.17%,0.30%,12.97%,9930,52.95%,0.04%,79.23%,2.61,16251,5,...,12.34,12.16,12.93,14.21,14.23,13.62,14.19,14.64,14.92,4.88


In [111]:
# count the null entries in each column of the merged table
null_cols = results.isna().sum()

In [112]:
# keep only the rows holding at least 10 non-null values, then replace the
# nulls that remain with 0
results = results.dropna(thresh=10).fillna(0)

In [113]:
# Convert every column to floats where possible. Percentages are stored as
# text such as '3.56%', so each column is rendered as text, stripped of any
# trailing '%', and parsed.
for column in results.columns:
    results[column] = results[column].astype('str')
    try:
        results[column] = results[column].str.rstrip('%').astype('float')
    except (ValueError, TypeError):
        # the column holds text that is not a number: leave it as text.
        # Only conversion failures are ignored, so an interrupt or an
        # unrelated bug is no longer silently swallowed.
        continue

In [122]:
#results.dtypes

In [118]:
# Average the rows that share a row label, giving one row per neighborhood.
# numeric_only=True leaves out any column that could not be converted to
# floats; recent pandas versions raise on such columns instead of silently
# dropping them.
resultsFloat = results.groupby(results.index).mean(numeric_only=True)

In [119]:
resultsFloat.head()

Unnamed: 0,Asian,Black,Divorced,Family Households,Female Population,"Hawaiian, Pacific Islander",Hispanic,Household Size,Households,Land Area,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Aventura,0.81,3.56,15.38,7684.0,51.88,0.0,36.36,2.13,14872.0,3.0,...,11.47,11.87,12.55,13.24,13.23,12.62,12.87,12.67,12.27,7.36
Coral Gables,2.7,2.475,8.45,6365.0,51.71,0.045,63.005,2.695,10134.5,4.0,...,12.6,12.83,13.475,14.62,14.76,14.155,14.655,14.91,15.22,3.85
Coral Terrace,1.16,0.99,14.57,10646.0,53.06,0.0,83.84,3.22,14260.0,7.0,...,9.78,8.99,9.79,10.56,11.05,11.93,12.83,13.46,13.92,7.35
Country Club,1.0,6.42,12.36,15547.0,51.33,0.0,84.45,3.14,21053.0,6.0,...,6.93,6.82,7.33,7.87,9.08,9.44,10.08,11.21,11.46,10.41
Cutler Bay,2.1,18.945,15.19,4774.5,49.845,0.06,59.6,3.5,6120.5,4.0,...,6.92,6.76,7.035,8.47,9.355,10.015,10.875,11.505,12.375,7.125


In [121]:
import os

# to_csv does not create missing directories, so make sure the target folder
# exists before writing
os.makedirs('output', exist_ok=True)
resultsFloat.to_csv('output/results.csv', encoding='utf-8')