In [1]:
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd

## Scrape data and transform California demographics
Use requests and BeautifulSoup to fetch and parse the California Demographics web page. Loop through the table rows, collect each ZIP code with its population, and place the data in a pandas DataFrame.

In [2]:
# Fetch the ZIP-code population page and collect [zip, population] pairs.
url = 'https://www.california-demographics.com/zip_codes_by_population'
# The parser name belongs to BeautifulSoup. Passed as the second positional
# argument of requests.get() it would be treated as `params` and sent to the
# server as a query string.
response = requests.get(url, timeout=30)
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
page_soup = soup(response.text, 'html.parser')
population = []
for tr in page_soup.find_all('tr')[1:]:  # [1:] skips the first <tr> on the page
    tds = tr.find_all('td')
    if len(tds) < 3:
        # A row without a third cell has no zip/population pair; skip it.
        continue
    # Keep the first five characters of the ZIP cell; population stays as text
    # (e.g. '108,051').
    population.append([tds[1].text.strip()[:5], tds[2].text.strip()])
population

[['90011', '108,051'],
 ['90650', '106,404'],
 ['91331', '105,696'],
 ['90201', '102,878'],
 ['92335', '99,791'],
 ['90250', '97,371'],
 ['91342', '96,177'],
 ['90805', '96,069'],
 ['90280', '95,420'],
 ['90044', '94,680'],
 ['92503', '94,523'],
 ['92336', '94,327'],
 ['94565', '93,549'],
 ['92683', '91,758'],
 ['92704', '90,525'],
 ['91710', '88,862'],
 ['92804', '88,065'],
 ['95076', '87,781'],
 ['92154', '87,218'],
 ['91744', '86,982'],
 ['92376', '86,937'],
 ['94112', '85,373'],
 ['93307', '84,948'],
 ['93722', '84,481'],
 ['91911', '84,025'],
 ['93033', '83,572'],
 ['92592', '82,551'],
 ['92126', '81,883'],
 ['93727', '81,118'],
 ['92509', '81,093'],
 ['92345', '80,910'],
 ['91335', '79,687'],
 ['94544', '78,717'],
 ['95823', '78,398'],
 ['91709', '78,025'],
 ['90706', '77,852'],
 ['91706', '77,819'],
 ['93257', '76,676'],
 ['91910', '76,250'],
 ['95630', '75,864'],
 ['93550', '75,774'],
 ['90255', '75,636'],
 ['95035', '75,614'],
 ['92553', '74,918'],
 ['94533', '74,833'],
 ['945

In [3]:
# Wrap the scraped [zip, population] pairs in a DataFrame, label the two
# columns, and preview the first rows.
populationdf = pd.DataFrame(data=population)
populationdf.columns = ['zip', 'population']
populationdf.head(n=5)

Unnamed: 0,zip,population
0,90011,108051
1,90650,106404
2,91331,105696
3,90201,102878
4,92335,99791


## Transform Starbucks Data
Load the Starbucks store directory from 'directory.csv', keep the US stores, and count the stores per 5-digit ZIP code in a pandas DataFrame.

In [4]:
# Read Postcode as text so every value supports `.str` slicing; an inferred
# mixed-type column would yield NaN for its non-string entries.
starbucks = pd.read_csv('directory.csv', dtype={'Postcode': str})
starbucks = starbucks[['Brand','City','State/Province', 'Country', 'Postcode']]
starbucks.columns = ['brand', 'city', 'state', 'country', 'zip']
starbucks = starbucks[starbucks['country']=='US']
# Truncate each postcode to its first five characters and count stores per ZIP.
# value_counts() already returns the counts in descending order, so no extra
# sort is needed (the previous sort_values() call discarded its result).
starbucksdf = pd.DataFrame(starbucks['zip'].str.slice(0, 5).value_counts())
starbucksdf = starbucksdf.reset_index(level=0)
starbucksdf.columns = ['zip', 'store_count']
starbucksdf.head()

Unnamed: 0,zip,store_count
0,89109,32
1,98101,24
2,90045,21
3,10019,20
4,92101,20


## Scrape and transform income data
Scrape income data from 'http://www.laalmanac.com'

In [5]:
# Fetch the income page and collect [zip, community, median_income] rows.
url = 'http://www.laalmanac.com/employment/em12c.php'
# The parser name is a BeautifulSoup argument; given to requests.get() as the
# second positional argument it would be sent as query parameters.
response = requests.get(url, timeout=30)
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
page_soup = soup(response.text, 'html.parser')
income = []
for tr in page_soup.find_all('tr')[1:]:  # [1:] skips the first <tr> on the page
    tds = tr.find_all('td')
    if len(tds) < 3:
        # Skip rows that lack the three cells read below, matching the guard
        # used when scraping the population table.
        continue
    income.append([td.text.strip() for td in tds[:3]])

## Merge data
Merge the income and population DataFrames on the ZIP code into 'demographicsdf'.

In [6]:
# Label the scraped income rows, then join them to the population table on the
# shared 'zip' column (pandas' default inner join).
incomedf = pd.DataFrame(data=income)
incomedf.columns = ['zip', 'community', 'median_income']
demographicsdf = pd.merge(incomedf, populationdf, on='zip')

In [7]:
# Inspect the column dtypes of the merged frame. The income and population
# values were scraped as text, so they are not numeric at this point.
demographicsdf.dtypes

zip              object
community        object
median_income    object
population       object
dtype: object

In [8]:
# Preview the first rows of the merged income/population table.
demographicsdf.head()

Unnamed: 0,zip,community,median_income,population
0,90001,"Los Angeles (South Los Angeles), Florence-Graham","$35,660",58738
1,90002,"Los Angeles (Southeast Los Angeles, Watts)","$34,000",52856
2,90003,"Los Angeles (South Los Angeles, Southeast Los ...","$34,397",70490
3,90004,"Los Angeles (Hancock Park, Rampart Village, Vi...","$46,581",62733
4,90005,"Los Angeles (Hancock Park, Koreatown, Wilshire...","$32,461",39562


## Merge Data
Merge demographicsdf and starbucksdf on the ZIP code into 'finaldf'. Only ZIP codes that appear in both DataFrames are kept.

In [9]:
# Attach the per-ZIP store counts to the demographics table. The default inner
# join keeps only ZIP codes present in both frames.
finaldf = pd.merge(demographicsdf, starbucksdf, on='zip')

In [10]:
# Preview the first rows of the combined demographics and store-count table.
finaldf.head()

Unnamed: 0,zip,community,median_income,population,store_count
0,90001,"Los Angeles (South Los Angeles), Florence-Graham","$35,660",58738,2
1,90004,"Los Angeles (Hancock Park, Rampart Village, Vi...","$46,581",62733,1
2,90007,"Los Angeles (Southeast Los Angeles, Univerity ...","$23,070",41221,3
3,90010,"Los Angeles (Hancock Park, Wilshire Center, Wi...","$47,115",3759,3
4,90012,"Los Angeles (Downtown Civic Center, Chinatown,...","$38,786",33783,7
