## Extract

In [20]:
import warnings
warnings.filterwarnings('ignore')

# Dependencies
import pandas as pd
import requests
from census import Census
from citipy import citipy
import gmaps
import os
import pymongo

# Census API Key
from config import (census_api_key, g_key)
c = Census(census_api_key, year=2019)

In [6]:
# Run Census Search to retrieve data on all zip codes (2013 ACS5 Census)
# See: https://github.com/datamade/census for library documentation
# See: https://gist.github.com/afhaque/60558290d6efd892351c4b64e5c01e9b for labels
census_data = c.acs5.get(("NAME", "B19013_001E", "B01002_001E", "B25077_001E","B01003_001E","B15003_022E","B15003_023E"), {'for': 'zip code tabulation area:*'})

# Convert to DataFrame
census_pd = pd.DataFrame(census_data)

# Column Reordering
census_pd = census_pd.rename(columns={"B19013_001E": "Median Household Income",
                                      "B01002_001E": "Median Age",
                                      "B25077_001E": "Median Home Value",
                                      "B01003_001E": "Population",
                                      "B15003_022E": "Number of People who hold a Bachelors",
                                      "B15003_023E": "Number of People who hold a Masters",
                                      "NAME": "Name", "zip code tabulation area": "Zipcode"})

# Add in Bachelors Rate (Number of People who hold a Bachelors / Population)
census_pd["Bachelors Rate"] = 100 * \
    census_pd["Number of People who hold a Bachelors"].astype(
        int) / census_pd["Population"].astype(int)

# Add in Masters Rate (Number of People who hold a Masters / Population)
census_pd["Masters Rate"] = 100 * \
    census_pd["Number of People who hold a Masters"].astype(
        int) / census_pd["Population"].astype(int)


# Final DataFrame
census_pd = census_pd[["Zipcode", "Median Household Income", "Median Age", "Median Home Value","Population",
                       "Bachelors Rate","Masters Rate"]]

# Visualize
print(len(census_pd))
# census_pd.to_csv("Resources/census_data.csv", encoding="utf-8", index=False)
census_pd.head()

33120


Unnamed: 0,Zipcode,Median Household Income,Median Age,Median Home Value,Population,Bachelors Rate,Masters Rate
0,601,14361.0,41.9,83900.0,17113.0,10.10343,0.905744
1,602,16807.0,42.9,85300.0,37751.0,10.598395,2.977405
2,603,16049.0,42.1,118400.0,47081.0,11.811559,3.355919
3,606,12119.0,44.3,80800.0,6392.0,3.613892,1.814768
4,610,19898.0,42.7,87600.0,26686.0,10.488646,2.431987


In [7]:
# Add columns for city, latitude and longitude
census_pd["City"] = ""
census_pd["State"] = ""
census_pd["Lat"] = ""
census_pd["Lng"] = ""
census_pd.head()

Unnamed: 0,Zipcode,Median Household Income,Median Age,Median Home Value,Population,Bachelors Rate,Masters Rate,City,State,Lat,Lng
0,601,14361.0,41.9,83900.0,17113.0,10.10343,0.905744,,,,
1,602,16807.0,42.9,85300.0,37751.0,10.598395,2.977405,,,,
2,603,16049.0,42.1,118400.0,47081.0,11.811559,3.355919,,,,
3,606,12119.0,44.3,80800.0,6392.0,3.613892,1.814768,,,,
4,610,19898.0,42.7,87600.0,26686.0,10.488646,2.431987,,,,


In [12]:
# filtering census data to get the Portland zipcodes
portland_census_pd = census_pd.loc[(census_pd["Zipcode"]>="97080") & (census_pd["Zipcode"]<="97267")]
portland_census_pd

Unnamed: 0,Zipcode,Median Household Income,Median Age,Median Home Value,Population,Bachelors Rate,Masters Rate,City,State,Lat,Lng
31924,97080,75431.0,39.1,336900.0,44335.0,12.626593,5.171986,,,,
31925,97086,94031.0,39.8,468300.0,30858.0,19.602696,8.451617,,,,
31926,97089,96744.0,42.4,433000.0,13765.0,13.745005,5.521250,,,,
31927,97101,76397.0,40.8,354300.0,4278.0,8.976157,4.534829,,,,
31928,97102,122708.0,65.4,638500.0,318.0,29.245283,23.584906,,,,
...,...,...,...,...,...,...,...,...,...,...,...
31997,97233,42452.0,32.6,252700.0,40477.0,5.998468,1.531734,,,,
31998,97236,50663.0,35.4,282600.0,40892.0,7.830382,3.318498,,,,
31999,97239,93459.0,39.5,593500.0,18190.0,29.059923,16.234195,,,,
32000,97266,53984.0,36.9,274700.0,35727.0,12.256837,3.761861,,,,


In [14]:
# create a params dict that will be updated with new city each iteration
params = {"key": g_key}
base_url = "https://maps.googleapis.com/maps/api/geocode/json"

count = 0
# Loop through the filtered_small_census_pd and run a lat/long search for each city
for index, row in portland_census_pd.iterrows():
    
#     if count == 15:
#         break

    zipcode = row['Zipcode']

    # update address key value
    params['address'] = f"{zipcode}"

    # make request
    cities_lat_lng = requests.get(base_url, params=params)
    
#     convert to json
    cities_lat_lng = cities_lat_lng.json()
    
    try:
        portland_census_pd.loc[index, "Lat"] = cities_lat_lng["results"][0]["geometry"]["location"]["lat"]
        portland_census_pd.loc[index, "Lng"] = cities_lat_lng["results"][0]["geometry"]["location"]["lng"]
        portland_census_pd.loc[index, "City"] = cities_lat_lng["results"][0]["address_components"][1]["long_name"]
        portland_census_pd.loc[index, "State"] = cities_lat_lng["results"][0]["address_components"][3]["short_name"]
#         print(f"count {count}")
    except (KeyError, IndexError):
        print("Missing field/result... skipping.")
    
    count = count + 1

# Print to csv
# portland_census_pd.to_csv("portland_census_data.csv", encoding="utf-8", index=False)
    
# Visualize to confirm lat lng city appear
portland_census_pd

Unnamed: 0,Zipcode,Median Household Income,Median Age,Median Home Value,Population,Bachelors Rate,Masters Rate,City,State,Lat,Lng
31924,97080,75431.0,39.1,336900.0,44335.0,12.626593,5.171986,Gresham,OR,45.4842,-122.385
31925,97086,94031.0,39.8,468300.0,30858.0,19.602696,8.451617,Portland,OR,45.4433,-122.513
31926,97089,96744.0,42.4,433000.0,13765.0,13.745005,5.521250,Boring,OR,45.4216,-122.45
31927,97101,76397.0,40.8,354300.0,4278.0,8.976157,4.534829,Amity,US,45.0956,-123.169
31928,97102,122708.0,65.4,638500.0,318.0,29.245283,23.584906,Arch Cape,US,45.8253,-123.934
...,...,...,...,...,...,...,...,...,...,...,...
31997,97233,42452.0,32.6,252700.0,40477.0,5.998468,1.531734,Portland,OR,45.5144,-122.505
31998,97236,50663.0,35.4,282600.0,40892.0,7.830382,3.318498,Portland,OR,45.4797,-122.515
31999,97239,93459.0,39.5,593500.0,18190.0,29.059923,16.234195,Southwest Portland,Multnomah County,45.4874,-122.688
32000,97266,53984.0,36.9,274700.0,35727.0,12.256837,3.761861,Portland,OR,45.4864,-122.559


## Transform

In [16]:
# Find all State values that do not equal "OR"
portland_census_pd[(portland_census_pd['State'] != 'OR')]

Unnamed: 0,Zipcode,Median Household Income,Median Age,Median Home Value,Population,Bachelors Rate,Masters Rate,City,State,Lat,Lng
31927,97101,76397.0,40.8,354300.0,4278.0,8.976157,4.534829,Amity,US,45.0956,-123.169
31928,97102,122708.0,65.4,638500.0,318.0,29.245283,23.584906,Arch Cape,US,45.8253,-123.934
31932,97108,46500.0,36.8,297400.0,640.0,7.5,0.0,Beaver,US,45.2393,-123.688
31933,97109,85104.0,45.5,448300.0,538.0,11.152416,1.115242,Buxton,US,45.7491,-123.201
31941,97117,79615.0,50.5,419700.0,719.0,18.21975,0.0,Gales Creek,US,45.6422,-123.3
31943,97119,94904.0,49.1,387400.0,4527.0,11.928429,4.992269,Gaston,US,45.4902,-123.248
31945,97122,41379.0,55.7,251600.0,396.0,7.828283,0.0,Hebo,US,45.1562,-123.779
31946,97123,81774.0,36.1,338600.0,49317.0,13.968814,4.696149,Hillsboro,US,45.4054,-122.991
31952,97131,52552.0,54.2,258400.0,2751.0,10.577972,2.835333,Nehalem,US,45.7279,-123.747
31953,97132,71983.0,35.8,354700.0,29732.0,14.600431,5.936365,Newberg,US,45.3296,-122.965


In [22]:
# Change all State values to OR
portland_census_pd["State"].replace({"US": "OR", "Multnomah County": "OR"}, inplace=True)
portland_census_pd.head()

Unnamed: 0,Zipcode,Median Household Income,Median Age,Median Home Value,Population,Bachelors Rate,Masters Rate,City,State,Lat,Lng
31924,97080,75431.0,39.1,336900.0,44335.0,12.626593,5.171986,Gresham,OR,45.4842,-122.385
31925,97086,94031.0,39.8,468300.0,30858.0,19.602696,8.451617,Portland,OR,45.4433,-122.513
31926,97089,96744.0,42.4,433000.0,13765.0,13.745005,5.52125,Boring,OR,45.4216,-122.45
31927,97101,76397.0,40.8,354300.0,4278.0,8.976157,4.534829,Amity,OR,45.0956,-123.169
31928,97102,122708.0,65.4,638500.0,318.0,29.245283,23.584906,Arch Cape,OR,45.8253,-123.934


## Load

In [21]:
# The default port used by MongoDB is 27017
# https://docs.mongodb.com/manual/reference/default-mongodb-port/
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

# create mongo database
db = client.portland_census_db

# Declare the collection
facts = db.portland_census_db

In [24]:
# Loop through dataframe and load to Mongo Database
for index,row in portland_census_pd.iterrows():
    post = {"Zipcode": portland_census_pd.loc[index,"Zipcode"],
            "Median Household Income": portland_census_pd.loc[index,"Median Household Income"],
            "Median Age": portland_census_pd.loc[index,"Median Age"],
            "Median Home Value": portland_census_pd.loc[index,"Median Home Value"],
            "Population": portland_census_pd.loc[index,"Population"],
            "Bachelors Rate": portland_census_pd.loc[index,"Bachelors Rate"],
            "Masters Rate": portland_census_pd.loc[index,"Masters Rate"],
            "City": portland_census_pd.loc[index,"City"],
            "State": portland_census_pd.loc[index,"State"],
            "Lat": portland_census_pd.loc[index,"Lat"],
            "Lng": portland_census_pd.loc[index,"Lng"],
           }
    facts.insert_one(post)