## Importing dependencies and environmental variables

In [3]:
# Dependencies
import csv as csv
import http.client 
import json
import numpy as np
import os
import pandas as pd
import pprint
import psycopg2
import requests
import sqlalchemy
from bs4 import BeautifulSoup as bs
from dotenv import load_dotenv
from functools import reduce
from selenium import webdriver
from sodapy import Socrata
from splinter import Browser
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, Float, Date

In [4]:
# Load project3.env so the values it defines can be read from the process environment
# (database connection settings for the engine URL, then third-party API credentials).
# NOTE(review): python-dotenv does not override variables that already exist in the
# environment by default, so a pre-set USERNAME/HOST/PORT would win — confirm.
load_dotenv("project3.env")
# Database connection settings
username = os.getenv("USERNAME")
password = os.getenv("PASSWORD")
host = os.getenv("HOST")
port = os.getenv("PORT")
database = os.getenv("DATABASE")
# API credentials
google_key = os.getenv('GOOGLE_API_KEY')
yelp_client_id = os.getenv('YELP_CLIENT_ID')
yelp_key = os.getenv('YELP_API_KEY')
sf_data_key = os.getenv('SFDATAAPPTOKEN')
acs_5yr_key = os.getenv('ACS_5YR_KEY')

In [5]:
# Raise pandas' display limits (rows, columns, line width) for notebook output.
for _option, _limit in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _limit)

## Reading in the CSV files so they can be joined

In [6]:
# Load the three source tables: coffee shops, demographics, housing prices.
coffee_data, demographic_data, housing_data = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/coffee_data/complete_coffee_shop_data.csv",
        "../data/demographic_data/total_zip_code_responses.csv",
        "../data/housing_data/HousingPrice_ZC.csv",
    )
]

## Cleaning demographic data to include pct_change of values 

In [5]:
# Order rows by zip code, then year, and renumber them 0..n-1 so that each row's
# predecessor is the previous year of the same zip code (except at zip boundaries).
demo_by_zip = (
    demographic_data
    .sort_values(by=["zip_code", "year"])
    .reset_index(drop=True)
)

In [6]:
# Year-over-year percent change of each demographic share.
# Each row is compared with the row directly above it; the first row of the frame and
# the first row of every zip code (zip_code differs from the row above) get 0.
# Vectorized replacement for the former row-by-row .loc loop: same values, but no
# per-cell assignments and no NumPy scalar RuntimeWarnings on division by zero.
# NOTE(review): a previous-year value of 0 still produces inf/NaN here, and a later
# fillna(0) would not replace inf — confirm whether such rows exist in the data.
starts_new_zip = demo_by_zip["zip_code"] != demo_by_zip["zip_code"].shift()
for share_col in ("pct_25_34", "pct_college_deg", "pct_wht"):
    previous_value = demo_by_zip[share_col].shift()
    share_pct_change = ((demo_by_zip[share_col] - previous_value) / previous_value) * 100
    demo_by_zip[f"pct_chng_{share_col}"] = share_pct_change.mask(starts_new_zip, 0)

  del sys.path[0]
  app.launch_new_instance()
  del sys.path[0]


In [8]:
# List the columns to confirm the three pct_chng_* columns are present.
demo_by_zip.columns

Index(['with_advanced_deg', 'zip_code', 'total_pop', 'total_24_29_female', 'total_30_34_female', 'total_24_29_male', 'total_30_34_male', 'total_24_29_female_wht', 'total_30_34_female_wht', 'total_24_29_male_wht', 'total_30_34_male_wht', 'total_wht', 'with_bachelors', 'year', 'pct_25_34', 'pct_college_deg', 'pct_wht', 'pct_chng_pct_25_34', 'pct_chng_pct_college_deg', 'pct_chng_pct_wht'], dtype='object')

In [9]:
# Keep the identifying columns, the three shares, and their percent changes.
_demo_feature_columns = [
    "year",
    "zip_code",
    "pct_wht",
    "pct_25_34",
    "pct_college_deg",
    "pct_chng_pct_25_34",
    "pct_chng_pct_college_deg",
    "pct_chng_pct_wht",
]
cleaned_demo_by_zip = demo_by_zip[_demo_feature_columns]

In [10]:
# Preview the first 8 rows of the cleaned frame.
cleaned_demo_by_zip.head(8)

Unnamed: 0,year,zip_code,pct_wht,pct_25_34,pct_college_deg,pct_chng_pct_25_34,pct_chng_pct_college_deg,pct_chng_pct_wht
0,2011,90001,73.73813,17.350256,2.173119,0.0,0.0,0.0
1,2012,90001,69.799927,16.45558,1.941997,-5.156556,-10.635496,-5.340796
2,2013,90001,60.054785,16.868152,1.94485,2.507185,0.146921,-13.961536
3,2014,90001,49.893455,16.367156,2.145115,-2.970073,10.297175,-16.920101
4,2015,90001,42.799727,15.632481,2.439408,-4.488712,13.719222,-14.217751
5,2016,90001,35.916951,15.546581,2.443823,-0.549498,0.180993,-16.081355
6,2017,90001,32.367462,15.453369,2.449862,-0.599565,0.247111,-9.88249
7,2011,90004,35.986673,17.418507,20.941,0.0,0.0,0.0


In [11]:
def _pct_chng_demo_rows(year):
    """Return the rows of cleaned_demo_by_zip whose year column equals `year`."""
    return cleaned_demo_by_zip.loc[cleaned_demo_by_zip["year"] == year]


# One frame per year, 2011 through 2017.
eleven_pct_chng_demo_data = _pct_chng_demo_rows(2011)
twelve_pct_chng_demo_data = _pct_chng_demo_rows(2012)
thirteen_pct_chng_demo_data = _pct_chng_demo_rows(2013)
fourteen_pct_chng_demo_data = _pct_chng_demo_rows(2014)
fifteen_pct_chng_demo_data = _pct_chng_demo_rows(2015)
sixteen_pct_chng_demo_data = _pct_chng_demo_rows(2016)
seventeen_pct_chng_demo_data = _pct_chng_demo_rows(2017)

In [12]:
# Spot-check the 2017 slice.
seventeen_pct_chng_demo_data.head()

Unnamed: 0,year,zip_code,pct_wht,pct_25_34,pct_college_deg,pct_chng_pct_25_34,pct_chng_pct_college_deg,pct_chng_pct_wht
6,2017,90001,32.367462,15.453369,2.449862,-0.599565,0.247111,-9.88249
13,2017,90004,37.758437,19.728054,24.626592,6.251947,-0.191751,-6.978698
20,2017,90007,29.836734,16.12285,12.022998,0.508037,1.185532,1.789211
27,2017,90010,23.064645,24.953445,42.085661,-14.625301,-1.911462,22.225276
34,2017,90012,28.585383,20.732321,23.375662,1.579633,6.826875,-1.435248


## Cleaning demographic data to only include feature columns

In [79]:
# List the columns of the demographic table read from CSV.
demographic_data.columns

Index(['with_advanced_deg', 'zip_code', 'total_pop', 'total_24_29_female', 'total_30_34_female', 'total_24_29_male', 'total_30_34_male', 'total_24_29_female_wht', 'total_30_34_female_wht', 'total_24_29_male_wht', 'total_30_34_male_wht', 'total_wht', 'with_bachelors', 'year', 'pct_25_34', 'pct_college_deg', 'pct_wht'], dtype='object')

In [80]:
# Keep only the identifying columns and the three share columns (no percent changes).
_raw_feature_columns = ["year", "zip_code", "pct_wht", "pct_25_34", "pct_college_deg"]
cleaned_demographic_data = demographic_data[_raw_feature_columns]

In [82]:
# One frame per year, 2011 through 2017, selected on the year column.
_survey_year = cleaned_demographic_data["year"]
eleven_demo_data = cleaned_demographic_data.loc[_survey_year == 2011]
twelve_demo_data = cleaned_demographic_data.loc[_survey_year == 2012]
thirteen_demo_data = cleaned_demographic_data.loc[_survey_year == 2013]
fourteen_demo_data = cleaned_demographic_data.loc[_survey_year == 2014]
fifteen_demo_data = cleaned_demographic_data.loc[_survey_year == 2015]
sixteen_demo_data = cleaned_demographic_data.loc[_survey_year == 2016]
seventeen_demo_data = cleaned_demographic_data.loc[_survey_year == 2017]

In [83]:
# Spot-check the 2017 slice of the raw feature columns.
seventeen_demo_data.head()

Unnamed: 0,year,zip_code,pct_wht,pct_25_34,pct_college_deg
834,2017,90001,32.367462,15.453369,2.449862
835,2017,90004,37.758437,19.728054,24.626592
836,2017,90007,29.836734,16.12285,12.022998
837,2017,90010,23.064645,24.953445,42.085661
838,2017,90012,28.585383,20.732321,23.375662


## Cleaning coffee data to only include y column

In [13]:
# Zip codes included for San Francisco (integers, matched against the zip_code columns).
sf_zip_codes = [94102, 94103, 94104, 94105, 94107, 94108, 94109, 94110, 94111, 94112, 94114, 94115, 94116, 94117, 
               94118, 94121, 94122, 94123, 94124, 94127, 94129, 94130, 94131, 94132, 94133, 94134, 94158]

In [14]:
# Zip codes included for Seattle.
# Note: 98124 is in this list but is filtered out again when the complete_* frames are built.
seattle_zip_codes = [98101, 98102, 98103, 98104, 98105, 98106, 98107, 98108, 98109, 98112, 98115, 98116, 98117, 
                     98118, 98119, 98121, 98122, 98124, 98125, 98126, 98133, 98134, 98136, 98144, 98146]

In [15]:
# Narrower Los Angeles zip-code list.
# Note: all_zip_codes is built from greater_la_zip_codes, not from this list.
la_zip_codes = [90001, 90004, 90007, 90010, 90012, 90013, 90014, 90015, 90016, 90017, 90018, 90019, 90020, 90021, 
                90023, 90024, 90025, 90026, 90027, 90028, 90029, 90031, 90033, 90034, 90035, 90036, 90038, 90039, 
                90041, 90042, 90045, 90046, 90047, 90048, 90049, 90057, 90064, 90065, 90066, 90067, 90068, 90069, 
                90071, 90077, 90089, 90094, 90230, 90501]

In [16]:
# Wider Los Angeles-area zip-code list; this is the LA list that feeds all_zip_codes.
# NOTE(review): the last three entries (94608, 94901, 95838) do not look like
# Los Angeles-area zip codes — confirm they are intended.
greater_la_zip_codes = [90001, 90004, 90007, 90010, 90012, 90013, 90014, 90015, 90016, 90017, 90018, 90019, 90020, 
                        90021, 90023, 90024, 90025, 90026, 90027, 90028, 90029, 90031, 90033, 90034, 90035, 90036, 
                        90038, 90039, 90041, 90042, 90045, 90046, 90047, 90048, 90049, 90057, 90064, 90065, 90066, 
                        90067, 90068, 90069, 90071, 90077, 90089, 90094, 90230, 90232, 90272, 90291, 90292, 90501, 
                        90710, 90731, 90732, 91040, 91303, 91306, 91307, 91311, 91316, 91324, 91325, 91335, 91342, 
                        91343, 91344, 91345, 91352, 91356, 91364, 91367, 91401, 91402, 91403, 91405, 91406, 91423, 
                        91436, 91601, 91602, 91604, 91605, 91606, 91607, 94608, 94901, 95838]

In [17]:
# Every tracked zip code across the three cities, in ascending order.
all_zip_codes = sorted(sf_zip_codes + seattle_zip_codes + greater_la_zip_codes)

In [20]:
def historic_coffee_df_builder(start_year, end_year):
    """Net change in coffee shops per zip code over a span of calendar years.

    For every zip code in the notebook-level ``all_zip_codes`` this counts the rows
    of ``coffee_data`` whose ``location_start_date`` lies in
    ``[start_year-01-01 00:00:00, (end_year + 1)-01-01 00:00:00)`` and subtracts the
    rows whose ``location_end_date`` lies in the same window.

    Parameters
    ----------
    start_year, end_year : str or int
        First and last calendar year to include (both inclusive). If ``end_year`` is
        earlier than ``start_year`` the window is empty and every count is 0.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``all_zip_codes`` with columns ``zip_code`` and
        ``num_coffee_shops`` (openings minus closings; 0 when there were no events).

    Notes
    -----
    The date columns are compared against text bounds, as before.
    NOTE(review): this assumes the dates are stored as sortable
    ``YYYY-MM-DD HH:MM:SS``-style strings — confirm against the CSV.
    """
    # The lower bound is inclusive. The previous per-year test used strict > and < on
    # both ends, so an event stamped exactly at "YYYY-01-01 00:00:00" was counted in
    # no year at all. With half-open yearly windows the per-year sums collapse into a
    # single window, so the year loop is no longer needed.
    window_start = f"{int(start_year)}-01-01 00:00:00"
    window_end = f"{int(end_year) + 1}-01-01 00:00:00"

    def _events_per_zip(date_column):
        # Number of rows per zip code whose date falls inside the window.
        dates = coffee_data[date_column]
        in_window = (dates >= window_start) & (dates < window_end)
        return coffee_data.loc[in_window, "zip_code"].value_counts()

    net_change = _events_per_zip("location_start_date").sub(
        _events_per_zip("location_end_date"), fill_value=0
    )

    initial_df = pd.DataFrame({"zip_code": all_zip_codes, "num_coffee_shops": 0})
    # Assign the whole column at once instead of `df[col][mask] += x`: chained
    # indexing raises SettingWithCopyWarning and is a silent no-op under Copy-on-Write.
    initial_df["num_coffee_shops"] = (
        initial_df["zip_code"].map(net_change).fillna(0).astype("int64")
    )
    return initial_df

In [21]:
# Net coffee-shop change per zip code accumulated over 1989 through 2010; this is the
# starting point for the "historic" yearly series built below.
historic_coffee_data = historic_coffee_df_builder("1989", "2010")

In [22]:
def coffee_df_builder(previous_year_df, start_year, end_year):
    """Roll per-zip coffee-shop counts forward over one date window.

    Adds, for every zip code in the notebook-level ``all_zip_codes``, the number of
    ``coffee_data`` rows whose ``location_start_date`` lies in
    ``[start_year-01-01 00:00:00, end_year-01-01 00:00:00)`` minus the number whose
    ``location_end_date`` lies in the same window.

    Parameters
    ----------
    previous_year_df : pandas.DataFrame
        Frame with ``zip_code`` and ``num_coffee_shops`` columns; it is not modified.
    start_year : str or int
        Year whose January 1st opens the window (inclusive).
    end_year : str or int
        Year whose January 1st closes the window (exclusive).

    Returns
    -------
    pandas.DataFrame
        A copy of ``previous_year_df`` with ``num_coffee_shops`` updated. Rows whose
        zip code is not in ``all_zip_codes`` are left unchanged.

    Notes
    -----
    NOTE(review): the date columns are compared against text bounds, which assumes
    sortable ``YYYY-MM-DD HH:MM:SS``-style strings — confirm against the CSV.
    """
    # The lower bound is inclusive: the previous strict > dropped events stamped
    # exactly at "YYYY-01-01 00:00:00" from every window.
    window_start = f"{start_year}-01-01 00:00:00"
    window_end = f"{end_year}-01-01 00:00:00"

    def _events_per_zip(date_column):
        # Number of rows per zip code whose date falls inside the window.
        dates = coffee_data[date_column]
        in_window = (dates >= window_start) & (dates < window_end)
        return coffee_data.loc[in_window, "zip_code"].value_counts()

    net_change = _events_per_zip("location_start_date").sub(
        _events_per_zip("location_end_date"), fill_value=0
    )

    new_df = previous_year_df.copy()
    tracked = new_df["zip_code"].isin(all_zip_codes)
    delta = new_df["zip_code"].map(net_change).fillna(0).astype("int64")
    # Assign the whole column at once instead of `df[col][mask] += x`: chained
    # indexing raises SettingWithCopyWarning and is a silent no-op under Copy-on-Write.
    new_df["num_coffee_shops"] = new_df["num_coffee_shops"] + delta.where(tracked, 0)
    return new_df

## Use this if you want to include historic coffee shop counts starting from 1989

In [23]:
# Roll the historic starting counts forward one calendar year at a time; each frame
# is built from the previous year's frame.
_hist_running = historic_coffee_data
_hist_yearly = []
for _year in range(2011, 2018):
    _hist_running = coffee_df_builder(_hist_running, str(_year), str(_year + 1))
    _hist_yearly.append(_hist_running)

(
    eleven_hist_coffee_data,
    twelve_hist_coffee_data,
    thirteen_hist_coffee_data,
    fourteen_hist_coffee_data,
    fifteen_hist_coffee_data,
    sixteen_hist_coffee_data,
    seventeen_hist_coffee_data,
) = _hist_yearly

In [24]:
# Spot-check a single zip code in the 2017 historic series.
seventeen_hist_coffee_data.loc[seventeen_hist_coffee_data["zip_code"].eq(94103)]

Unnamed: 0,zip_code,num_coffee_shops
86,94103,24


## Use this if you only want to include coffee shop counts from the year the demographic data starts

In [25]:
# Start every zip code at 0 and roll forward one calendar year at a time.
# Because the series starts from 0 rather than from an existing shop count,
# num_coffee_shops is a net change since 2011 and can be negative.
initial_df = pd.DataFrame({"zip_code": all_zip_codes, "num_coffee_shops": 0})
_running = initial_df
_yearly = []
for _year in range(2011, 2018):
    _running = coffee_df_builder(_running, str(_year), str(_year + 1))
    _yearly.append(_running)

(
    eleven_coffee_data,
    twelve_coffee_data,
    thirteen_coffee_data,
    fourteen_coffee_data,
    fifteen_coffee_data,
    sixteen_coffee_data,
    seventeen_coffee_data,
) = _yearly

In [26]:
# Spot-check a single zip code in the 2017 non-historic series.
seventeen_coffee_data.loc[seventeen_coffee_data["zip_code"].eq(94103)]

Unnamed: 0,zip_code,num_coffee_shops
86,94103,6


## Calculating the average median home sale price for each year

In [8]:
# Year and zero-padded month labels used to build the "YYYY-MM" housing column names.
year_list = [str(year_number) for year_number in range(2011, 2019)]
month_list = [f"{month_number:02d}" for month_number in range(1, 13)]

In [9]:
# Add a "<year>_avg" column to housing_data: the sum of the twelve "YYYY-MM" columns
# of that year divided by 12 (a missing month therefore makes the average missing).
for year in year_list:
    monthly_prices = (housing_data[f"{year}-{month}"] for month in month_list)
    housing_data[f"{year}_avg"] = sum(monthly_prices) / 12

## Calculating the percent change in median home sale price for each year and building a dataframe for each year

In [33]:
# Per-zip housing table: identifiers, year-over-year percent change of the yearly
# average price, and the yearly averages themselves.
_housing_columns = {
    "zip_code": housing_data["zipcode"],
    "city": housing_data["City"],
    # Constant 0 placeholder column.
    "2010_2011_change": 0,
}
for _earlier, _later in zip(range(2011, 2018), range(2012, 2019)):
    _earlier_avg = housing_data[f"{_earlier}_avg"]
    _housing_columns[f"{_earlier}_{_later}_change"] = (
        (housing_data[f"{_later}_avg"] - _earlier_avg) / _earlier_avg
    ) * 100
for _avg_year in range(2011, 2019):
    _housing_columns[f"{_avg_year}_avg"] = housing_data[f"{_avg_year}_avg"]

pct_chng_housing_data = pd.DataFrame(_housing_columns)

In [34]:
def _housing_year_frame(year):
    """Housing rows for `year`: change to the next year, this year's and next year's average."""
    change_col = f"{year}_{year + 1}_change"
    current_col = f"{year}_avg"
    next_col = f"{year + 1}_avg"
    selected = pct_chng_housing_data[["zip_code", "city", change_col, current_col, next_col]]
    # Give every year's frame the same column names so the frames can be stacked later.
    return selected.rename(columns={
        change_col: "pct_chng_housing_price",
        current_col: "current_year_housing_price",
        next_col: "next_year_housing_price",
    })


eleven_housing_data = _housing_year_frame(2011)
twelve_housing_data = _housing_year_frame(2012)
thirteen_housing_data = _housing_year_frame(2013)
fourteen_housing_data = _housing_year_frame(2014)
fifteen_housing_data = _housing_year_frame(2015)
sixteen_housing_data = _housing_year_frame(2016)
seventeen_housing_data = _housing_year_frame(2017)

##  Joining each year's worth of data together (historic coffee data)

In [36]:
def _merge_hist_frames(frames):
    """Outer-join the given frames on zip_code, left to right."""
    joined = frames[0]
    for frame in frames[1:]:
        joined = joined.merge(frame, on=["zip_code"], how="outer")
    return joined


# Per year: demographics + historic coffee counts + housing, outer-joined on zip_code.
# Zip codes missing from one input keep their row with NaN in that input's columns.
eleven_hist_dfs = [eleven_pct_chng_demo_data, eleven_hist_coffee_data, eleven_housing_data]
twelve_hist_dfs = [twelve_pct_chng_demo_data, twelve_hist_coffee_data, twelve_housing_data]
thirteen_hist_dfs = [thirteen_pct_chng_demo_data, thirteen_hist_coffee_data, thirteen_housing_data]
fourteen_hist_dfs = [fourteen_pct_chng_demo_data, fourteen_hist_coffee_data, fourteen_housing_data]
fifteen_hist_dfs = [fifteen_pct_chng_demo_data, fifteen_hist_coffee_data, fifteen_housing_data]
sixteen_hist_dfs = [sixteen_pct_chng_demo_data, sixteen_hist_coffee_data, sixteen_housing_data]
seventeen_hist_dfs = [seventeen_pct_chng_demo_data, seventeen_hist_coffee_data, seventeen_housing_data]

eleven_hist_data = _merge_hist_frames(eleven_hist_dfs)
twelve_hist_data = _merge_hist_frames(twelve_hist_dfs)
thirteen_hist_data = _merge_hist_frames(thirteen_hist_dfs)
fourteen_hist_data = _merge_hist_frames(fourteen_hist_dfs)
fifteen_hist_data = _merge_hist_frames(fifteen_hist_dfs)
sixteen_hist_data = _merge_hist_frames(sixteen_hist_dfs)
seventeen_hist_data = _merge_hist_frames(seventeen_hist_dfs)

In [37]:
# Spot-check a single zip code in the merged 2017 frame (historic coffee counts).
seventeen_hist_data.loc[seventeen_hist_data["zip_code"].eq(94103)]

Unnamed: 0,year,zip_code,pct_wht,pct_25_34,pct_college_deg,pct_chng_pct_25_34,pct_chng_pct_college_deg,pct_chng_pct_wht,num_coffee_shops,city,pct_chng_housing_price,current_year_housing_price,next_year_housing_price
86,2017.0,94103,41.73027,23.660615,40.681734,6.576252,1.811101,-8.551428,24,San Francisco,3.749785,967166.666667,1003433.0


##  Joining each year's worth of data together (non-historic coffee data)

In [38]:
def _merge_yearly_frames(frames):
    """Outer-join the given frames on zip_code, left to right."""
    joined = frames[0]
    for frame in frames[1:]:
        joined = joined.merge(frame, on=["zip_code"], how="outer")
    return joined


# Per year: demographics + coffee counts since 2011 + housing, outer-joined on zip_code.
eleven_dfs = [eleven_pct_chng_demo_data, eleven_coffee_data, eleven_housing_data]
twelve_dfs = [twelve_pct_chng_demo_data, twelve_coffee_data, twelve_housing_data]
thirteen_dfs = [thirteen_pct_chng_demo_data, thirteen_coffee_data, thirteen_housing_data]
fourteen_dfs = [fourteen_pct_chng_demo_data, fourteen_coffee_data, fourteen_housing_data]
fifteen_dfs = [fifteen_pct_chng_demo_data, fifteen_coffee_data, fifteen_housing_data]
sixteen_dfs = [sixteen_pct_chng_demo_data, sixteen_coffee_data, sixteen_housing_data]
seventeen_dfs = [seventeen_pct_chng_demo_data, seventeen_coffee_data, seventeen_housing_data]

eleven_data = _merge_yearly_frames(eleven_dfs)
twelve_data = _merge_yearly_frames(twelve_dfs)
thirteen_data = _merge_yearly_frames(thirteen_dfs)
fourteen_data = _merge_yearly_frames(fourteen_dfs)
fifteen_data = _merge_yearly_frames(fifteen_dfs)
sixteen_data = _merge_yearly_frames(sixteen_dfs)
seventeen_data = _merge_yearly_frames(seventeen_dfs)

In [39]:
# Spot-check a single zip code in the merged 2017 frame (coffee counts since 2011).
seventeen_data.loc[seventeen_data["zip_code"].eq(94103)]

Unnamed: 0,year,zip_code,pct_wht,pct_25_34,pct_college_deg,pct_chng_pct_25_34,pct_chng_pct_college_deg,pct_chng_pct_wht,num_coffee_shops,city,pct_chng_housing_price,current_year_housing_price,next_year_housing_price
86,2017.0,94103,41.73027,23.660615,40.681734,6.576252,1.811101,-8.551428,6,San Francisco,3.749785,967166.666667,1003433.0


##  Joining each year's worth of data together (no coffee data)

In [40]:
# Per year: demographics + housing only, outer-joined on zip_code.
# Each list holds exactly two frames, so a single merge per year is sufficient.
_zip_outer_join = {"on": ["zip_code"], "how": "outer"}

eleven_nocoffee_dfs = [eleven_pct_chng_demo_data, eleven_housing_data]
twelve_nocoffee_dfs = [twelve_pct_chng_demo_data, twelve_housing_data]
thirteen_nocoffee_dfs = [thirteen_pct_chng_demo_data, thirteen_housing_data]
fourteen_nocoffee_dfs = [fourteen_pct_chng_demo_data, fourteen_housing_data]
fifteen_nocoffee_dfs = [fifteen_pct_chng_demo_data, fifteen_housing_data]
sixteen_nocoffee_dfs = [sixteen_pct_chng_demo_data, sixteen_housing_data]
seventeen_nocoffee_dfs = [seventeen_pct_chng_demo_data, seventeen_housing_data]

eleven_nocoffee_data = pd.merge(*eleven_nocoffee_dfs, **_zip_outer_join)
twelve_nocoffee_data = pd.merge(*twelve_nocoffee_dfs, **_zip_outer_join)
thirteen_nocoffee_data = pd.merge(*thirteen_nocoffee_dfs, **_zip_outer_join)
fourteen_nocoffee_data = pd.merge(*fourteen_nocoffee_dfs, **_zip_outer_join)
fifteen_nocoffee_data = pd.merge(*fifteen_nocoffee_dfs, **_zip_outer_join)
sixteen_nocoffee_data = pd.merge(*sixteen_nocoffee_dfs, **_zip_outer_join)
seventeen_nocoffee_data = pd.merge(*seventeen_nocoffee_dfs, **_zip_outer_join)

In [41]:
# Spot-check a single zip code in the merged 2017 no-coffee frame.
# The mask must come from the frame being filtered: a mask built from seventeen_data
# gets re-aligned by index (with a warning) and can select the wrong rows whenever
# the two frames' row indexes differ.
seventeen_nocoffee_data[seventeen_nocoffee_data["zip_code"] == 94103]

  """Entry point for launching an IPython kernel.


Unnamed: 0,year,zip_code,pct_wht,pct_25_34,pct_college_deg,pct_chng_pct_25_34,pct_chng_pct_college_deg,pct_chng_pct_wht,city,pct_chng_housing_price,current_year_housing_price,next_year_housing_price
86,2017,94103,41.73027,23.660615,40.681734,6.576252,1.811101,-8.551428,San Francisco,3.749785,967166.666667,1003433.0


## Joining data frames together (historic coffee data)

In [42]:
# Stack the seven yearly frames (historic coffee counts), drop zip code 98124,
# cast year to int, replace every missing value with 0, then export to CSV.
# NOTE(review): astype(int) fails on NaN, so this relies on no NaN year remaining
# once 98124 is removed — confirm. fillna(0) also writes 0 into the text column
# `city` for zip codes without housing data.
hist_complete_data = (
    pd.concat([
        eleven_hist_data,
        twelve_hist_data,
        thirteen_hist_data,
        fourteen_hist_data,
        fifteen_hist_data,
        sixteen_hist_data,
        seventeen_hist_data,
    ])
    .loc[lambda frame: frame["zip_code"] != 98124]
    .assign(year=lambda frame: frame["year"].astype(int))
    .fillna(0)
)
hist_complete_data.to_csv("../data/complete_data/hist_complete_data.csv", index=False, header=True)

In [47]:
# Show all years for one zip code in the stacked frame (historic coffee counts).
hist_complete_data.loc[hist_complete_data["zip_code"].eq(94103)]

Unnamed: 0,year,zip_code,pct_wht,pct_25_34,pct_college_deg,pct_chng_pct_25_34,pct_chng_pct_college_deg,pct_chng_pct_wht,num_coffee_shops,city,pct_chng_housing_price,current_year_housing_price,next_year_housing_price
86,2011,94103,47.326741,20.906639,31.265785,0.0,0.0,0.0,18,San Francisco,11.671337,568058.333333,634358.3
86,2012,94103,47.082689,20.954452,34.146137,0.228698,9.212475,-0.515676,18,San Francisco,14.523863,634358.333333,726491.7
86,2013,94103,46.003263,19.705526,33.262224,-5.960197,-2.588618,-2.292617,17,San Francisco,13.923078,726491.666667,827641.7
86,2014,94103,45.267131,19.279907,35.697694,-2.159894,7.322028,-1.600172,20,San Francisco,14.0117,827641.666667,943608.3
86,2015,94103,44.765313,20.490353,37.911884,6.278279,6.202616,-1.10857,22,San Francisco,1.434211,943608.333333,957141.7
86,2016,94103,45.632501,22.200645,39.958053,8.346812,5.39717,1.937187,24,San Francisco,1.047389,957141.666667,967166.7
86,2017,94103,41.73027,23.660615,40.681734,6.576252,1.811101,-8.551428,24,San Francisco,3.749785,967166.666667,1003433.0


## Joining data frames together (non-historic coffee data)

In [48]:
# Stack the seven yearly frames (coffee counts since 2011), drop zip code 98124,
# cast year to int, replace every missing value with 0, then export to CSV.
# NOTE(review): astype(int) fails on NaN, so this relies on no NaN year remaining
# once 98124 is removed — confirm. fillna(0) also writes 0 into the text column
# `city` for zip codes without housing data.
complete_data = (
    pd.concat([
        eleven_data,
        twelve_data,
        thirteen_data,
        fourteen_data,
        fifteen_data,
        sixteen_data,
        seventeen_data,
    ])
    .loc[lambda frame: frame["zip_code"] != 98124]
    .assign(year=lambda frame: frame["year"].astype(int))
    .fillna(0)
)
complete_data.to_csv("../data/complete_data/complete_data.csv", index=False, header=True)

In [49]:
# Show all years for one zip code in the stacked frame (coffee counts since 2011).
complete_data.loc[complete_data["zip_code"].eq(94103)]

Unnamed: 0,year,zip_code,pct_wht,pct_25_34,pct_college_deg,pct_chng_pct_25_34,pct_chng_pct_college_deg,pct_chng_pct_wht,num_coffee_shops,city,pct_chng_housing_price,current_year_housing_price,next_year_housing_price
86,2011,94103,47.326741,20.906639,31.265785,0.0,0.0,0.0,0,San Francisco,11.671337,568058.333333,634358.3
86,2012,94103,47.082689,20.954452,34.146137,0.228698,9.212475,-0.515676,0,San Francisco,14.523863,634358.333333,726491.7
86,2013,94103,46.003263,19.705526,33.262224,-5.960197,-2.588618,-2.292617,-1,San Francisco,13.923078,726491.666667,827641.7
86,2014,94103,45.267131,19.279907,35.697694,-2.159894,7.322028,-1.600172,2,San Francisco,14.0117,827641.666667,943608.3
86,2015,94103,44.765313,20.490353,37.911884,6.278279,6.202616,-1.10857,4,San Francisco,1.434211,943608.333333,957141.7
86,2016,94103,45.632501,22.200645,39.958053,8.346812,5.39717,1.937187,6,San Francisco,1.047389,957141.666667,967166.7
86,2017,94103,41.73027,23.660615,40.681734,6.576252,1.811101,-8.551428,6,San Francisco,3.749785,967166.666667,1003433.0


## Joining data frames together (no coffee data)

In [50]:
# Stack the seven yearly no-coffee frames, drop zip code 98124, cast year to int,
# replace every missing value with 0, then export to CSV.
# NOTE(review): astype(int) fails on NaN, so this relies on no NaN year remaining
# once 98124 is removed — confirm. fillna(0) also writes 0 into the text column
# `city` for zip codes without housing data.
complete_nocoffee_data = (
    pd.concat([
        eleven_nocoffee_data,
        twelve_nocoffee_data,
        thirteen_nocoffee_data,
        fourteen_nocoffee_data,
        fifteen_nocoffee_data,
        sixteen_nocoffee_data,
        seventeen_nocoffee_data,
    ])
    .loc[lambda frame: frame["zip_code"] != 98124]
    .assign(year=lambda frame: frame["year"].astype(int))
    .fillna(0)
)
complete_nocoffee_data.to_csv("../data/complete_data/complete_nocoffee_data.csv", index=False, header=True)

In [51]:
# Preview only the first rows instead of rendering the whole stacked frame.
complete_nocoffee_data.head(10)

Unnamed: 0,year,zip_code,pct_wht,pct_25_34,pct_college_deg,pct_chng_pct_25_34,pct_chng_pct_college_deg,pct_chng_pct_wht,city,pct_chng_housing_price,current_year_housing_price,next_year_housing_price
0,2011,90001,73.738130,17.350256,2.173119,0.000000,0.000000,0.000000,Florence-Graham,2.591934,1.781167e+05,1.827333e+05
1,2011,90004,35.986673,17.418507,20.941000,0.000000,0.000000,0.000000,Los Angeles,-5.366014,7.364250e+05,6.969083e+05
2,2011,90007,34.894120,15.937537,8.200639,0.000000,0.000000,0.000000,Los Angeles,2.194292,3.182500e+05,3.252333e+05
3,2011,90010,22.081140,17.095076,40.941468,0.000000,0.000000,0.000000,0,0.000000,0.000000e+00,0.000000e+00
4,2011,90012,26.035907,17.755478,15.164858,0.000000,0.000000,0.000000,Los Angeles,-1.844743,3.017583e+05,2.961917e+05
5,2011,90013,37.074002,13.838851,23.064752,0.000000,0.000000,0.000000,Los Angeles,0.000000,0.000000e+00,0.000000e+00
6,2011,90014,37.317448,17.505765,19.427364,0.000000,0.000000,0.000000,Los Angeles,0.000000,0.000000e+00,0.000000e+00
7,2011,90015,32.650220,17.792504,11.549813,0.000000,0.000000,0.000000,Los Angeles,0.998564,3.772083e+05,3.809750e+05
8,2011,90016,20.778306,15.214098,9.370400,0.000000,0.000000,0.000000,Los Angeles,-0.887217,2.968083e+05,2.941750e+05
9,2011,90017,32.083389,21.882952,9.562073,0.000000,0.000000,0.000000,Los Angeles,2.703602,3.254917e+05,3.342917e+05
