In [None]:
# IPython magic: select matplotlib's interactive 'notebook' backend for figures in this notebook.
%matplotlib notebook

In [1]:
# Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import linregress
import folium
import json

# Source data location: relative path to the cleaned housing CSV that is read
# into `housing_df` by the following cell.
source_file = 'source_data/housing_data_cleaned.csv'

# Approach adapted from https://towardsdatascience.com/visualizing-data-at-the-zip-code-level-with-folium-d07ac983db20

In [2]:
# Load the cleaned housing dataset (the file produced by running 1_housing_etl.ipynb).
# low_memory=False makes pandas read the file in one pass instead of in chunks.
housing_df = pd.read_csv(
    source_file,
    low_memory=False,
)

# Preview the first rows as the cell output.
housing_df.head()

Unnamed: 0,basements,building_code_description,category_code_description,census_tract,central_air,depth,exempt_building,exempt_land,exterior_condition,fireplaces,...,topography,total_area,total_livable_area,type_heater,unit,view_type,year_built,year_built_estimate,zip_code,zoning
0,D,ROW 3 STY MASONRY,Single Family,241.0,N,67.0,49200,0,4.0,0.0,...,F,938.0,1344.0,A,,I,1895,Y,19144.0,RSA5
1,,RES CONDO 3 STY MAS+OTH,Single Family,337.0,Y,0.0,45000,0,4.0,0.0,...,,0.0,947.0,,B307,I,1970,Y,19152.0,RM2
2,,ROW 2 STY MASONRY,Single Family,201.0,,70.0,0,0,4.0,0.0,...,F,1044.0,1190.0,,,I,1940,Y,19140.0,RM1
3,H,ROW B/GAR 2 STY MASONRY,Single Family,281.0,N,95.5,0,0,4.0,0.0,...,F,1686.53,1633.0,B,,I,1940,Y,19141.0,RSA3
4,,ROW 2 STY MASONRY,Single Family,293.0,,112.5,0,0,4.0,0.0,...,F,2165.62,1320.0,B,,I,1940,Y,19124.0,RSA5


In [4]:
# Aggregate total sales by zip code

# Keep only the two columns needed for grouping, drop rows where either one is
# missing, and turn the zip code into a string key (it is read from the CSV as
# a float, e.g. 19144.0 -> '19144'). Building the frame in a single chain
# yields a new object, so no SettingWithCopyWarning is raised on the column
# assignment.
# NOTE: the original author flagged that this string conversion should live in
# the main ETL notebook instead.
summary_zip_df = (
    housing_df[['zip_code', 'sale_price']]
    .dropna(how='any')
    .assign(zip_code=lambda frame: frame['zip_code'].astype('Int64').astype('str'))
)

# Create groupby object
summary_zip_groupby_obj = summary_zip_df.groupby('zip_code')

# Count of sales per zip code. The count ends up in the 'sale_price' column,
# because that is the only non-key column being counted.
summary_zip_groupby_df = summary_zip_groupby_obj.count().reset_index()

# Zip codes 19109 and 19112 are present in the GeoJSON file but, per the
# original note, not in the sales data. Add them with zero sales so that every
# GeoJSON feature has a matching row. Only zips that are really missing are
# added, which keeps 'zip_code' unique even if the source data changes.
# DataFrame.append was removed in pandas 2.0, hence pd.concat.
geojson_only_zips = ['19109', '19112']
known_zips = set(summary_zip_groupby_df['zip_code'])
missing_zips = [z for z in geojson_only_zips if z not in known_zips]
if missing_zips:
    placeholder_rows = pd.DataFrame({
        'zip_code': missing_zips,
        'sale_price': [0] * len(missing_zips),
    })
    summary_zip_groupby_df = pd.concat(
        [summary_zip_groupby_df, placeholder_rows],
        ignore_index=True,
    )

# Sanity check shown as the cell output: every zip code should appear exactly once.
summary_zip_groupby_df['zip_code'].value_counts()

Unnamed: 0,zip_code,sale_price
0,19102,481
1,19103,1501
2,19104,1553
3,19106,1414
4,19107,789
5,19111,3197
6,19114,1755
7,19115,1504
8,19116,1504
9,19118,481


In [5]:
# Load in GeoJSON file for map (zip-code polygons).
# The encoding is stated explicitly so the read does not depend on the
# platform's default text encoding.
with open('source_data/Zipcodes_Poly.geojson', 'r', encoding='utf-8') as jsonFile:
    data = json.load(jsonFile)

# Create folium map centred on the given [latitude, longitude]
m = folium.Map(location=[39.9526, -75.1652], zoom_start=11)

# Shade each zip-code polygon by its number of sales.
# Map.choropleth() is deprecated/removed in current folium releases; the
# folium.Choropleth layer takes the same arguments and is added to the map
# explicitly.
# key_on names the GeoJSON feature property that is matched against the first
# entry of `columns` ('zip_code'); the second entry ('sale_price', holding the
# per-zip sale count) drives the colour scale.
folium.Choropleth(
    geo_data=data,
    data=summary_zip_groupby_df,
    columns=['zip_code', 'sale_price'],
    key_on='feature.properties.CODE',
    fill_color='YlGnBu',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Number of Home Sales by Zip Code',
).add_to(m)
folium.LayerControl().add_to(m)

# Render the map as the cell output
m

