## Heatmap
* Count of sales by zip
* Average sales by zip
* Most/Least Expensive sales by zip last 5 years
* Markers including demographic information

In [116]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import folium
import json
import csv
import os

### Bringing in and cleaning data

In [3]:
# Bring in housing dataframe
housingFile = 'source_data/housing_data_cleaned.csv'

# Read in file
housingDF = pd.read_csv(housingFile, low_memory=False)
housingDF.head()

Unnamed: 0,basements,building_code_description,category_code_description,census_tract,central_air,depth,exempt_building,exempt_land,exterior_condition,fireplaces,...,topography,total_area,total_livable_area,type_heater,unit,view_type,year_built,year_built_estimate,zip_code,zoning
0,D,ROW 3 STY MASONRY,Single Family,241.0,N,67.0,49200,0,4.0,0.0,...,F,938.0,1344.0,A,,I,1895,Y,19144.0,RSA5
1,,RES CONDO 3 STY MAS+OTH,Single Family,337.0,Y,0.0,45000,0,4.0,0.0,...,,0.0,947.0,,B307,I,1970,Y,19152.0,RM2
2,,ROW 2 STY MASONRY,Single Family,201.0,,70.0,0,0,4.0,0.0,...,F,1044.0,1190.0,,,I,1940,Y,19140.0,RM1
3,H,ROW B/GAR 2 STY MASONRY,Single Family,281.0,N,95.5,0,0,4.0,0.0,...,F,1686.53,1633.0,B,,I,1940,Y,19141.0,RSA3
4,,ROW 2 STY MASONRY,Single Family,293.0,,112.5,0,0,4.0,0.0,...,F,2165.62,1320.0,B,,I,1940,Y,19124.0,RSA5


In [6]:
# Formatting Date
housingDF['sale_date'].head()

0    44102
1    44102
2    44099
3    44099
4    44098
Name: sale_date, dtype: int64

In [7]:
# Converting
housingDF['sale_date'] = pd.to_datetime(housingDF['sale_date'], unit='D', origin='1899-12-30')
housingDF['sale_date'].head()

0   2020-09-28
1   2020-09-28
2   2020-09-25
3   2020-09-25
4   2020-09-24
Name: sale_date, dtype: datetime64[ns]

In [4]:
# Bring in demographics dataframe
demographicsFile = 'source_data/philly_demographics.csv'

# Read in file
demographicsDF = pd.read_csv(demographicsFile)
demographicsDF.head()

Unnamed: 0,City,Zip,NABE,Lat,Long,MEDINC,P_WHITE,P_BLACK,P_HISP,P_ASIAN,P_OTHERS,P_MinCOMBINED
0,Philadelphia,19102,Center City,39.948498,-75.16683,90750,0.73,0.04,0.05,0.16,0.01,0.27
1,Philadelphia,19103,Center City West,39.953663,-75.17399,73611,0.72,0.06,0.06,0.12,0.04,0.28
2,Philadelphia,19103,Center City West,39.95386,-75.16713,73611,0.72,0.06,0.06,0.12,0.04,0.28
3,Philadelphia,19104,"University City, Mantua, Powelton",39.956417,-75.20855,25865,0.35,0.42,0.05,0.14,0.04,0.65
4,Philadelphia,19106,"Old City, Society Hill",39.94912,-75.14397,109393,0.77,0.08,0.06,0.06,0.02,0.23


In [8]:
# Make the name of the zip column the same
demographicsDF_update = demographicsDF.rename(columns={'Zip': 'zip_code'})
demographicsDF_update.head()

Unnamed: 0,City,zip_code,NABE,Lat,Long,MEDINC,P_WHITE,P_BLACK,P_HISP,P_ASIAN,P_OTHERS,P_MinCOMBINED
0,Philadelphia,19102,Center City,39.948498,-75.16683,90750,0.73,0.04,0.05,0.16,0.01,0.27
1,Philadelphia,19103,Center City West,39.953663,-75.17399,73611,0.72,0.06,0.06,0.12,0.04,0.28
2,Philadelphia,19103,Center City West,39.95386,-75.16713,73611,0.72,0.06,0.06,0.12,0.04,0.28
3,Philadelphia,19104,"University City, Mantua, Powelton",39.956417,-75.20855,25865,0.35,0.42,0.05,0.14,0.04,0.65
4,Philadelphia,19106,"Old City, Society Hill",39.94912,-75.14397,109393,0.77,0.08,0.06,0.06,0.02,0.23


In [9]:
# Merge dataframes on column (zip)
mergedDF = pd.merge(housingDF, demographicsDF_update, on='zip_code', how='inner')
mergedDF

Unnamed: 0,basements,building_code_description,category_code_description,census_tract,central_air,depth,exempt_building,exempt_land,exterior_condition,fireplaces,...,NABE,Lat,Long,MEDINC,P_WHITE,P_BLACK,P_HISP,P_ASIAN,P_OTHERS,P_MinCOMBINED
0,D,ROW 3 STY MASONRY,Single Family,241.0,N,67.00,49200,0,4.0,0.0,...,Germantown,40.033259,-75.177865,33786,0.16,0.75,0.04,0.02,0.03,0.84
1,D,ROW 3 STY MASONRY,Single Family,241.0,N,67.00,49200,0,4.0,0.0,...,Germantown,40.030370,-75.165700,33786,0.16,0.75,0.04,0.02,0.03,0.84
2,C,SEMI/DET 2.5 STY MASONRY,Single Family,252.0,N,96.74,0,0,7.0,0.0,...,Germantown,40.033259,-75.177865,33786,0.16,0.75,0.04,0.02,0.03,0.84
3,C,SEMI/DET 2.5 STY MASONRY,Single Family,252.0,N,96.74,0,0,7.0,0.0,...,Germantown,40.030370,-75.165700,33786,0.16,0.75,0.04,0.02,0.03,0.84
4,,SEMI/DET 2 STY MASONRY,Single Family,242.0,,85.70,0,0,4.0,0.0,...,Germantown,40.033259,-75.177865,33786,0.16,0.75,0.04,0.02,0.03,0.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130845,,APTS 5-50 UNITS MASONRY,Multi Family,8.0,,161.50,0,0,4.0,0.0,...,Center City,39.948498,-75.166830,90750,0.73,0.04,0.05,0.16,0.01,0.27
130846,0,RES CONDO 5+ STY MASONRY,Single Family,4.0,Y,0.00,568911,0,3.0,0.0,...,Center City,39.948498,-75.166830,90750,0.73,0.04,0.05,0.16,0.01,0.27
130847,,RES CONDO 5+ STY MASONRY,Single Family,4.0,Y,0.00,388995,0,3.0,0.0,...,Center City,39.948498,-75.166830,90750,0.73,0.04,0.05,0.16,0.01,0.27
130848,0,RES CONDO 5+ STY MAS+OTH,Single Family,4.0,Y,0.00,0,0,1.0,0.0,...,Center City,39.948498,-75.166830,90750,0.73,0.04,0.05,0.16,0.01,0.27


In [10]:
print(mergedDF.dtypes)

basements                            object
building_code_description            object
category_code_description            object
census_tract                        float64
central_air                          object
depth                               float64
exempt_building                       int64
exempt_land                           int64
exterior_condition                  float64
fireplaces                          float64
frontage                            float64
fuel                                 object
garage_spaces                       float64
garage_type                          object
geographic_ward                     float64
interior_condition                  float64
location                             object
market_value                          int64
market_value_date                   float64
number_of_bathrooms                 float64
number_of_bedrooms                  float64
number_of_rooms                     float64
number_stories                  

### Count of sales by zip
* This explores the total number of home sales by zip code
* Across all dates in the file
* This will show overtime, where most homes are sold

In [108]:
# Aggregate total sales by zip code

# Create df to use for grouping
salesCount = mergedDF[['zip_code','sale_price']]

salesCount = salesCount.dropna(how='any')

# Convert to string in main ETL
salesCount['zip_code'] = salesCount['zip_code'].astype('Int64').astype('str')


# Create groupby object
salesCount_groupby = salesCount.groupby('zip_code')

# Create grouped data frame with count of sales, mean, median sale price
salesCount_groupbyDF = salesCount_groupby.count()
salesCount_groupbyDF = salesCount_groupbyDF.reset_index()
salesCount_groupbyDF

# Add zip codes 19109 and 19102, so that GeoJSON will work - not in original df but in GeoJSON file
salesCount_groupbyDF = salesCount_groupbyDF.append({'zip_code':'19109','sale_price':0},ignore_index=True)
salesCount_groupbyDF = salesCount_groupbyDF.append({'zip_code':'19112','sale_price':0},ignore_index=True)

#salesCount_groupbyDF['zip_code'].value_counts()
salesCount_groupbyDF.head()

Unnamed: 0,zip_code,sale_price
0,19102,481
1,19103,3002
2,19104,1553
3,19106,1414
4,19107,1578


In [120]:
# locate file in directory
zipGeo = r'source_data/Zipcodes_Poly.geojson'

# Create base map
salesCount_map = folium.Map(location = [39.9526, -75.1652], zoom = 11)

# Generate map
choropleth = folium.Choropleth(
    geo_data = zipGeo,
    data = salesCount_groupbyDF,
    columns = ['zip_code','sale_price'],
    key_on = 'feature.properties.CODE',
    legend_name = 'Number of Home Sales by Zip Code',
    fill_color = 'YlGnBu',
    fill_opacity = 0.7,
    line_opacity = 0.2,
    )

# Add labels to the zips
style_function = "font-size 15px; font-weight: bold"
choropleth.geojson.add_child(
    folium.features.GeoJsonTooltip(['zip_code',], style=style_function, labels=False))    

# Display map
salesCount_map


TypeError: parse_options() got multiple values for keyword argument 'zoom'

In [115]:
#salesCount_groupbyDF['zip_code'].value_counts()
# Load in GeoJSON file for map
#with open('source_data/Zipcodes_Poly.geojson','r') as jsonFile:
   # data = json.load(jsonFile)
    
# Create folium map
#m = folium.Map(location = [39.9526, -75.1652], zoom_start = 11)
#m.choropleth(
#    geo_data = data,
#    fill_opacity = 0.7,
#    line_opacity = 0.2,
#    data = salesCount_groupbyDF,
#    key_on = 'feature.properties.CODE',
#    columns = ['zip_code','sale_price'],
#    fill_color = 'YlGnBu',
#    legend_name = 'Number of Home Sales by Zip Code'
#)

# Add labels to the zips
#style_function = "font-size 15px; font-weight: bold"
#choropleth.geojson.add_child(
#    folium.features.GeoJsonTooltip(['zip_code',], style=style_function, labels=False))    

#folium.LayerControl().add_to(m)

# Display map
#m

JSONDecodeError: Expecting value: line 7 column 1 (char 6)

### Average of sales by zip
* This explores the average sale price by zip code
* Across all dates in the file
* This will show overtime the average price of a residence sold

In [106]:
# Aggregate total sales by zip code

# Create df to use for grouping
salesAvg = mergedDF[['zip_code','sale_price']]

salesAvg = salesAvg.dropna(how='any')

# Convert to string in main ETL
salesAvg['zip_code'] = salesAvg['zip_code'].astype('Int64').astype('str')


# Create groupby object
salesAvg_groupby = salesAvg.groupby('zip_code')

# Create grouped data frame with count of sales, mean, median sale price
salesAvg_groupbyDF = salesAvg_groupby.mean()
salesAvg_groupbyDF = salesAvg_groupbyDF.reset_index()
salesAvg_groupbyDF

# Add zip codes 19109 and 19102, so that GeoJSON will work - not in original df but in GeoJSON file
salesAvg_groupbyDF = salesAvg_groupbyDF.append({'zip_code':'19109','sale_price':0},ignore_index=True)
salesAvg_groupbyDF = salesAvg_groupbyDF.append({'zip_code':'19112','sale_price':0},ignore_index=True)

salesAvg_groupbyDF.head()

Unnamed: 0,zip_code,sale_price
0,19102,1052107.0
1,19103,1102820.0
2,19104,481224.1
3,19106,892916.4
4,19107,820395.9


### Most/Least Expensive sales by zip last 5 years
* Exploring the most and least expensive sales by zip code over 5 years
* 2016, 2017, 2018, 2019 and 2020
* This can show a trend in housing sales over time

------
* 2016

In [100]:
## MAX VALUES
# Aggregate total sales by date

# Create df to use for grouping
saleDate = mergedDF[['zip_code', 'sale_date','sale_price']]

saleDate = saleDate.dropna(how='any')

# Convert to string in main ETL
saleDate['zip_code'] = saleDate['zip_code'].astype('Int64').astype('str')

# Filter dataframe (2016 - 2020)
saleDate2016 = saleDate[(saleDate['sale_date'] >= '2015-12-31') & (saleDate['sale_date'] <= '2016-12-31')]

# Create groupby object
saleDate2016_groupby = saleDate2016.groupby('zip_code')

# Create grouped data frame with count of sales, mean, median sale price
saleDate2016_groupbyDF = saleDate2016_groupby.max()
saleDate2016_groupbyDF = saleDate2016_groupbyDF.reset_index()
saleDate2016_groupbyDF

# Add zip codes 19109 and 19102, so that GeoJSON will work - not in original df but in GeoJSON file
saleDate2016_groupbyDF = saleDate2016_groupbyDF.append({'zip_code':'19109','sale_price':0},ignore_index=True)
saleDate2016_groupbyDF = saleDate2016_groupbyDF.append({'zip_code':'19112','sale_price':0},ignore_index=True)

saleDate2016_groupbyDF.head()

Unnamed: 0,zip_code,sale_date,sale_price
0,19102,2016-12-27,12000000
1,19103,2016-12-23,63053000
2,19104,2016-12-30,8739670
3,19106,2016-12-29,4306000
4,19107,2016-12-23,4700000


In [99]:
## MIN VALUES
# Aggregate total sales by date

# Create df to use for grouping
saleDate = mergedDF[['zip_code', 'sale_date','sale_price']]

saleDate = saleDate.dropna(how='any')

# Convert to string in main ETL
saleDate['zip_code'] = saleDate['zip_code'].astype('Int64').astype('str')

# Filter dataframe (2016 - 2020)
saleDate2016 = saleDate[(saleDate['sale_date'] >= '2015-12-31') & (saleDate['sale_date'] <= '2016-12-31')]

# Create groupby object
saleDate2016_groupby = saleDate2016.groupby('zip_code')

# Create grouped data frame with count of sales, mean, median sale price
saleDate2016_groupbyDF = saleDate2016_groupby.min()
saleDate2016_groupbyDF = saleDate2016_groupbyDF.reset_index()
saleDate2016_groupbyDF

# Add zip codes 19109 and 19102, so that GeoJSON will work - not in original df but in GeoJSON file
saleDate2016_groupbyDF = saleDate2016_groupbyDF.append({'zip_code':'19109','sale_price':0},ignore_index=True)
saleDate2016_groupbyDF = saleDate2016_groupbyDF.append({'zip_code':'19112','sale_price':0},ignore_index=True)

saleDate2016_groupbyDF.head()

Unnamed: 0,zip_code,sale_date,sale_price
0,19102,2016-01-07,105000
1,19103,2016-01-04,30000
2,19104,2016-01-04,1107
3,19106,2016-01-04,5000
4,19107,2016-01-06,3475


### Most/Least Expensive sales by zip last 5 years
* 2017

In [98]:
## MAX VALUES
# Aggregate total sales by date

# Create df to use for grouping
saleDate = mergedDF[['zip_code', 'sale_date','sale_price']]

saleDate = saleDate.dropna(how='any')

# Convert to string in main ETL
saleDate['zip_code'] = saleDate['zip_code'].astype('Int64').astype('str')

# Filter dataframe (2016 - 2020)
saleDate2017 = saleDate[(saleDate['sale_date'] > '2016-12-31') & (saleDate['sale_date'] <= '2017-12-31')]

# Create groupby object
saleDate2017_groupby = saleDate2017.groupby('zip_code')

# Create grouped data frame with count of sales, mean, median sale price
saleDate2017_groupbyDF = saleDate2017_groupby.max()
saleDate2017_groupbyDF = saleDate2017_groupbyDF.reset_index()
saleDate2017_groupbyDF

# Add zip codes 19109 and 19102, so that GeoJSON will work - not in original df but in GeoJSON file
saleDate2017_groupbyDF = saleDate2017_groupbyDF.append({'zip_code':'19109','sale_price':0},ignore_index=True)
saleDate2017_groupbyDF = saleDate2017_groupbyDF.append({'zip_code':'19112','sale_price':0},ignore_index=True)

saleDate2017_groupbyDF.head()

Unnamed: 0,zip_code,sale_date,sale_price
0,19102,2017-12-20,6231000
1,19103,2017-12-29,12288800
2,19104,2017-12-28,6690000
3,19106,2017-12-30,17034760
4,19107,2017-12-29,12288800


In [97]:
## MIN VALUES
# Aggregate total sales by date

# Create df to use for grouping
saleDate = mergedDF[['zip_code', 'sale_date','sale_price']]

saleDate = saleDate.dropna(how='any')

# Convert to string in main ETL
saleDate['zip_code'] = saleDate['zip_code'].astype('Int64').astype('str')

# Filter dataframe (2016 - 2020)
saleDate2017 = saleDate[(saleDate['sale_date'] > '2016-12-31') & (saleDate['sale_date'] <= '2017-12-31')]

# Create groupby object
saleDate2017_groupby = saleDate2017.groupby('zip_code')

# Create grouped data frame with count of sales, mean, median sale price
saleDate2017_groupbyDF = saleDate2017_groupby.min()
saleDate2017_groupbyDF = saleDate2017_groupbyDF.reset_index()
saleDate2017_groupbyDF

# Add zip codes 19109 and 19102, so that GeoJSON will work - not in original df but in GeoJSON file
saleDate2017_groupbyDF = saleDate2017_groupbyDF.append({'zip_code':'19109','sale_price':0},ignore_index=True)
saleDate2017_groupbyDF = saleDate2017_groupbyDF.append({'zip_code':'19112','sale_price':0},ignore_index=True)

saleDate2017_groupbyDF.head()

Unnamed: 0,zip_code,sale_date,sale_price
0,19102,2017-01-09,190000
1,19103,2017-01-09,45000
2,19104,2017-01-01,2500
3,19106,2017-01-05,12000
4,19107,2017-01-03,50000


### Most/Least Expensive sales by zip last 5 years
* 2018

In [96]:
## MAX VALUES
# Aggregate total sales by date

# Create df to use for grouping
saleDate = mergedDF[['zip_code', 'sale_date','sale_price']]

saleDate = saleDate.dropna(how='any')

# Convert to string in main ETL
saleDate['zip_code'] = saleDate['zip_code'].astype('Int64').astype('str')

# Filter dataframe (2016 - 2020)
saleDate2018 = saleDate[(saleDate['sale_date'] > '2017-12-31') & (saleDate['sale_date'] <= '2018-12-31')]

# Create groupby object
saleDate2018_groupby = saleDate2018.groupby('zip_code')

# Create grouped data frame with count of sales, mean, median sale price
saleDate2018_groupbyDF = saleDate2018_groupby.max()
saleDate2018_groupbyDF = saleDate2018_groupbyDF.reset_index()
saleDate2018_groupbyDF

# Add zip codes 19109 and 19102, so that GeoJSON will work - not in original df but in GeoJSON file
saleDate2018_groupbyDF = saleDate2018_groupbyDF.append({'zip_code':'19109','sale_price':0},ignore_index=True)
saleDate2018_groupbyDF = saleDate2018_groupbyDF.append({'zip_code':'19112','sale_price':0},ignore_index=True)

saleDate2018_groupbyDF.head()

Unnamed: 0,zip_code,sale_date,sale_price
0,19102,2018-12-24,71463435
1,19103,2018-12-28,59800000
2,19104,2018-12-31,13900000
3,19106,2018-12-28,95500000
4,19107,2018-12-28,16100000


In [95]:
## MIN VALUES
# Aggregate total sales by date

# Create df to use for grouping
saleDate = mergedDF[['zip_code', 'sale_date','sale_price']]

saleDate = saleDate.dropna(how='any')

# Convert to string in main ETL
saleDate['zip_code'] = saleDate['zip_code'].astype('Int64').astype('str')

# Filter dataframe (2016 - 2020)
saleDate2018 = saleDate[(saleDate['sale_date'] > '2017-12-31') & (saleDate['sale_date'] <= '2018-12-31')]

# Create groupby object
saleDate2018_groupby = saleDate2018.groupby('zip_code')

# Create grouped data frame with count of sales, mean, median sale price
saleDate2018_groupbyDF = saleDate2018_groupby.min()
saleDate2018_groupbyDF = saleDate2018_groupbyDF.reset_index()
saleDate2018_groupbyDF

# Add zip codes 19109 and 19102, so that GeoJSON will work - not in original df but in GeoJSON file
saleDate2018_groupbyDF = saleDate2018_groupbyDF.append({'zip_code':'19109','sale_price':0},ignore_index=True)
saleDate2018_groupbyDF = saleDate2018_groupbyDF.append({'zip_code':'19112','sale_price':0},ignore_index=True)

saleDate2018_groupbyDF.head()

Unnamed: 0,zip_code,sale_date,sale_price
0,19102,2018-01-03,155000
1,19103,2018-01-03,20000
2,19104,2018-01-02,1000
3,19106,2018-01-02,10000
4,19107,2018-01-03,47500


### Most/Least Expensive sales by zip last 5 years
* 2019

In [110]:
## MAX VALUES
# Aggregate total sales by date

# Create df to use for grouping
saleDate = mergedDF[['zip_code', 'sale_date','sale_price']]

saleDate = saleDate.dropna(how='any')

# Convert to string in main ETL
saleDate['zip_code'] = saleDate['zip_code'].astype('Int64').astype('str')

# Filter dataframe (2016 - 2020)
saleDate2019 = saleDate[(saleDate['sale_date'] > '2018-12-31') & (saleDate['sale_date'] <= '2019-12-31')]

# Create groupby object
saleDate2019_groupby = saleDate2019.groupby('zip_code')

# Create grouped data frame with count of sales, mean, median sale price
saleDate2019_groupbyDF = saleDate2019_groupby.max()
saleDate2019_groupbyDF = saleDate2019_groupbyDF.reset_index()
saleDate2019_groupbyDF

# Add zip codes 19109 and 19102, so that GeoJSON will work - not in original df but in GeoJSON file
saleDate2019_groupbyDF = saleDate2019_groupbyDF.append({'zip_code':'19109','sale_price':0},ignore_index=True)
saleDate2019_groupbyDF = saleDate2019_groupbyDF.append({'zip_code':'19112','sale_price':0},ignore_index=True)

saleDate2019_groupbyDF.head()

Unnamed: 0,zip_code,sale_date,sale_price
0,19102,2019-12-30,11074000
1,19103,2019-12-30,60500000
2,19104,2019-12-30,19100000
3,19106,2019-12-30,20130000
4,19107,2019-12-30,29650000


In [103]:
## MIN VALUES
# Aggregate total sales by date

# Create df to use for grouping
saleDate = mergedDF[['zip_code', 'sale_date','sale_price']]

saleDate = saleDate.dropna(how='any')

# Convert to string in main ETL
saleDate['zip_code'] = saleDate['zip_code'].astype('Int64').astype('str')

# Filter dataframe (2016 - 2020)
saleDate2019 = saleDate[(saleDate['sale_date'] > '2018-12-31') & (saleDate['sale_date'] <= '2019-12-31')]

# Create groupby object
saleDate2019_groupby = saleDate2019.groupby('zip_code')

# Create grouped data frame with count of sales, mean, median sale price
saleDate2019_groupbyDF = saleDate2019_groupby.min()
saleDate2019_groupbyDF = saleDate2019_groupbyDF.reset_index()
saleDate2019_groupbyDF

# Add zip codes 19109 and 19102, so that GeoJSON will work - not in original df but in GeoJSON file
saleDate2019_groupbyDF = saleDate2019_groupbyDF.append({'zip_code':'19109','sale_price':0},ignore_index=True)
saleDate2019_groupbyDF = saleDate2019_groupbyDF.append({'zip_code':'19112','sale_price':0},ignore_index=True)

saleDate2019_groupbyDF.head()

Unnamed: 0,zip_code,sale_date,sale_price
0,19102,2019-01-03,150000
1,19103,2019-01-03,45000
2,19104,2019-01-01,2000
3,19106,2019-01-02,45005
4,19107,2019-01-04,56000


### Most/Least Expensive sales by zip last 5 years
* 2020

In [105]:
## MAX VALUES
# Aggregate total sales by date

# Create df to use for grouping
saleDate = mergedDF[['zip_code', 'sale_date','sale_price']]

saleDate = saleDate.dropna(how='any')

# Convert to string in main ETL
saleDate['zip_code'] = saleDate['zip_code'].astype('Int64').astype('str')

# Filter dataframe (2016 - 2020)
saleDate2020 = saleDate[(saleDate['sale_date'] > '2019-12-31') & (saleDate['sale_date'] <= '2020-12-31')]

# Create groupby object
saleDate2020_groupby = saleDate2020.groupby('zip_code')

# Create grouped data frame with count of sales, mean, median sale price
saleDate2020_groupbyDF = saleDate2020_groupby.max()
saleDate2020_groupbyDF = saleDate2020_groupbyDF.reset_index()
saleDate2020_groupbyDF

# Add zip codes 19109 and 19102, so that GeoJSON will work - not in original df but in GeoJSON file
saleDate2020_groupbyDF = saleDate2020_groupbyDF.append({'zip_code':'19109','sale_price':0},ignore_index=True)
saleDate2020_groupbyDF = saleDate2020_groupbyDF.append({'zip_code':'19112','sale_price':0},ignore_index=True)

saleDate2020_groupbyDF.head()

Unnamed: 0,zip_code,sale_date,sale_price
0,19102,2020-09-10,2900000
1,19103,2020-09-14,5070000
2,19104,2020-09-22,21500000
3,19106,2020-09-16,2100000
4,19107,2020-09-18,4060000


In [104]:
## MIN VALUES
# Aggregate total sales by date

# Create df to use for grouping
saleDate = mergedDF[['zip_code', 'sale_date','sale_price']]

saleDate = saleDate.dropna(how='any')

# Convert to string in main ETL
saleDate['zip_code'] = saleDate['zip_code'].astype('Int64').astype('str')

# Filter dataframe (2016 - 2020)
saleDate2020 = saleDate[(saleDate['sale_date'] > '2019-12-31') & (saleDate['sale_date'] <= '2020-12-31')]

# Create groupby object
saleDate2020_groupby = saleDate2020.groupby('zip_code')

# Create grouped data frame with count of sales, mean, median sale price
saleDate2020_groupbyDF = saleDate2020_groupby.min()
saleDate2020_groupbyDF = saleDate2020_groupbyDF.reset_index()
saleDate2020_groupbyDF

# Add zip codes 19109 and 19102, so that GeoJSON will work - not in original df but in GeoJSON file
saleDate2020_groupbyDF = saleDate2020_groupbyDF.append({'zip_code':'19109','sale_price':0},ignore_index=True)
saleDate2020_groupbyDF = saleDate2020_groupbyDF.append({'zip_code':'19112','sale_price':0},ignore_index=True)

saleDate2020_groupbyDF.head()

Unnamed: 0,zip_code,sale_date,sale_price
0,19102,2020-01-07,150000
1,19103,2020-01-03,95000
2,19104,2020-01-03,1500
3,19106,2020-01-02,170000
4,19107,2020-01-03,67995


## FInal observations
* xx