In [4]:
# IPython magic: select matplotlib's interactive "notebook" backend for figures in this notebook
%matplotlib notebook

In [5]:
# Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt

# Source data location: relative path (resolved against the current working directory)
# to the cleaned housing CSV that this notebook reads
source_file = 'source_data/housing_data_cleaned.csv'

In [7]:
# Load the cleaned housing dataset (the file available after running 1_housing_etl.ipynb).
# low_memory=False tells pandas to parse the file in one pass instead of in chunks,
# which avoids mixed-type inference within a column.
housing_df = pd.read_csv(filepath_or_buffer=source_file, low_memory=False)

# Preview the first five rows
housing_df.head(n=5)

Unnamed: 0,basements,building_code_description,category_code_description,census_tract,central_air,depth,exempt_building,exempt_land,exterior_condition,fireplaces,...,topography,total_area,total_livable_area,type_heater,unit,view_type,year_built,year_built_estimate,zip_code,zoning
0,D,ROW 3 STY MASONRY,Single Family,241.0,N,67.0,49200,0,4.0,0.0,...,F,938.0,1344.0,A,,I,1895,Y,19144.0,RSA5
1,,RES CONDO 3 STY MAS+OTH,Single Family,337.0,Y,0.0,45000,0,4.0,0.0,...,,0.0,947.0,,B307,I,1970,Y,19152.0,RM2
2,,ROW 2 STY MASONRY,Single Family,201.0,,70.0,0,0,4.0,0.0,...,F,1044.0,1190.0,,,I,1940,Y,19140.0,RM1
3,H,ROW B/GAR 2 STY MASONRY,Single Family,281.0,N,95.5,0,0,4.0,0.0,...,F,1686.53,1633.0,B,,I,1940,Y,19141.0,RSA3
4,,ROW 2 STY MASONRY,Single Family,293.0,,112.5,0,0,4.0,0.0,...,F,2165.62,1320.0,B,,I,1940,Y,19124.0,RSA5


In [8]:
# Inspect the first few sale_date values to see how the column is stored
housing_df['sale_date'].head()

0    44102
1    44102
2    44099
3    44099
4    44098
Name: sale_date, dtype: int64

In [9]:
# Convert sale_date from Excel-style serial day numbers into a readable datetime format
# NOTE: approach taken from https://stackoverflow.com/questions/38454403/convert-excel-style-date-with-pandas
# TODO: Add into 1_housing_etl notebook for all date fields
#
# unit='D' with origin='1899-12-30' counts whole days from that date,
# e.g. serial 44102 -> 2020-09-28.
#
# The dtype guard keeps this cell safe to re-run: the column is overwritten in place, and
# calling pd.to_datetime(..., unit='D', origin=...) on values that are already datetime64
# raises ValueError (a non-unix origin requires numeric input). Non-datetime input is
# still passed through to pd.to_datetime exactly as before.
if not pd.api.types.is_datetime64_any_dtype(housing_df['sale_date']):
    housing_df['sale_date'] = pd.to_datetime(housing_df['sale_date'], unit='D', origin='1899-12-30')

housing_df['sale_date'].head()

0   2020-09-28
1   2020-09-28
2   2020-09-25
3   2020-09-25
4   2020-09-24
Name: sale_date, dtype: datetime64[ns]

In [11]:
# Derive the calendar year of each sale from the datetime sale_date column
# NOTE: approach taken from https://stackoverflow.com/questions/25146121/extracting-just-month-and-year-separately-from-pandas-datetime-column
housing_df['sale_year'] = housing_df['sale_date'].dt.year

# Number of sales recorded in each year
housing_df['sale_year'].value_counts()

2018    23224
2019    22889
2017    22818
2016    20724
2020     9824
Name: sale_year, dtype: int64

In [30]:
# Build a per-year summary statistics table of sale price.

# Condensed frame: only sale price and sale year, with the year exposed as 'Year'
summary_df = housing_df.loc[:, ['sale_price', 'sale_year']].rename(columns={'sale_year': 'Year'})

# Groupby object: one group per sale year
summary_df_groupby_obj = summary_df.groupby('Year')

# Summary stats table: count, mean, median and standard deviation of sale_price per year
summary_table_df = summary_df_groupby_obj.agg(['count', 'mean', 'median', 'std'])

# Replace the (column, statistic) header with readable labels.
# The order must match the order of the aggregations listed above.
summary_table_df.columns = [
    'Total Number of Sales',
    'Mean Sale Price',
    'Median Sale Price',
    'Standard Deviation of Sale Price',
]

# Display format for each column: thousands separators, no decimals,
# and a leading dollar sign on the price statistics.
# NOTE: this turns the numeric values into strings for presentation.
column_formats = {
    'Total Number of Sales': '{:,.0f}',
    'Mean Sale Price': '${:,.0f}',
    'Median Sale Price': '${:,.0f}',
    'Standard Deviation of Sale Price': '${:,.0f}',
}
for column_name, template in column_formats.items():
    summary_table_df[column_name] = summary_table_df[column_name].map(template.format)

summary_table_df



Unnamed: 0_level_0,Total Number of Sales,Mean Sale Price,Median Sale Price,Standard Deviation of Sale Price
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,20724,"$243,574","$148,900","$920,857"
2017,22818,"$279,002","$162,000","$704,872"
2018,23224,"$297,678","$174,000","$1,928,769"
2019,22889,"$393,367","$175,000","$1,908,636"
2020,9824,"$302,294","$195,000","$582,757"
