In [1]:
# Select matplotlib's interactive `notebook` backend for figures drawn in this notebook.
# NOTE(review): this backend may not be available in every Jupyter front end
# (e.g. JupyterLab) — confirm against the environment this notebook targets.
%matplotlib notebook

## Philadelphia Housing Analysis - ETL Notebook

## 1.) Dependencies and setup

In [3]:
# 1a.) Define imports
import pandas as pd
import matplotlib.pyplot as plt

# 1b.) Relative path to the cleaned housing CSV that the rest of the notebook reads
source_file = "source_data/housing_data_cleaned.csv"

## 2.) Read in source dataset

In [4]:
# 2a.) Load the source CSV into a DataFrame.
# low_memory=False turns off chunked parsing, so each column's dtype is
# inferred from the whole file instead of piece by piece.
housing_df = pd.read_csv(filepath_or_buffer=source_file, low_memory=False)
# 2b.) Confirm the load worked by previewing the first five rows
housing_df.head(n=5)

Unnamed: 0,basements,building_code_description,category_code_description,census_tract,central_air,depth,exempt_building,exempt_land,exterior_condition,fireplaces,...,topography,total_area,total_livable_area,type_heater,unit,view_type,year_built,year_built_estimate,zip_code,zoning
0,D,ROW 3 STY MASONRY,Single Family,241.0,N,67.0,49200,0,4.0,0.0,...,F,938.0,1344.0,A,,I,1895,Y,19144,RSA5
1,,ROW 2 STY MASONRY,Single Family,201.0,,70.0,0,0,4.0,0.0,...,F,1044.0,1190.0,,,I,1940,Y,19140,RM1
2,H,ROW B/GAR 2 STY MASONRY,Single Family,281.0,N,95.5,0,0,4.0,0.0,...,F,1686.53,1633.0,B,,I,1940,Y,19141,RSA3
3,,ROW 2 STY MASONRY,Single Family,293.0,,112.5,0,0,4.0,0.0,...,F,2165.62,1320.0,B,,I,1940,Y,19124,RSA5
4,,ROW 2 STY MASONRY,Single Family,62.0,,79.0,0,0,4.0,0.0,...,F,1264.0,960.0,,,I,1920,,19142,RM1


## 3.) Summary Statistics

In [6]:
# 3a.) Prepare the dataframe loaded in step 2 for the summary statistics table
# Parse the 'sale_date' column as datetimes, then keep only the year of each sale
# NOTE: approach taken from https://stackoverflow.com/questions/25146121/extracting-just-month-and-year-separately-from-pandas-datetime-column
sale_date_index = pd.DatetimeIndex(housing_df['sale_date'])
housing_df['sale_year'] = sale_date_index.year
# Show how many sales fall in each year as a quick sanity check
housing_df['sale_year'].value_counts()

2018    16283
2019    15787
2017    15619
2016    14060
2020     6736
Name: sale_year, dtype: int64

In [7]:
# 3b.) Build the per-year summary statistics table for sale price
# Keep only the sale price and sale year columns, exposing the year under the label 'Year'
summary_df = (
    housing_df
    .loc[:, ['sale_price', 'sale_year']]
    .rename(columns={'sale_year': 'Year'})
)

# Group the sales by the year in which they occurred
summary_df_groupby_obj = summary_df.groupby(by='Year')

# Compute count, mean, median and standard deviation of sale price for each year
summary_table_df = summary_df_groupby_obj.agg(['count', 'mean', 'median', 'std'])

# Give the statistic columns readable labels.
# The labels are applied by position, so they must stay in the same order
# as the statistics passed to agg() above.
summary_table_df.columns = [
    'Total Number of Sales',
    'Mean Sale Price',
    'Median Sale Price',
    'Standard Deviation of Sale Price',
]

In [8]:
# 3c.) Reformat data and output
# Display format for each column: thousands separators and no decimals,
# with a '$' prefix on the price columns.
summary_column_formats = {
    'Total Number of Sales': '{:,.0f}',
    'Mean Sale Price': '${:,.0f}',
    'Median Sale Price': '${:,.0f}',
    'Standard Deviation of Sale Price': '${:,.0f}',
}

# Formatting replaces the numeric values with strings in place. Applying a
# float format to an already-formatted string raises ValueError, so only
# columns that are still numeric are formatted. This keeps the cell safe to
# re-run while leaving the table exactly as before after the first run.
for column_name, column_format in summary_column_formats.items():
    if pd.api.types.is_numeric_dtype(summary_table_df[column_name]):
        summary_table_df[column_name] = summary_table_df[column_name].map(column_format.format)

summary_table_df

Unnamed: 0_level_0,Total Number of Sales,Mean Sale Price,Median Sale Price,Standard Deviation of Sale Price
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,14060,"$177,132","$125,000","$240,260"
2017,15619,"$219,403","$140,000","$408,161"
2018,16283,"$206,449","$155,000","$272,475"
2019,15787,"$221,614","$153,000","$433,686"
2020,6736,"$257,225","$170,000","$429,846"
