In [1]:
# import data science libraries
import numpy as np
import pandas as pd

import re

import os.path
from os import path

from datetime import datetime

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer
from sklearn.cluster import KMeans

import wrangle as wr
import preprocessing_permits as pr
import explore as ex
import model as mo

import warnings
# NOTE(review): "ignore" silences every warning category for the rest of the
# session, including deprecation/future warnings raised by pandas calls made
# later in this notebook. Consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Notebook-wide display defaults:
#   * never truncate the column list when rendering a DataFrame
#   * default figure size of 16 x 8 inches
#   * seaborn's "colorblind" palette for every plot
pd.options.display.max_columns = None
plt.rcParams["figure.figsize"] = (16, 8)
sns.set_palette("colorblind")

In [3]:
# Load the building-permit data through the project-local `wrangle` module
# (its implementation is not part of this notebook), report the frame's
# dimensions, and leave the frame as the cell's displayed value.
df = wr.acquire_building_permits()
print(f"""Our DataFrame contains {df.shape[0]:,} observations and {df.shape[1]} features.""")
df

Our DataFrame contains 8,382 observations and 29 features.


Unnamed: 0,survey_date,csa_code,cbsa_code,moncov,cbsa_name,one_unit_bldgs_est,one_unit_units_est,one_unit_value_est,two_units_bldgs_est,two_units_units_est,two_units_value_est,three_to_four_units_bldgs_est,three_to_four_units_units_est,three_to_four_units_value_est,five_or_more_units_bldgs_est,five_or_more_units_units_est,five_or_more_units_value_est,one_unit_bldgs_rep,one_unit_units_rep,one_unit_value_rep,two_units_bldgs_rep,two_units_units_rep,two_units_value_rep,three_to_four_units_bldgs_rep,three_to_four_units_units_rep,three_to_four_units_value_rep,five_or_more_units_bldgs_rep,five_or_more_units_units_rep,five_or_more_units_value_rep
0,2019,104.0,10580.0,True,Albany-Schenectady-Troy NY,1120.0,1120.0,309397.0,20.0,40.0,7644.0,12.0,45.0,6074.0,48.0,665.0,60456.0,984.0,984.0,268946.0,18.0,36.0,6544.0,12.0,45.0,6074.0,34.0,580.0,56469.0
1,2019,430.0,48260.0,False,Weirton-Steubenville WV-OH,25.0,25.0,5782.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,25.0,5782.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2019,999.0,10180.0,False,Abilene TX,354.0,354.0,72824.0,8.0,16.0,2093.0,0.0,0.0,0.0,0.0,0.0,0.0,353.0,353.0,72596.0,8.0,16.0,2093.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2019,566.0,49660.0,False,Youngstown-Warren-Boardman OH-PA,323.0,323.0,73182.0,2.0,4.0,407.0,1.0,3.0,467.0,0.0,0.0,0.0,234.0,234.0,50054.0,2.0,4.0,407.0,1.0,3.0,467.0,0.0,0.0,0.0
4,2019,558.0,48700.0,False,Williamsport PA,66.0,66.0,16215.0,6.0,12.0,1610.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0,49.0,12095.0,6.0,12.0,1610.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8377,1997,6240.0,9999.0,False,Pine Bluff AR MSA,100.0,100.0,4638.0,10.0,20.0,405.0,0.0,0.0,0.0,1.0,96.0,1826.0,100.0,100.0,4638.0,10.0,20.0,405.0,0.0,0.0,0.0,1.0,96.0,1826.0
8378,1997,6280.0,9999.0,False,Pittsburgh PA MSA,4432.0,4432.0,551173.0,66.0,132.0,9580.0,52.0,194.0,12291.0,102.0,1306.0,65021.0,3705.0,3705.0,481092.0,60.0,120.0,9251.0,45.0,173.0,12002.0,81.0,1111.0,56757.0
8379,1997,6320.0,9999.0,False,Pittsfield MA MSA,97.0,97.0,11578.0,3.0,6.0,342.0,0.0,0.0,0.0,1.0,120.0,5750.0,80.0,80.0,9447.0,3.0,6.0,342.0,0.0,0.0,0.0,1.0,120.0,5750.0
8380,1997,6340.0,9999.0,False,Pocatello ID MSA,263.0,263.0,23778.0,8.0,16.0,870.0,20.0,80.0,3630.0,2.0,14.0,543.0,263.0,263.0,23778.0,8.0,16.0,870.0,20.0,80.0,3630.0,2.0,14.0,543.0


In [4]:
def _split_once(labels, sep):
    """Split each string in `labels` at the first occurrence of `sep`.

    Returns a two-column frame: column 0 is the text before `sep`, column 1
    the text after it (null where `sep` does not occur).

    `n` and `expand` are passed by keyword because pandas 2.0 made every
    argument of `Series.str.split` after `pat` keyword-only; the positional
    form `split(sep, 1, expand=True)` raises a TypeError there.
    `reindex` guarantees that both columns exist even when no label contains
    `sep`, so indexing `[1]` cannot raise a KeyError.
    """
    return labels.str.split(sep, n=1, expand=True).reindex(columns=[0, 1])


# cbsa_name -> city part / state part (the label is split on a double space).
city_state = _split_once(df.cbsa_name, "  ")
df["city"] = city_state[0]
df["state"] = city_state[1]

# The first hyphen-delimited token is kept as the principal city.
df["major_city"] = _split_once(df.city, "-")[0]

# Same for the state part: keep the text before the first hyphen, then peel
# off anything after the first space into `metropolitan_area`
# (presumably a qualifier such as "MSA" in older labels - confirm).
# The previous intermediate assignment of `metropolitan_area` from the
# hyphen split was immediately overwritten, so it is no longer computed.
state_parts = _split_once(_split_once(df.state, "-")[0], " ")
df["major_state"] = state_parts[0]
df["metropolitan_area"] = state_parts[1]

In [5]:
# Spot-check the derived columns on one metro: rows whose principal
# city/state came out as York, PA.
df.loc[df.major_city.eq("York") & df.major_state.eq("PA")]

Unnamed: 0,survey_date,csa_code,cbsa_code,moncov,cbsa_name,one_unit_bldgs_est,one_unit_units_est,one_unit_value_est,two_units_bldgs_est,two_units_units_est,two_units_value_est,three_to_four_units_bldgs_est,three_to_four_units_units_est,three_to_four_units_value_est,five_or_more_units_bldgs_est,five_or_more_units_units_est,five_or_more_units_value_est,one_unit_bldgs_rep,one_unit_units_rep,one_unit_value_rep,two_units_bldgs_rep,two_units_units_rep,two_units_value_rep,three_to_four_units_bldgs_rep,three_to_four_units_units_rep,three_to_four_units_value_rep,five_or_more_units_bldgs_rep,five_or_more_units_units_rep,five_or_more_units_value_rep,city,state,major_city,major_state,metropolitan_area
223,2019,276.0,49620.0,False,York-Hanover PA,858.0,858.0,158812.0,10.0,20.0,3365.0,13.0,46.0,6003.0,25.0,389.0,26196.0,797.0,797.0,145328.0,10.0,20.0,3365.0,13.0,46.0,6003.0,25.0,389.0,26196.0,York-Hanover,PA,York,PA,
543,2018,276.0,49620.0,False,York-Hanover PA,788.0,788.0,154074.0,21.0,42.0,5293.0,16.0,60.0,5623.0,2.0,26.0,2160.0,761.0,761.0,148972.0,21.0,42.0,5293.0,16.0,60.0,5623.0,2.0,26.0,2160.0,York-Hanover,PA,York,PA,
927,2017,276.0,49620.0,False,York-Hanover PA,678.0,678.0,149906.0,11.0,22.0,2234.0,14.0,55.0,5070.0,15.0,83.0,10204.0,575.0,575.0,123932.0,11.0,22.0,2234.0,14.0,55.0,5070.0,15.0,83.0,10204.0,York-Hanover,PA,York,PA,
1310,2016,276.0,49620.0,False,York-Hanover PA,769.0,769.0,144941.0,8.0,16.0,2019.0,15.0,59.0,7762.0,3.0,34.0,1566.0,769.0,769.0,144941.0,8.0,16.0,2019.0,15.0,59.0,7762.0,3.0,34.0,1566.0,York-Hanover,PA,York,PA,
1692,2015,276.0,49620.0,False,York-Hanover PA,707.0,707.0,132269.0,1.0,2.0,177.0,4.0,16.0,1939.0,3.0,53.0,3185.0,689.0,689.0,128355.0,1.0,2.0,177.0,4.0,16.0,1939.0,3.0,53.0,3185.0,York-Hanover,PA,York,PA,
2071,2014,276.0,49620.0,False,York-Hanover PA,720.0,720.0,131120.0,3.0,6.0,758.0,3.0,12.0,1303.0,9.0,78.0,5788.0,717.0,717.0,130581.0,3.0,6.0,758.0,3.0,12.0,1303.0,9.0,78.0,5788.0,York-Hanover,PA,York,PA,
2354,2013,564.0,49620.0,False,York-Hanover PA,744.0,744.0,124464.0,2.0,4.0,404.0,15.0,55.0,5644.0,11.0,125.0,7035.0,744.0,744.0,124464.0,2.0,4.0,404.0,15.0,55.0,5644.0,11.0,125.0,7035.0,York-Hanover,PA,York,PA,
2720,2012,564.0,49620.0,False,York-Hanover PA,695.0,695.0,110416.0,2.0,4.0,560.0,15.0,49.0,3265.0,6.0,57.0,5326.0,695.0,695.0,110416.0,2.0,4.0,560.0,15.0,49.0,3265.0,6.0,57.0,5326.0,York-Hanover,PA,York,PA,
3086,2011,564.0,49620.0,False,York-Hanover PA,582.0,582.0,98395.0,1.0,2.0,61.0,2.0,8.0,757.0,8.0,46.0,4163.0,566.0,566.0,95752.0,1.0,2.0,61.0,2.0,8.0,757.0,8.0,46.0,4163.0,York-Hanover,PA,York,PA,
3453,2010,564.0,49620.0,False,York-Hanover PA,858.0,858.0,146564.0,0.0,0.0,0.0,1.0,4.0,360.0,11.0,197.0,18407.0,837.0,837.0,142784.0,0.0,0.0,0.0,1.0,4.0,360.0,11.0,197.0,18407.0,York-Hanover,PA,York,PA,


In [6]:
# Run the project-local prep step on the acquired frame and rebind `df` to
# its result (the pre-prep frame is no longer reachable under this name).
# Per the printed shapes the row count drops from 8,382 to 8,269.
# NOTE(review): what the helper filters or reshapes is not visible here.
df = wr.prep_building_permits(df)
print(f"""Our DataFrame contains {df.shape[0]:,} observations and {df.shape[1]} features.""")
df

Our DataFrame contains 8,269 observations and 30 features.


Unnamed: 0,major_city,major_state,survey_date,csa_code,cbsa_code,moncov,one_unit_bldgs_est,one_unit_units_est,one_unit_value_est,two_units_bldgs_est,two_units_units_est,two_units_value_est,three_to_four_units_bldgs_est,three_to_four_units_units_est,three_to_four_units_value_est,five_or_more_units_bldgs_est,five_or_more_units_units_est,five_or_more_units_value_est,one_unit_bldgs_rep,one_unit_units_rep,one_unit_value_rep,two_units_bldgs_rep,two_units_units_rep,two_units_value_rep,three_to_four_units_bldgs_rep,three_to_four_units_units_rep,three_to_four_units_value_rep,five_or_more_units_bldgs_rep,five_or_more_units_units_rep,five_or_more_units_value_rep
0,Abilene,TX,1997,40.0,9999.0,False,215.0,215.0,25825.0,0.0,0.0,0.0,1.0,3.0,125.0,0.0,0.0,0.0,215.0,215.0,25825.0,0.0,0.0,0.0,1.0,3.0,125.0,0.0,0.0,0.0
1,Abilene,TX,1998,40.0,9999.0,False,250.0,250.0,32745.0,2.0,4.0,180.0,0.0,0.0,0.0,0.0,0.0,0.0,250.0,250.0,32745.0,2.0,4.0,180.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Abilene,TX,1999,40.0,9999.0,False,179.0,179.0,25372.0,3.0,6.0,382.0,0.0,0.0,0.0,0.0,0.0,0.0,179.0,179.0,25372.0,3.0,6.0,382.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Abilene,TX,2000,40.0,9999.0,False,164.0,164.0,22306.0,9.0,18.0,1140.0,0.0,0.0,0.0,15.0,192.0,10200.0,164.0,164.0,22306.0,9.0,18.0,1140.0,0.0,0.0,0.0,15.0,192.0,10200.0
4,Abilene,TX,2001,40.0,9999.0,False,172.0,172.0,25539.0,28.0,56.0,3286.0,41.0,164.0,7631.0,13.0,192.0,6333.0,172.0,172.0,25539.0,28.0,56.0,3286.0,41.0,164.0,7631.0,13.0,192.0,6333.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8264,Yuma,AZ,2015,999.0,49740.0,False,765.0,765.0,117198.0,0.0,0.0,0.0,1.0,3.0,106.0,0.0,0.0,0.0,765.0,765.0,117198.0,0.0,0.0,0.0,1.0,3.0,106.0,0.0,0.0,0.0
8265,Yuma,AZ,2016,999.0,49740.0,False,890.0,890.0,137493.0,0.0,0.0,0.0,2.0,7.0,712.0,0.0,0.0,0.0,824.0,824.0,131881.0,0.0,0.0,0.0,2.0,7.0,712.0,0.0,0.0,0.0
8266,Yuma,AZ,2017,999.0,49740.0,False,1005.0,1005.0,163723.0,2.0,4.0,245.0,6.0,24.0,3118.0,8.0,68.0,8986.0,993.0,993.0,161953.0,1.0,2.0,245.0,6.0,24.0,3118.0,8.0,68.0,8986.0
8267,Yuma,AZ,2018,999.0,49740.0,False,1011.0,1011.0,160289.0,2.0,4.0,262.0,0.0,0.0,0.0,0.0,0.0,0.0,1011.0,1011.0,160289.0,2.0,4.0,262.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Same York, PA spot-check, now against the prepared frame.
df.loc[df.major_city.eq("York") & df.major_state.eq("PA")]

Unnamed: 0,major_city,major_state,survey_date,csa_code,cbsa_code,moncov,one_unit_bldgs_est,one_unit_units_est,one_unit_value_est,two_units_bldgs_est,two_units_units_est,two_units_value_est,three_to_four_units_bldgs_est,three_to_four_units_units_est,three_to_four_units_value_est,five_or_more_units_bldgs_est,five_or_more_units_units_est,five_or_more_units_value_est,one_unit_bldgs_rep,one_unit_units_rep,one_unit_value_rep,two_units_bldgs_rep,two_units_units_rep,two_units_value_rep,three_to_four_units_bldgs_rep,three_to_four_units_units_rep,three_to_four_units_value_rep,five_or_more_units_bldgs_rep,five_or_more_units_units_rep,five_or_more_units_value_rep
8177,York,PA,1997,9280.0,9999.0,False,1619.0,1619.0,165567.0,49.0,98.0,5878.0,21.0,83.0,3838.0,16.0,120.0,6016.0,1489.0,1489.0,151527.0,46.0,92.0,5566.0,21.0,83.0,3838.0,16.0,120.0,6016.0
8178,York,PA,1998,9280.0,9999.0,False,1849.0,1849.0,196757.0,30.0,60.0,3298.0,11.0,44.0,2236.0,9.0,129.0,5882.0,1629.0,1629.0,174141.0,21.0,42.0,2301.0,11.0,44.0,2236.0,9.0,129.0,5882.0
8179,York,PA,1999,9280.0,9999.0,False,1964.0,1964.0,214061.0,22.0,44.0,2711.0,12.0,47.0,2369.0,25.0,173.0,7354.0,1803.0,1803.0,196346.0,22.0,44.0,2711.0,12.0,47.0,2369.0,25.0,173.0,7354.0
8180,York,PA,2000,9280.0,9999.0,False,1714.0,1714.0,196263.0,32.0,64.0,4430.0,15.0,59.0,3765.0,14.0,172.0,12018.0,1594.0,1594.0,179870.0,32.0,64.0,4430.0,14.0,55.0,3765.0,14.0,172.0,12018.0
8181,York,PA,2001,9280.0,9999.0,False,2036.0,2036.0,231661.0,25.0,50.0,5946.0,6.0,23.0,1065.0,9.0,76.0,6471.0,1809.0,1809.0,209912.0,23.0,46.0,5706.0,5.0,20.0,1065.0,9.0,76.0,6471.0
8182,York,PA,2002,9280.0,9999.0,False,2423.0,2423.0,295783.0,74.0,148.0,12420.0,8.0,29.0,1131.0,26.0,356.0,16440.0,2198.0,2198.0,276505.0,74.0,148.0,12420.0,6.0,22.0,1131.0,26.0,356.0,16440.0
8183,York,PA,2003,564.0,49620.0,False,2708.0,2708.0,344121.0,20.0,40.0,2338.0,2.0,7.0,240.0,38.0,393.0,20486.0,2572.0,2572.0,325031.0,20.0,40.0,2338.0,1.0,4.0,240.0,37.0,388.0,19952.0
8184,York,PA,2004,564.0,49620.0,False,2899.0,2899.0,400026.0,13.0,26.0,1984.0,4.0,14.0,968.0,18.0,148.0,9001.0,2719.0,2719.0,375566.0,13.0,26.0,1984.0,3.0,11.0,968.0,18.0,148.0,9001.0
8185,York,PA,2005,564.0,49620.0,False,2870.0,2870.0,461702.0,48.0,96.0,5199.0,14.0,55.0,2717.0,14.0,95.0,10995.0,2628.0,2628.0,424793.0,48.0,96.0,5199.0,14.0,55.0,2717.0,11.0,80.0,9495.0
8186,York,PA,2006,564.0,49620.0,False,2467.0,2467.0,444909.0,42.0,84.0,6708.0,20.0,80.0,6712.0,23.0,272.0,28591.0,2285.0,2285.0,415223.0,42.0,84.0,6708.0,20.0,80.0,6712.0,17.0,242.0,25146.0


In [8]:
# Show the first 46 rows of the prepared frame.
# NOTE(review): 46 is presumably two metros x 23 survey years (1997-2019) - confirm.
df.iloc[:46]

Unnamed: 0,major_city,major_state,survey_date,csa_code,cbsa_code,moncov,one_unit_bldgs_est,one_unit_units_est,one_unit_value_est,two_units_bldgs_est,two_units_units_est,two_units_value_est,three_to_four_units_bldgs_est,three_to_four_units_units_est,three_to_four_units_value_est,five_or_more_units_bldgs_est,five_or_more_units_units_est,five_or_more_units_value_est,one_unit_bldgs_rep,one_unit_units_rep,one_unit_value_rep,two_units_bldgs_rep,two_units_units_rep,two_units_value_rep,three_to_four_units_bldgs_rep,three_to_four_units_units_rep,three_to_four_units_value_rep,five_or_more_units_bldgs_rep,five_or_more_units_units_rep,five_or_more_units_value_rep
0,Abilene,TX,1997,40.0,9999.0,False,215.0,215.0,25825.0,0.0,0.0,0.0,1.0,3.0,125.0,0.0,0.0,0.0,215.0,215.0,25825.0,0.0,0.0,0.0,1.0,3.0,125.0,0.0,0.0,0.0
1,Abilene,TX,1998,40.0,9999.0,False,250.0,250.0,32745.0,2.0,4.0,180.0,0.0,0.0,0.0,0.0,0.0,0.0,250.0,250.0,32745.0,2.0,4.0,180.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Abilene,TX,1999,40.0,9999.0,False,179.0,179.0,25372.0,3.0,6.0,382.0,0.0,0.0,0.0,0.0,0.0,0.0,179.0,179.0,25372.0,3.0,6.0,382.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Abilene,TX,2000,40.0,9999.0,False,164.0,164.0,22306.0,9.0,18.0,1140.0,0.0,0.0,0.0,15.0,192.0,10200.0,164.0,164.0,22306.0,9.0,18.0,1140.0,0.0,0.0,0.0,15.0,192.0,10200.0
4,Abilene,TX,2001,40.0,9999.0,False,172.0,172.0,25539.0,28.0,56.0,3286.0,41.0,164.0,7631.0,13.0,192.0,6333.0,172.0,172.0,25539.0,28.0,56.0,3286.0,41.0,164.0,7631.0,13.0,192.0,6333.0
5,Abilene,TX,2002,40.0,9999.0,False,153.0,153.0,23080.0,5.0,10.0,525.0,6.0,18.0,1000.0,0.0,0.0,0.0,153.0,153.0,23080.0,5.0,10.0,525.0,6.0,18.0,1000.0,0.0,0.0,0.0
6,Abilene,TX,2003,999.0,10180.0,False,199.0,199.0,29826.0,11.0,22.0,1439.0,0.0,0.0,0.0,0.0,0.0,0.0,191.0,191.0,29135.0,11.0,22.0,1439.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Abilene,TX,2004,999.0,10180.0,False,283.0,283.0,41533.0,4.0,8.0,611.0,0.0,0.0,0.0,0.0,0.0,0.0,280.0,280.0,41262.0,4.0,8.0,611.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Abilene,TX,2005,999.0,10180.0,False,310.0,310.0,48199.0,11.0,22.0,1639.0,0.0,0.0,0.0,1.0,14.0,1200.0,305.0,305.0,47711.0,10.0,20.0,1445.0,0.0,0.0,0.0,1.0,14.0,1200.0
9,Abilene,TX,2006,999.0,10180.0,False,333.0,333.0,58363.0,8.0,16.0,1394.0,0.0,0.0,0.0,1.0,16.0,1500.0,331.0,331.0,58147.0,7.0,14.0,1200.0,0.0,0.0,0.0,1.0,16.0,1500.0


In [9]:
# Build the modeling frame via the project-local preprocessing module.
# The helper is called with no arguments, so the `df` assembled in the cells
# above is replaced here rather than transformed.
# NOTE(review): presumably the helper re-runs acquire/prep internally - confirm.
df = pr.get_permits_model_df()
print(f"""Our modeling DataFrame contains {df.shape[0]:,} observations & {df.shape[1]} features""")
df.head(46)

Our modeling DataFrame contains 8,269 observations & 6 features


Unnamed: 0,city,state,year,total_high_density_bldgs,total_high_density_units,total_high_density_value
0,Abilene,TX,1997,0.0,0.0,0.0
1,Abilene,TX,1998,0.0,0.0,0.0
2,Abilene,TX,1999,0.0,0.0,0.0
3,Abilene,TX,2000,15.0,192.0,10200000.0
4,Abilene,TX,2001,13.0,192.0,6333000.0
5,Abilene,TX,2002,0.0,0.0,0.0
6,Abilene,TX,2003,0.0,0.0,0.0
7,Abilene,TX,2004,0.0,0.0,0.0
8,Abilene,TX,2005,1.0,14.0,1200000.0
9,Abilene,TX,2006,1.0,16.0,1500000.0


In [10]:
# Year-over-year percent change of total_high_density_value within each
# (city, state) series. Rows are ordered by year before grouping; the result
# is aligned back onto `df` by index.
# A previous-year value of 0 yields inf (0 -> positive) or NaN (0 -> 0), and
# the first year of each series is NaN.
value_by_metro = df.sort_values(["year"]).groupby(["city", "state"])["total_high_density_value"]
df["alec_test"] = value_by_metro.pct_change()

In [11]:
# Inspect the last 46 rows to eyeball the new percent-change column.
df.iloc[-46:]

Unnamed: 0,city,state,year,total_high_density_bldgs,total_high_density_units,total_high_density_value,alec_test
8223,Yuba City,CA,1997,3.0,28.0,1111000.0,
8224,Yuba City,CA,1998,0.0,0.0,0.0,-1.0
8225,Yuba City,CA,1999,15.0,126.0,6752000.0,inf
8226,Yuba City,CA,2000,0.0,0.0,0.0,-1.0
8227,Yuba City,CA,2001,0.0,0.0,0.0,
8228,Yuba City,CA,2002,0.0,0.0,0.0,
8229,Yuba City,CA,2003,12.0,148.0,10281000.0,inf
8230,Yuba City,CA,2004,3.0,17.0,747000.0,-0.927342
8231,Yuba City,CA,2005,4.0,31.0,1602000.0,1.144578
8232,Yuba City,CA,2006,6.0,36.0,2019000.0,0.2603


In [12]:
# Percent change of total_high_density_value within each (city, state, year)
# group, after ordering rows by year.
# NOTE(review): "year" is part of the grouping key, so each group only holds
# the rows for one city/state/year. If that combination is unique per row
# (the displayed tables suggest it is), pct_change() never has a previous row
# and the column is entirely NaN - confirm whether the grouping was meant to
# be (city, state) only.
df["new_field"] = (
    df.sort_values(["year"])
    .groupby(["city", "state", "year"])["total_high_density_value"]
    .pct_change()
)

In [13]:
# Hand check of a percent change: (new - old) / old.
# NOTE(review): neither operand appears in the outputs shown in this notebook;
# presumably two consecutive total_high_density_value figures - confirm which rows.
(7485000.0 - 4566000.0) / 4566000.0

0.6392904073587385

In [14]:
# Second hand check of a percent change: (new - old) / old.
# NOTE(review): the operands are not visible in the displayed outputs - confirm
# which rows they were taken from.
(12492000.0 - 30583000.0) / 30583000.0

-0.5915377824281464

In [15]:
# Hand check of the ratio (1 + x) / (1 + y), the same form used further down
# to verify the `ei` column.
# NOTE(review): 0.231085 matches the 1998 market_volume_delta_pct displayed by
# a later cell; the origin of 2.034637 is not visible here - confirm.
(1 + 2.034637) / (1 + 0.231085)

2.4650101333376657

In [16]:
# Let the project-local preprocessing module append its derived columns and
# rebind `df` to the result. Per the displayed header the frame gains
# avg_units_per_bldg, value_per_bldg, value_per_unit, the three
# city_state_*_delta_pct columns, market_volume, market_volume_delta_pct and ei
# (8 -> 17 columns; the scratch columns alec_test and new_field are carried along).
df = pr.add_new_features(df)
print(f"""Our modeling DataFrame contains {df.shape[0]:,} observations & {df.shape[1]} features""")
df.head(46)

Our modeling DataFrame contains 8,269 observations & 17 features


Unnamed: 0,city,state,year,total_high_density_bldgs,total_high_density_units,total_high_density_value,alec_test,new_field,avg_units_per_bldg,value_per_bldg,value_per_unit,city_state_high_density_bldgs_delta_pct,city_state_high_density_units_delta_pct,city_state_high_density_value_delta_pct,market_volume,market_volume_delta_pct,ei
0,Abilene,TX,1997,0.0,0.0,0.0,,,,,,,,,20549240000.0,,
1,Abilene,TX,1998,0.0,0.0,0.0,,,,,,,,,25297870000.0,0.231085,
2,Abilene,TX,1999,0.0,0.0,0.0,,,,,,,,,26095900000.0,0.031545,
3,Abilene,TX,2000,15.0,192.0,10200000.0,inf,,12.8,680000.0,53125.0,inf,inf,inf,27422040000.0,0.050818,inf
4,Abilene,TX,2001,13.0,192.0,6333000.0,-0.379118,,14.769231,487153.8,32984.375,-0.133333,0.0,-0.379118,29131030000.0,0.062322,0.584458
5,Abilene,TX,2002,0.0,0.0,0.0,-1.0,,,,,-1.0,-1.0,-1.0,31695040000.0,0.088016,0.0
6,Abilene,TX,2003,0.0,0.0,0.0,,,,,,,,,23050500000.0,-0.272741,
7,Abilene,TX,2004,0.0,0.0,0.0,,,,,,,,,27238560000.0,0.181691,
8,Abilene,TX,2005,1.0,14.0,1200000.0,inf,,14.0,1200000.0,85714.285714,inf,inf,inf,34353080000.0,0.261193,inf
9,Abilene,TX,2006,1.0,16.0,1500000.0,0.25,,16.0,1500000.0,93750.0,0.0,0.142857,0.25,35494080000.0,0.033214,1.209817


In [17]:
# Hand check of `ei` for Abilene, TX in 2001 using the values displayed above:
# (1 + city_state_high_density_value_delta_pct) / (1 + market_volume_delta_pct)
#   = (1 + -0.379118) / (1 + 0.062322), which reproduces the table's 0.584458.
(1 + -0.379118) / (1 + 0.062322)

0.5844574432234294

In [18]:
# Sum of total_high_density_value per year over every row in the frame.
# These totals line up with the market_volume column shown above
# (e.g. 1997 -> 2.054924e+10).
df.groupby("year")["total_high_density_value"].sum()

year
1997    2.054924e+10
1998    2.529787e+10
1999    2.609590e+10
2000    2.742204e+10
2001    2.913103e+10
2002    3.169504e+10
2003    2.305050e+10
2004    2.723856e+10
2005    3.435308e+10
2006    3.549408e+10
2007    3.375194e+10
2008    2.566581e+10
2009    1.005604e+10
2010    1.165572e+10
2011    1.574184e+10
2012    2.569934e+10
2013    3.321102e+10
2014    3.958652e+10
2015    5.200240e+10
2016    4.928300e+10
2017    5.158824e+10
2018    5.336251e+10
2019    5.956964e+10
Name: total_high_density_value, dtype: float64

In [19]:
# Restrict the modeling frame with the project-local "top cities" filter and
# rebind `df` to the result. Per the printed shapes the frame shrinks from
# 8,269 to 2,990 rows and gains a `city_state` column (17 -> 18 columns).
# NOTE(review): the selection criterion is not visible here; 2,990 rows would
# be 130 metros if each retained metro has all 23 years - confirm.
df = pr.filter_top_cities_building_permits(df)
print(f"""Our modeling DataFrame contains {df.shape[0]:,} observations & {df.shape[1]} features""")
df.tail()

Our modeling DataFrame contains 2,990 observations & 18 features


Unnamed: 0,city,state,year,total_high_density_bldgs,total_high_density_units,total_high_density_value,alec_test,new_field,avg_units_per_bldg,value_per_bldg,value_per_unit,city_state_high_density_bldgs_delta_pct,city_state_high_density_units_delta_pct,city_state_high_density_value_delta_pct,market_volume,market_volume_delta_pct,ei,city_state
2985,York,PA,2015,3.0,53.0,3185000.0,-0.449724,,17.666667,1061667.0,60094.339623,-0.666667,-0.320513,-0.449724,52002400000.0,0.313639,0.418895,York_PA
2986,York,PA,2016,3.0,34.0,1566000.0,-0.50832,,11.333333,522000.0,46058.823529,0.0,-0.358491,-0.50832,49283000000.0,-0.052294,0.51881,York_PA
2987,York,PA,2017,15.0,83.0,10204000.0,5.515964,,5.533333,680266.7,122939.759036,4.0,1.441176,5.515964,51588240000.0,0.046775,6.224796,York_PA
2988,York,PA,2018,2.0,26.0,2160000.0,-0.788318,,13.0,1080000.0,83076.923077,-0.866667,-0.686747,-0.788318,53362510000.0,0.034393,0.204643,York_PA
2989,York,PA,2019,25.0,389.0,26196000.0,11.127778,,15.56,1047840.0,67341.902314,11.5,13.961538,11.127778,59569640000.0,0.11632,10.864068,York_PA


In [20]:
# Hand check of market_volume_delta_pct for 2016: (total_2016 - total_2015) / total_2015,
# using the all-metro yearly totals printed earlier (4.928300e+10 and 5.200240e+10).
# The result matches the -0.052294 shown on the 2016 row above.
(4.928300e+10 - 5.200240e+10) / 5.200240e+10

-0.05229374028890974

In [21]:
# Hand check of `ei` for York, PA in 2016 using the values displayed above:
# (1 + city_state_high_density_value_delta_pct) / (1 + market_volume_delta_pct)
#   = (1 + -0.508320) / (1 + -0.052294), which reproduces the table's 0.51881.
(1 + -0.508320) / (1 + -0.052294)

0.518810686014439

In [22]:
# Yearly totals again, now over the filtered frame only. They are lower than
# the values in the market_volume column, which still shows the totals from
# before the filter (e.g. 2015: 4.709459e+10 here vs. 5.200240e+10 there).
df.groupby("year")["total_high_density_value"].sum()

year
1997    1.750160e+10
1998    2.181240e+10
1999    2.247141e+10
2000    2.364117e+10
2001    2.528117e+10
2002    2.676417e+10
2003    1.932928e+10
2004    2.220160e+10
2005    2.798906e+10
2006    3.100532e+10
2007    2.960063e+10
2008    2.262328e+10
2009    8.549583e+09
2010    9.750298e+09
2011    1.377148e+10
2012    2.299483e+10
2013    2.987518e+10
2014    3.625187e+10
2015    4.709459e+10
2016    4.439847e+10
2017    4.659169e+10
2018    4.795771e+10
2019    5.239455e+10
Name: total_high_density_value, dtype: float64