# Building Permit Data

## Documentation

[United States Census Bureau Building Permits Survey](https://www.census.gov/construction/bps/)

[ASCII files by State, Metropolitan Statistical Area (MSA), County or Place](https://www2.census.gov/econ/bps/)

[MSA Folder](https://www2.census.gov/econ/bps/Metro/)

[ASCII MSA Documentation](https://www2.census.gov/econ/bps/Documentation/msaasc.pdf)

In [1]:
import numpy as np
import pandas as pd

import re

import os.path
from os import path

from datetime import datetime

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer
from sklearn.cluster import KMeans

import wrangle as wr
import preprocessing_permits as pr
import explore as ex
import model as mo

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Notebook-wide display option: show every column of a wide DataFrame instead of
# eliding the middle ones (None removes the column limit).
pd.set_option("display.max_columns", None)

In [3]:
def _split_part(series, sep, position):
    """Split every string in ``series`` once on ``sep`` and return one side of it.

    Parameters
    ----------
    series : pandas.Series
        String values (missing values are allowed).
    sep : str
        Separator to split on; only the first occurrence is used.
    position : int
        0 for the text before the first ``sep``, 1 for the text after it.

    Returns
    -------
    pandas.Series
        Aligned with ``series``. Rows that are missing, or that have no text
        after ``sep`` when ``position`` is 1, are missing in the result. If no
        row at all yields the requested part, an all-missing Series is returned
        rather than raising ``KeyError``.
    """
    # `n` has to be given by keyword: pandas 2.0 made every argument after
    # `pat` keyword-only, so the positional form raises TypeError there.
    parts = series.str.split(sep, n=1, expand=True)
    if position in parts.columns:
        return parts[position]
    # With expand=True the frame only has as many columns as the longest split,
    # so the requested column can be absent when the separator never occurs.
    return pd.Series(np.nan, index=series.index, dtype=object)


def prep_building_permits(df):
    """Aggregate permit rows by leading city, leading state and survey date.

    ``cbsa_name`` is expected to look like ``"Albany-Schenectady-Troy  NY "``:
    a hyphen-separated city list, two spaces, then a hyphen-separated state
    list. The first city and the first state become the grouping keys
    ``major_city`` and ``major_state``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cbsa_name`` and ``survey_date``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per (``major_city``, ``major_state``, ``survey_date``) with the
        sum of every numeric column of ``df``. Non-numeric columns such as
        ``cbsa_name`` are not carried over. Rows whose name has no two-space
        separator (and therefore no state) have a missing key and are left out
        by the groupby.
    """
    city_list = _split_part(df.cbsa_name, "  ", 0)
    state_list = _split_part(df.cbsa_name, "  ", 1)

    major_city = _split_part(city_list, "-", 0)
    # First state of e.g. "WV-OH ", then cut at the first space so that the
    # trailing blank (and anything after it) is dropped from the key.
    major_state = _split_part(_split_part(state_list, "-", 0), " ", 0)

    # assign() returns a new frame, so the caller's DataFrame keeps its columns.
    keyed = df.assign(major_city=major_city, major_state=major_state)

    # numeric_only=True pins the behaviour this code was written against
    # (text columns are dropped instead of being concatenated by sum()).
    return (
        keyed.groupby(["major_city", "major_state", "survey_date"])
        .sum(numeric_only=True)
        .reset_index()
    )

## Acquire

In [4]:
# Load the building permits data through the project-local `wrangle` module (imported as `wr`),
# print its row/column counts, and display the frame as the cell output.
df = wr.acquire_building_permits()
print(f"""Our DataFrame contains {df.shape[0]:,} observations and {df.shape[1]} features.""")
df

Our DataFrame contains 14,149 observations and 29 features.


Unnamed: 0,survey_date,csa_code,cbsa_code,moncov,cbsa_name,one_unit_bldgs_est,one_unit_units_est,one_unit_value_est,two_units_bldgs_est,two_units_units_est,two_units_value_est,three_to_four_units_bldgs_est,three_to_four_units_units_est,three_to_four_units_value_est,five_or_more_units_bldgs_est,five_or_more_units_units_est,five_or_more_units_value_est,one_unit_bldgs_rep,one_unit_units_rep,one_unit_value_rep,two_units_bldgs_rep,two_units_units_rep,two_units_value_rep,three_to_four_units_bldgs_rep,three_to_four_units_units_rep,three_to_four_units_value_rep,five_or_more_units_bldgs_rep,five_or_more_units_units_rep,five_or_more_units_value_rep
0,2019,104.0,10580.0,True,Albany-Schenectady-Troy NY,1120.0,1120.0,309397.0,20.0,40.0,7644.0,12.0,45.0,6074.0,48.0,665.0,60456.0,984.0,984.0,268946.0,18.0,36.0,6544.0,12.0,45.0,6074.0,34.0,580.0,56469.0
1,2019,430.0,48260.0,False,Weirton-Steubenville WV-OH,25.0,25.0,5782.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,25.0,5782.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2019,999.0,10180.0,False,Abilene TX,354.0,354.0,72824.0,8.0,16.0,2093.0,0.0,0.0,0.0,0.0,0.0,0.0,353.0,353.0,72596.0,8.0,16.0,2093.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2019,566.0,49660.0,False,Youngstown-Warren-Boardman OH-PA,323.0,323.0,73182.0,2.0,4.0,407.0,1.0,3.0,467.0,0.0,0.0,0.0,234.0,234.0,50054.0,2.0,4.0,407.0,1.0,3.0,467.0,0.0,0.0,0.0
4,2019,558.0,48700.0,False,Williamsport PA,66.0,66.0,16215.0,6.0,12.0,1610.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0,49.0,12095.0,6.0,12.0,1610.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14144,1980,5745.0,9999.0,False,NORTHEAST PENNSYLVANIA SMSA,1146.0,1146.0,42642.0,3.0,6.0,91.0,6.0,23.0,440.0,5.0,627.0,15798.0,1055.0,1055.0,39843.0,3.0,6.0,91.0,6.0,23.0,440.0,5.0,627.0,15798.0
14145,1980,5720.0,9999.0,False,NORFOLK-VIRGINIA BEACH-,2806.0,2806.0,146250.0,110.0,220.0,6606.0,61.0,231.0,7896.0,201.0,1521.0,44621.0,2806.0,2806.0,146250.0,110.0,220.0,6606.0,61.0,231.0,7896.0,201.0,1521.0,44621.0
14146,1980,5680.0,9999.0,False,NEWPORT NEWS-HAMPTON SMSA,1435.0,1435.0,65952.0,2.0,4.0,30.0,0.0,0.0,0.0,25.0,192.0,3146.0,1435.0,1435.0,65952.0,2.0,4.0,30.0,0.0,0.0,0.0,25.0,192.0,3146.0
14147,1980,5640.0,9999.0,False,NEWARK SMSA,2156.0,2156.0,137423.0,59.0,118.0,3407.0,13.0,47.0,1927.0,34.0,1353.0,40279.0,2154.0,2154.0,137349.0,59.0,118.0,3407.0,13.0,47.0,1927.0,32.0,1343.0,40208.0


In [5]:
# Column-by-column summary: dtype and non-null count for each feature.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14149 entries, 0 to 14148
Data columns (total 29 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   survey_date                    14149 non-null  int64  
 1   csa_code                       14149 non-null  float64
 2   cbsa_code                      14149 non-null  float64
 3   moncov                         14149 non-null  bool   
 4   cbsa_name                      14149 non-null  object 
 5   one_unit_bldgs_est             14149 non-null  float64
 6   one_unit_units_est             14149 non-null  float64
 7   one_unit_value_est             14149 non-null  float64
 8   two_units_bldgs_est            14149 non-null  float64
 9   two_units_units_est            14149 non-null  float64
 10  two_units_value_est            14149 non-null  float64
 11  three_to_four_units_bldgs_est  14149 non-null  float64
 12  three_to_four_units_units_est  14149 non-null 

In [6]:
# Count the distinct `cbsa_name` strings (distinct spellings, not necessarily distinct places).
print(f"There are {len(df.cbsa_name.unique()):,} unique metropolitan areas in the DataFrame.")

There are 2,639 unique metropolitan areas in the DataFrame.


In [7]:
# Report the smallest and largest `survey_date` values present in the frame.
print(f"""This DataFrame contains survey data from {df.survey_date.min()} through {df.survey_date.max()}.""")

This DataFrame contains survey data from 1980 through 2019.


In [8]:
# First rows: names are a hyphenated city list, two spaces, a hyphenated state list, and a trailing space.
df.cbsa_name.head()

0          Albany-Schenectady-Troy  NY 
1          Weirton-Steubenville  WV-OH 
2                          Abilene  TX 
3    Youngstown-Warren-Boardman  OH-PA 
4                     Williamsport  PA 
Name: cbsa_name, dtype: object

In [9]:
# Last rows (1980 surveys): upper-case names with an "SMSA" suffix and no two-space/state part,
# so the city/state parsing used for the recent rows does not apply to them.
df.cbsa_name.tail()

14144    NORTHEAST PENNSYLVANIA SMSA
14145        NORFOLK-VIRGINIA BEACH-
14146      NEWPORT NEWS-HAMPTON SMSA
14147                    NEWARK SMSA
14148                  ROCKFORD SMSA
Name: cbsa_name, dtype: object

Unnamed: 0,survey_date,csa_code,cbsa_code,moncov,cbsa_name,one_unit_bldgs_est,one_unit_units_est,one_unit_value_est,two_units_bldgs_est,two_units_units_est,two_units_value_est,three_to_four_units_bldgs_est,three_to_four_units_units_est,three_to_four_units_value_est,five_or_more_units_bldgs_est,five_or_more_units_units_est,five_or_more_units_value_est,one_unit_bldgs_rep,one_unit_units_rep,one_unit_value_rep,two_units_bldgs_rep,two_units_units_rep,two_units_value_rep,three_to_four_units_bldgs_rep,three_to_four_units_units_rep,three_to_four_units_value_rep,five_or_more_units_bldgs_rep,five_or_more_units_units_rep,five_or_more_units_value_rep
0,2019,104.0,10580.0,True,Albany-Schenectady-Troy NY,1120.0,1120.0,309397.0,20.0,40.0,7644.0,12.0,45.0,6074.0,48.0,665.0,60456.0,984.0,984.0,268946.0,18.0,36.0,6544.0,12.0,45.0,6074.0,34.0,580.0,56469.0
1,2019,430.0,48260.0,False,Weirton-Steubenville WV-OH,25.0,25.0,5782.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,25.0,5782.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2019,999.0,10180.0,False,Abilene TX,354.0,354.0,72824.0,8.0,16.0,2093.0,0.0,0.0,0.0,0.0,0.0,0.0,353.0,353.0,72596.0,8.0,16.0,2093.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2019,566.0,49660.0,False,Youngstown-Warren-Boardman OH-PA,323.0,323.0,73182.0,2.0,4.0,407.0,1.0,3.0,467.0,0.0,0.0,0.0,234.0,234.0,50054.0,2.0,4.0,407.0,1.0,3.0,467.0,0.0,0.0,0.0
4,2019,558.0,48700.0,False,Williamsport PA,66.0,66.0,16215.0,6.0,12.0,1610.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0,49.0,12095.0,6.0,12.0,1610.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14144,1980,5745.0,9999.0,False,NORTHEAST PENNSYLVANIA SMSA,1146.0,1146.0,42642.0,3.0,6.0,91.0,6.0,23.0,440.0,5.0,627.0,15798.0,1055.0,1055.0,39843.0,3.0,6.0,91.0,6.0,23.0,440.0,5.0,627.0,15798.0
14145,1980,5720.0,9999.0,False,NORFOLK-VIRGINIA BEACH-,2806.0,2806.0,146250.0,110.0,220.0,6606.0,61.0,231.0,7896.0,201.0,1521.0,44621.0,2806.0,2806.0,146250.0,110.0,220.0,6606.0,61.0,231.0,7896.0,201.0,1521.0,44621.0
14146,1980,5680.0,9999.0,False,NEWPORT NEWS-HAMPTON SMSA,1435.0,1435.0,65952.0,2.0,4.0,30.0,0.0,0.0,0.0,25.0,192.0,3146.0,1435.0,1435.0,65952.0,2.0,4.0,30.0,0.0,0.0,0.0,25.0,192.0,3146.0
14147,1980,5640.0,9999.0,False,NEWARK SMSA,2156.0,2156.0,137423.0,59.0,118.0,3407.0,13.0,47.0,1927.0,34.0,1353.0,40279.0,2154.0,2154.0,137349.0,59.0,118.0,3407.0,13.0,47.0,1927.0,32.0,1343.0,40208.0


## Prepare + Preprocessing

In [11]:
# df = prep_building_permits(df)
# print(f"""Our DataFrame contains {df.shape[0]:,} observations and {df.shape[1]} features.""")
# df

In [12]:
# df.sort_values(by=["major_city", "major_state", "survey_date"], inplace=True)
# df

In [13]:
# Rebind `df` to the frame returned by the project-local `preprocessing_permits` module
# (imported as `pr`); this replaces the frame acquired earlier in the notebook.
# NOTE(review): presumably one row per city/state/year, judging by the displayed columns — confirm in `pr`.
df = pr.get_permits_model_df()
print(f"""Our DataFrame contains {df.shape[0]:,} observations and {df.shape[1]} features.""")
df

Our DataFrame contains 8,269 observations and 6 features.


Unnamed: 0,city,state,year,total_high_density_bldgs,total_high_density_units,total_high_density_value
0,Abilene,TX,1997,0.0,0.0,0.0
1,Abilene,TX,1998,0.0,0.0,0.0
2,Abilene,TX,1999,0.0,0.0,0.0
3,Abilene,TX,2000,15.0,192.0,10200000.0
4,Abilene,TX,2001,13.0,192.0,6333000.0
...,...,...,...,...,...,...
8264,Yuma,AZ,2015,0.0,0.0,0.0
8265,Yuma,AZ,2016,0.0,0.0,0.0
8266,Yuma,AZ,2017,8.0,68.0,8986000.0
8267,Yuma,AZ,2018,0.0,0.0,0.0


In [14]:
# df = df[
#     [
#         "city",
#         "state",
#         "year",
#         "five_or_more_units_bldgs_est",
#         "five_or_more_units_units_est",
#         "five_or_more_units_value_est",
#     ]
# ]

In [15]:
# Pass the frame through `pr.add_new_features` and rebind `df` to the result.
# In the recorded run the column count goes from 6 to 15; years with zero buildings
# show NaN/inf in the ratio and percent-change columns.
df = pr.add_new_features(df)
print(f"""Our DataFrame contains {df.shape[0]:,} observations and {df.shape[1]} features.""")
df

Our DataFrame contains 8,269 observations and 15 features.


Unnamed: 0,city,state,year,total_high_density_bldgs,total_high_density_units,total_high_density_value,avg_units_per_bldg,value_per_bldg,value_per_unit,city_state_high_density_bldgs_delta_pct,city_state_high_density_units_delta_pct,city_state_high_density_value_delta_pct,market_volume,market_volume_delta_pct,ei
0,Abilene,TX,1997,0.0,0.0,0.0,,,,,,,2.054924e+10,,
1,Abilene,TX,1998,0.0,0.0,0.0,,,,,,,2.529787e+10,0.231085,
2,Abilene,TX,1999,0.0,0.0,0.0,,,,,,,2.609590e+10,0.031545,
3,Abilene,TX,2000,15.0,192.0,10200000.0,12.800000,6.800000e+05,53125.000000,inf,inf,inf,2.742204e+10,0.050818,inf
4,Abilene,TX,2001,13.0,192.0,6333000.0,14.769231,4.871538e+05,32984.375000,-0.133333,0.0,-0.379118,2.913103e+10,0.062322,0.584458
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8264,Yuma,AZ,2015,0.0,0.0,0.0,,,,,,,5.200240e+10,0.313639,
8265,Yuma,AZ,2016,0.0,0.0,0.0,,,,,,,4.928300e+10,-0.052294,
8266,Yuma,AZ,2017,8.0,68.0,8986000.0,8.500000,1.123250e+06,132147.058824,inf,inf,inf,5.158824e+10,0.046775,inf
8267,Yuma,AZ,2018,0.0,0.0,0.0,,,,-1.000000,-1.0,-1.000000,5.336251e+10,0.034393,0.000000


In [16]:
# def filter_top_cities_building_permits(df):
#     """
#     This function masks df in two ways:
#     city_mask returns cities with only continuously reported data
#     threshold_mask returns cities where they had at least one "5 or more unit" building permit for every year
#     Returns 130 cities for modeling
#     """
#     df["city_state"] = df["city"] + "_" + df["state"]
    
#     city_mask = df.groupby("city_state").year.count()
    
#     city_mask = city_mask[city_mask == 23]
    
#     # apply city mask to shrink the df
#     def in_city_mask(x):
#         return x in city_mask
    
#     df = df[df.city_state.apply(in_city_mask)]
    
#     threshold_mask = df.groupby('city_state').total_high_density_bldgs.agg(lambda x: (x == 0).sum())
    
#     threshold_mask = threshold_mask[threshold_mask < 1].index.tolist()
    
#     # apply threshold mask to shrink the df 
#     def in_threshold_mask(x):
#         return x in threshold_mask
    
#     df = df[df.city_state.apply(in_threshold_mask)]
    
#     df = df.sort_values(["city", "state", "year"])
    
#     return df

In [17]:
# Pass the frame through `pr.filter_top_cities_building_permits` and rebind `df` to the result.
# In the recorded run this reduces 8,269 rows to 2,990 and adds a `city_state` column.
# NOTE(review): the commented-out cell above looks like an earlier copy of this helper — confirm it matches `pr`.
df = pr.filter_top_cities_building_permits(df)
print(f"""Our DataFrame contains {df.shape[0]:,} observations and {df.shape[1]} features.""")
df

Our DataFrame contains 2,990 observations and 16 features.


Unnamed: 0,city,state,year,total_high_density_bldgs,total_high_density_units,total_high_density_value,avg_units_per_bldg,value_per_bldg,value_per_unit,city_state_high_density_bldgs_delta_pct,city_state_high_density_units_delta_pct,city_state_high_density_value_delta_pct,market_volume,market_volume_delta_pct,ei,city_state
0,Albany,NY,1997,30.0,425.0,17871000.0,14.166667,5.957000e+05,42049.411765,,,,2.054924e+10,-0.655038,,Albany_NY
1,Albany,NY,1998,47.0,1038.0,54232000.0,22.085106,1.153872e+06,52246.628131,0.566667,1.442353,2.034637,2.529787e+10,0.231085,2.465010,Albany_NY
2,Albany,NY,1999,39.0,515.0,24484000.0,13.205128,6.277949e+05,47541.747573,-0.170213,-0.503854,-0.548532,2.609590e+10,0.031545,0.437662,Albany_NY
3,Albany,NY,2000,25.0,346.0,16130000.0,13.840000,6.452000e+05,46618.497110,-0.358974,-0.328155,-0.341202,2.742204e+10,0.050818,0.626938,Albany_NY
4,Albany,NY,2001,56.0,502.0,24536000.0,8.964286,4.381429e+05,48876.494024,1.240000,0.450867,0.521141,2.913103e+10,0.062322,1.431902,Albany_NY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2985,York,PA,2015,3.0,53.0,3185000.0,17.666667,1.061667e+06,60094.339623,-0.666667,-0.320513,-0.449724,5.200240e+10,0.313639,0.418895,York_PA
2986,York,PA,2016,3.0,34.0,1566000.0,11.333333,5.220000e+05,46058.823529,0.000000,-0.358491,-0.508320,4.928300e+10,-0.052294,0.518810,York_PA
2987,York,PA,2017,15.0,83.0,10204000.0,5.533333,6.802667e+05,122939.759036,4.000000,1.441176,5.515964,5.158824e+10,0.046775,6.224796,York_PA
2988,York,PA,2018,2.0,26.0,2160000.0,13.000000,1.080000e+06,83076.923077,-0.866667,-0.686747,-0.788318,5.336251e+10,0.034393,0.204643,York_PA


In [18]:
# Number of missing values in each column.
df.isna().sum()

city                                         0
state                                        0
year                                         0
total_high_density_bldgs                     0
total_high_density_units                     0
total_high_density_value                     0
avg_units_per_bldg                           0
value_per_bldg                               0
value_per_unit                               0
city_state_high_density_bldgs_delta_pct    130
city_state_high_density_units_delta_pct    130
city_state_high_density_value_delta_pct    130
market_volume                                0
market_volume_delta_pct                      0
ei                                         130
city_state                                   0
dtype: int64

In [19]:
# Look at the 1997 rows: in the recorded output these are the rows whose
# percent-change columns and `ei` are NaN.
df[df.year == 1997]

Unnamed: 0,city,state,year,total_high_density_bldgs,total_high_density_units,total_high_density_value,avg_units_per_bldg,value_per_bldg,value_per_unit,city_state_high_density_bldgs_delta_pct,city_state_high_density_units_delta_pct,city_state_high_density_value_delta_pct,market_volume,market_volume_delta_pct,ei,city_state
0,Albany,NY,1997,30.0,425.0,17871000.0,14.166667,5.957000e+05,42049.411765,,,,2.054924e+10,-0.655038,,Albany_NY
23,Albuquerque,NM,1997,51.0,1588.0,71413000.0,31.137255,1.400255e+06,44970.403023,,,,2.054924e+10,-0.655038,,Albuquerque_NM
46,Allentown,PA,1997,15.0,238.0,9028000.0,15.866667,6.018667e+05,37932.773109,,,,2.054924e+10,-0.655038,,Allentown_PA
69,Anchorage,AK,1997,18.0,162.0,14467000.0,9.000000,8.037222e+05,89302.469136,,,,2.054924e+10,-0.655038,,Anchorage_AK
92,Appleton,WI,1997,84.0,1046.0,35004000.0,12.452381,4.167143e+05,33464.627151,,,,2.054924e+10,-0.655038,,Appleton_WI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,Tuscaloosa,AL,1997,28.0,366.0,11029000.0,13.071429,3.938929e+05,30133.879781,,,,2.054924e+10,-0.655038,,Tuscaloosa_AL
2898,Washington,DC,1997,998.0,15835.0,763535000.0,15.866733,7.650651e+05,48218.187559,,,,2.054924e+10,-0.655038,,Washington_DC
2921,Wilmington,NC,1997,108.0,1568.0,56381000.0,14.518519,5.220463e+05,35957.270408,,,,2.054924e+10,-0.351657,,Wilmington_NC
2944,Worcester,MA,1997,1.0,110.0,5351000.0,110.000000,5.351000e+06,48645.454545,,,,2.054924e+10,-0.655038,,Worcester_MA


In [20]:
# df.sort_values(["year"]).groupby(["city", "state"])[["five_or_more_units_value_est"]].sum()

In [21]:
# df.sort_values(["year"]).groupby(["city", "state"])[["five_or_more_units_units_est"]].sum()

In [22]:
# df["avg_units_per_bldg"] = df["five_or_more_units_units_est"] / df["five_or_more_units_bldgs_est"]

In [23]:
# df["value_per_bldg"] = df["five_or_more_units_value_est"] / df["five_or_more_units_bldgs_est"]

In [24]:
# df["value_per_unit"] = df["five_or_more_units_value_est"] / df["five_or_more_units_units_est"]

In [25]:
# df

In [26]:
# def labeling_future_data(df):
#     """this function takes in a data frame and returns a boolean column that identifies
#     if a city_state_year is a market that should be entered"""
    
#     df["five_or_more_units_bldgs_est_2y"] = (df.sort_values(["year"])
#                                   .groupby(["city", "state"])[["total_high_density_bldgs"]]
#                                   .pct_change(2)
#                                   .shift(-2))
    
#     df["five_or_more_units_value_est_2y"] = (df.sort_values(["year"])
#                                   .groupby(["city", "state"])[["total_high_density_value"]]
#                                   .pct_change(2)
#                                   .shift(-2))
    
#     Q3 = df.five_or_more_units_bldgs_est_2y.quantile(.75)
    
#     Q1 = df.five_or_more_units_bldgs_est_2y.quantile(.25)
    
#     upper_fence_quantity = Q3 + ((Q3-Q1)*1.5)
    
#     Q3 = df.five_or_more_units_value_est_2y.quantile(.75)
    
#     Q1 = df.five_or_more_units_value_est_2y.quantile(.25)
    
#     upper_fence_volume = Q3 + ((Q3-Q1)*1.5)
    
#     df['should_enter'] = (df.five_or_more_units_value_est_2y > upper_fence_volume) | (df.five_or_more_units_bldgs_est_2y > upper_fence_quantity)
    
#     return df

In [27]:
# Pass the frame through `pr.labeling_future_data` and rebind `df` to the result.
# In the recorded run this adds three columns, the last being the boolean `should_enter`.
df = pr.labeling_future_data(df)
print(f"""Our DataFrame contains {df.shape[0]:,} observations and {df.shape[1]} features.""")
df

Our DataFrame contains 2,990 observations and 19 features.


Unnamed: 0,city,state,year,total_high_density_bldgs,total_high_density_units,total_high_density_value,avg_units_per_bldg,value_per_bldg,value_per_unit,city_state_high_density_bldgs_delta_pct,city_state_high_density_units_delta_pct,city_state_high_density_value_delta_pct,market_volume,market_volume_delta_pct,ei,city_state,five_or_more_units_bldgs_est_2y,five_or_more_units_value_est_2y,should_enter
0,Albany,NY,1997,30.0,425.0,17871000.0,14.166667,5.957000e+05,42049.411765,,,,2.054924e+10,-0.655038,,Albany_NY,,,False
1,Albany,NY,1998,47.0,1038.0,54232000.0,22.085106,1.153872e+06,52246.628131,0.566667,1.442353,2.034637,2.529787e+10,0.231085,2.465010,Albany_NY,,,False
2,Albany,NY,1999,39.0,515.0,24484000.0,13.205128,6.277949e+05,47541.747573,-0.170213,-0.503854,-0.548532,2.609590e+10,0.031545,0.437662,Albany_NY,0.170673,0.472309,False
3,Albany,NY,2000,25.0,346.0,16130000.0,13.840000,6.452000e+05,46618.497110,-0.358974,-0.328155,-0.341202,2.742204e+10,0.050818,0.626938,Albany_NY,0.625000,0.181828,False
4,Albany,NY,2001,56.0,502.0,24536000.0,8.964286,4.381429e+05,48876.494024,1.240000,0.450867,0.521141,2.913103e+10,0.062322,1.431902,Albany_NY,-0.266667,-0.570875,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2985,York,PA,2015,3.0,53.0,3185000.0,17.666667,1.061667e+06,60094.339623,-0.666667,-0.320513,-0.449724,5.200240e+10,0.313639,0.418895,York_PA,-0.109677,0.450996,False
2986,York,PA,2016,3.0,34.0,1566000.0,11.333333,5.220000e+05,46058.823529,0.000000,-0.358491,-0.508320,4.928300e+10,-0.052294,0.518810,York_PA,0.093333,0.179576,False
2987,York,PA,2017,15.0,83.0,10204000.0,5.533333,6.802667e+05,122939.759036,4.000000,1.441176,5.515964,5.158824e+10,0.046775,6.224796,York_PA,-0.407407,0.268569,False
2988,York,PA,2018,2.0,26.0,2160000.0,13.000000,1.080000e+06,83076.923077,-0.866667,-0.686747,-0.788318,5.336251e+10,0.034393,0.204643,York_PA,-0.138996,-0.141515,False


### All Preprocessing Together

In [28]:
# Full preprocessing pipeline in one expression: each stage receives the frame
# produced by the previous one, and `df` is bound once to the final result.
df = (
    pr.get_permits_model_df()
    .pipe(pr.add_new_features)
    .pipe(pr.filter_top_cities_building_permits)
    .pipe(pr.labeling_future_data)
)
print(f"""Our DataFrame contains {df.shape[0]:,} observations and {df.shape[1]} features.""")
df

Our DataFrame contains 2,990 observations and 19 features.


Unnamed: 0,city,state,year,total_high_density_bldgs,total_high_density_units,total_high_density_value,avg_units_per_bldg,value_per_bldg,value_per_unit,city_state_high_density_bldgs_delta_pct,city_state_high_density_units_delta_pct,city_state_high_density_value_delta_pct,market_volume,market_volume_delta_pct,ei,city_state,five_or_more_units_bldgs_est_2y,five_or_more_units_value_est_2y,should_enter
0,Albany,NY,1997,30.0,425.0,17871000.0,14.166667,5.957000e+05,42049.411765,,,,2.054924e+10,-0.655038,,Albany_NY,,,False
1,Albany,NY,1998,47.0,1038.0,54232000.0,22.085106,1.153872e+06,52246.628131,0.566667,1.442353,2.034637,2.529787e+10,0.231085,2.465010,Albany_NY,,,False
2,Albany,NY,1999,39.0,515.0,24484000.0,13.205128,6.277949e+05,47541.747573,-0.170213,-0.503854,-0.548532,2.609590e+10,0.031545,0.437662,Albany_NY,0.170673,0.472309,False
3,Albany,NY,2000,25.0,346.0,16130000.0,13.840000,6.452000e+05,46618.497110,-0.358974,-0.328155,-0.341202,2.742204e+10,0.050818,0.626938,Albany_NY,0.625000,0.181828,False
4,Albany,NY,2001,56.0,502.0,24536000.0,8.964286,4.381429e+05,48876.494024,1.240000,0.450867,0.521141,2.913103e+10,0.062322,1.431902,Albany_NY,-0.266667,-0.570875,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2985,York,PA,2015,3.0,53.0,3185000.0,17.666667,1.061667e+06,60094.339623,-0.666667,-0.320513,-0.449724,5.200240e+10,0.313639,0.418895,York_PA,-0.109677,0.450996,False
2986,York,PA,2016,3.0,34.0,1566000.0,11.333333,5.220000e+05,46058.823529,0.000000,-0.358491,-0.508320,4.928300e+10,-0.052294,0.518810,York_PA,0.093333,0.179576,False
2987,York,PA,2017,15.0,83.0,10204000.0,5.533333,6.802667e+05,122939.759036,4.000000,1.441176,5.515964,5.158824e+10,0.046775,6.224796,York_PA,-0.407407,0.268569,False
2988,York,PA,2018,2.0,26.0,2160000.0,13.000000,1.080000e+06,83076.923077,-0.866667,-0.686747,-0.788318,5.336251e+10,0.034393,0.204643,York_PA,-0.138996,-0.141515,False


In [29]:
# Build the modeling frame with the single wrapper in `pr`; it takes no arguments.
# In the recorded run its shape (2,990 x 19) matches the step-by-step pipeline above.
df = pr.permits_preprocessing_mother_function()
print(f"""Our DataFrame contains {df.shape[0]:,} observations and {df.shape[1]} features.""")
df

Our DataFrame contains 2,990 observations and 19 features.


Unnamed: 0,city,state,year,total_high_density_bldgs,total_high_density_units,total_high_density_value,avg_units_per_bldg,value_per_bldg,value_per_unit,city_state_high_density_bldgs_delta_pct,city_state_high_density_units_delta_pct,city_state_high_density_value_delta_pct,market_volume,market_volume_delta_pct,ei,city_state,five_or_more_units_bldgs_est_2y,five_or_more_units_value_est_2y,should_enter
0,Albany,NY,1997,30.0,425.0,17871000.0,14.166667,5.957000e+05,42049.411765,,,,2.054924e+10,-0.655038,,Albany_NY,,,False
1,Albany,NY,1998,47.0,1038.0,54232000.0,22.085106,1.153872e+06,52246.628131,0.566667,1.442353,2.034637,2.529787e+10,0.231085,2.465010,Albany_NY,,,False
2,Albany,NY,1999,39.0,515.0,24484000.0,13.205128,6.277949e+05,47541.747573,-0.170213,-0.503854,-0.548532,2.609590e+10,0.031545,0.437662,Albany_NY,0.170673,0.472309,False
3,Albany,NY,2000,25.0,346.0,16130000.0,13.840000,6.452000e+05,46618.497110,-0.358974,-0.328155,-0.341202,2.742204e+10,0.050818,0.626938,Albany_NY,0.625000,0.181828,False
4,Albany,NY,2001,56.0,502.0,24536000.0,8.964286,4.381429e+05,48876.494024,1.240000,0.450867,0.521141,2.913103e+10,0.062322,1.431902,Albany_NY,-0.266667,-0.570875,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2985,York,PA,2015,3.0,53.0,3185000.0,17.666667,1.061667e+06,60094.339623,-0.666667,-0.320513,-0.449724,5.200240e+10,0.313639,0.418895,York_PA,-0.109677,0.450996,False
2986,York,PA,2016,3.0,34.0,1566000.0,11.333333,5.220000e+05,46058.823529,0.000000,-0.358491,-0.508320,4.928300e+10,-0.052294,0.518810,York_PA,0.093333,0.179576,False
2987,York,PA,2017,15.0,83.0,10204000.0,5.533333,6.802667e+05,122939.759036,4.000000,1.441176,5.515964,5.158824e+10,0.046775,6.224796,York_PA,-0.407407,0.268569,False
2988,York,PA,2018,2.0,26.0,2160000.0,13.000000,1.080000e+06,83076.923077,-0.866667,-0.686747,-0.788318,5.336251e+10,0.034393,0.204643,York_PA,-0.138996,-0.141515,False
