# Building Permit Data

## Documentation

[United States Census Bureau Building Permits Survey](https://www.census.gov/construction/bps/)

[ASCII files by State, Metropolitan Statistical Area (MSA), County or Place](https://www2.census.gov/econ/bps/)

[MSA Folder](https://www2.census.gov/econ/bps/Metro/)

[ASCII MSA Documentation](https://www2.census.gov/econ/bps/Documentation/msaasc.pdf)

In [1]:
import numpy as np
import pandas as pd

import re

import os.path
from os import path

from datetime import datetime

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

from sklearn.preprocessing import MinMaxScaler, StandardScaler, PowerTransformer
from sklearn.cluster import KMeans

import wrangle as wr
import preprocessing as pr
import explore as ex
import model as mo

import warnings
warnings.filterwarnings("ignore")


Bad key "text.kerning_factor" on line 4 in
/usr/local/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


In [2]:
# Lift pandas' column display limit so wide frames are shown without truncation.
pd.set_option("display.max_columns", None)

In [3]:
def rename_columns(df):
    """
    Rename the raw building-permit headers to descriptive snake_case names.

    The five identifying columns are followed by two groups of twelve
    measures: "est" and then "rep".  Each group lists buildings, units and
    value for the four structure sizes, in that order.

    The frame is modified in place and also returned.  Columns that are not
    listed in the mapping are left untouched.
    """

    # raw headers, in the order they appear in the file
    raw_headers = [
        "Date", "Code", "Code.1", "Unnamed: 3", "Name",
        "Bldgs", "Units", "Value",
        "Bldgs.1", "Units.1", "Value.1",
        "Bldgs.2", "Units.2", "Value.2",
        "Bldgs.3", "Units.3", "Value.3",
        "Bldgs.4", "Units.4", "Value.4",
        "Bldgs.5", "Units.5", "Value.5",
        # this header really does carry six leading spaces
        "      Bldgs", "Units.6", "Value.6",
        "Bldgs.6", "Units.7", "Value.7",
    ]

    # descriptive names, generated in the same order as raw_headers
    new_names = ["survey_date", "csa_code", "cbsa_code", "moncov", "cbsa_name"]
    structure_sizes = (
        "one_unit",
        "two_units",
        "three_to_four_units",
        "five_or_more_units",
    )
    for group in ("est", "rep"):
        for size in structure_sizes:
            for measure in ("bldgs", "units", "value"):
                new_names.append(f"{size}_{measure}_{group}")

    # rename columns inplace
    column_map = dict(zip(raw_headers, new_names))
    df.rename(columns=column_map, inplace=True)

    return df


In [4]:
# def sort_values_and_reset_index(df):
#     """
#     Docstring
#     """
    
#     # sort values by survey_date
#     df.sort_values(by=["survey_date"], ascending=False, inplace=True)

#     # reset index inplace
#     df.reset_index(inplace=True)

#     # drop former index inplace
#     df.drop(columns=["index"], inplace=True)
    
#     return df

In [5]:
def _survey_date_to_year(raw_date):
    """
    Turn a raw survey date code into a four-digit year.

    The trailing two digits of the code are removed; if only two digits
    remain (e.g., "80"-"97") a leading "19" is added.
    """
    # chop off the two digits that follow the year
    year = re.sub(r"\d\d$", "", str(raw_date))

    # two-digit years belong to the 1900s
    if len(year) == 2:
        year = "19" + year

    return int(year)


def acquire_building_permits():
    """
    Return the annual metro-area building permit data for 1980-2019.

    If "building_permits.csv" exists in the working directory it is read and
    returned as is.  Otherwise the yearly text files are downloaded from the
    Census Bureau, combined, cleaned, written to "building_permits.csv" and
    returned.

    Cleaning steps for a fresh download:
      * moncov becomes a bool that is True only where the raw value is "C"
      * rows that still contain a missing value are dropped
      * survey_date is reduced to an integer four-digit year
      * rows are sorted by survey_date, newest first, with a fresh 0..n-1 index

    Returns
    -------
    pandas.DataFrame
    """
    cache_file = "building_permits.csv"
    url_template = "https://www2.census.gov/econ/bps/Metro/ma{year}a.txt"

    # reuse the cached copy when there is one
    if path.exists(cache_file):
        return pd.read_csv(cache_file, index_col=0)

    # the 2019 file supplies the column layout used for every other year
    df = pd.read_csv(url_template.format(year=2019), sep=",", header=1)
    rename_columns(df)
    column_names = df.columns.tolist()

    # Collect every yearly frame first and combine once: DataFrame.append was
    # removed in pandas 2.0, and appending inside the loop re-copies the
    # accumulated frame on every iteration.
    frames = [df]
    for year in range(1980, 2019):
        frames.append(
            pd.read_csv(
                url_template.format(year=year),
                sep=",",
                header=1,
                names=column_names,
            )
        )
    df = pd.concat(frames, ignore_index=True)

    df = (
        # make moncov a bool first so that its null observations are not
        # considered by the dropna below
        df.assign(moncov=df.moncov == "C")
        .dropna()
        .assign(
            survey_date=lambda frame: frame.survey_date.apply(_survey_date_to_year).astype(int)
        )
        .sort_values(by=["survey_date"], ascending=False)
        # drop=True discards the old index instead of keeping it as a column
        .reset_index(drop=True)
    )

    # write df to disk as csv so later calls can skip the download
    df.to_csv(cache_file)

    return df

In [6]:
# Load the 2019 annual file; header=1 uses the second line of the file as the header row.
# NOTE(review): this reads a local "ma2019a.txt" from the working directory, whereas
# acquire_building_permits() fetches the same file name from the Census URL - confirm
# the local copy is present before a Restart & Run All.
df = pd.read_csv("ma2019a.txt", sep=",", header=1)
df

Unnamed: 0,Date,Code,Code.1,Unnamed: 3,Name,Bldgs,Units,Value,Bldgs.1,Units.1,Value.1,Bldgs.2,Units.2,Value.2,Bldgs.3,Units.3,Value.3,Bldgs.4,Units.4,Value.4,Bldgs.5,Units.5,Value.5,Bldgs.7,Units.6,Value.6,Bldgs.6,Units.7,Value.7
0,201999,104,10580,C,Albany-Schenectady-Troy NY,1120,1120,309397,20,40,7644,12,45,6074,48,665,60456,984,984,268946,18,36,6544,12,45,6074,34,580,56469
1,201999,104,24020,,Glens Falls NY,229,229,57598,0,0,0,2,6,746,1,6,781,148,148,37724,0,0,0,2,6,746,1,6,781
2,201999,106,10740,C,Albuquerque NM,1872,1872,446893,1,2,283,4,12,1296,14,262,23162,1195,1195,296615,0,0,0,0,0,0,2,188,20079
3,201999,106,42140,C,Santa Fe NM,344,344,78685,0,0,0,0,0,0,8,245,30037,344,344,78685,0,0,0,0,0,0,8,245,30037
4,201999,107,11020,,Altoona PA,111,111,24333,0,0,0,0,0,0,0,0,0,105,105,22834,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,201999,999,48540,,Wheeling WV-OH,10,10,1114,1,2,90,0,0,0,0,0,0,9,9,1006,1,2,90,0,0,0,0,0,0
380,201999,999,48660,,Wichita Falls TX,133,133,38467,10,20,1871,0,0,0,2,36,4960,133,133,38467,10,20,1871,0,0,0,2,36,4960
381,201999,999,48900,,Wilmington NC,1950,1950,533699,12,24,5813,1,3,920,64,1598,190328,1388,1388,447764,11,22,5114,1,3,920,64,1598,190328
382,201999,999,49420,,Yakima WA,569,569,129660,35,70,11198,10,34,2996,21,370,30665,562,562,128703,35,70,11198,10,34,2996,21,370,30665


In [7]:
# Rename the raw headers through the shared helper (it renames in place)
# rather than keeping a second copy of the same mapping in this cell.
rename_columns(df)

df

Unnamed: 0,survey_date,csa_code,cbsa_code,moncov,cbsa_name,one_unit_bldgs_est,one_unit_units_est,one_unit_value_est,two_units_bldgs_est,two_units_units_est,two_units_value_est,three_to_four_units_bldgs_est,three_to_four_units_units_est,three_to_four_units_value_est,five_or_more_units_bldgs_est,five_or_more_units_units_est,five_or_more_units_value_est,one_unit_bldgs_rep,one_unit_units_rep,one_unit_value_rep,two_units_bldgs_rep,two_units_units_rep,two_units_value_rep,three_to_four_units_bldgs_rep,three_to_four_units_units_rep,three_to_four_units_value_rep,five_or_more_units_bldgs_rep,five_or_more_units_units_rep,five_or_more_units_value_rep
0,201999,104,10580,C,Albany-Schenectady-Troy NY,1120,1120,309397,20,40,7644,12,45,6074,48,665,60456,984,984,268946,18,36,6544,12,45,6074,34,580,56469
1,201999,104,24020,,Glens Falls NY,229,229,57598,0,0,0,2,6,746,1,6,781,148,148,37724,0,0,0,2,6,746,1,6,781
2,201999,106,10740,C,Albuquerque NM,1872,1872,446893,1,2,283,4,12,1296,14,262,23162,1195,1195,296615,0,0,0,0,0,0,2,188,20079
3,201999,106,42140,C,Santa Fe NM,344,344,78685,0,0,0,0,0,0,8,245,30037,344,344,78685,0,0,0,0,0,0,8,245,30037
4,201999,107,11020,,Altoona PA,111,111,24333,0,0,0,0,0,0,0,0,0,105,105,22834,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,201999,999,48540,,Wheeling WV-OH,10,10,1114,1,2,90,0,0,0,0,0,0,9,9,1006,1,2,90,0,0,0,0,0,0
380,201999,999,48660,,Wichita Falls TX,133,133,38467,10,20,1871,0,0,0,2,36,4960,133,133,38467,10,20,1871,0,0,0,2,36,4960
381,201999,999,48900,,Wilmington NC,1950,1950,533699,12,24,5813,1,3,920,64,1598,190328,1388,1388,447764,11,22,5114,1,3,920,64,1598,190328
382,201999,999,49420,,Yakima WA,569,569,129660,35,70,11198,10,34,2996,21,370,30665,562,562,128703,35,70,11198,10,34,2996,21,370,30665


In [8]:
# Summarize the renamed 2019 frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 29 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   survey_date                    384 non-null    int64 
 1   csa_code                       384 non-null    int64 
 2   cbsa_code                      384 non-null    int64 
 3   moncov                         384 non-null    object
 4   cbsa_name                      384 non-null    object
 5   one_unit_bldgs_est             384 non-null    int64 
 6   one_unit_units_est             384 non-null    int64 
 7   one_unit_value_est             384 non-null    int64 
 8   two_units_bldgs_est            384 non-null    int64 
 9   two_units_units_est            384 non-null    int64 
 10  two_units_value_est            384 non-null    int64 
 11  three_to_four_units_bldgs_est  384 non-null    int64 
 12  three_to_four_units_units_est  384 non-null    int64 
 13  three

In [9]:
# Count the distinct cbsa_name values in the frame.
len(df.cbsa_name.unique())

384

In [10]:
# List the column names; the download cells pass this list as `names` for the other years.
df.columns.tolist()

['survey_date',
 'csa_code',
 'cbsa_code',
 'moncov',
 'cbsa_name',
 'one_unit_bldgs_est',
 'one_unit_units_est',
 'one_unit_value_est',
 'two_units_bldgs_est',
 'two_units_units_est',
 'two_units_value_est',
 'three_to_four_units_bldgs_est',
 'three_to_four_units_units_est',
 'three_to_four_units_value_est',
 'five_or_more_units_bldgs_est',
 'five_or_more_units_units_est',
 'five_or_more_units_value_est',
 'one_unit_bldgs_rep',
 'one_unit_units_rep',
 'one_unit_value_rep',
 'two_units_bldgs_rep',
 'two_units_units_rep',
 'two_units_value_rep',
 'three_to_four_units_bldgs_rep',
 'three_to_four_units_units_rep',
 'three_to_four_units_value_rep',
 'five_or_more_units_bldgs_rep',
 'five_or_more_units_units_rep',
 'five_or_more_units_value_rep']

In [11]:
# df_2018 = pd.read_csv("ma2018a.txt", sep=",", header=1, names=df.columns.tolist())
# df_2018

In [12]:
# df = df.append(df_2018, ignore_index=True)
# df.shape

In [13]:
# df_2017 = pd.read_csv("https://www2.census.gov/econ/bps/Metro/ma2017a.txt", sep=",", header=1, names=df.columns.tolist())
# df_2017

In [14]:
# df = df.append(df_2017, ignore_index=True)
# df.shape

In [15]:
# Download the 2017 and 2018 annual files with df's column names and stack
# them under the rows already in df.  The frames are collected first and
# combined with one pd.concat because DataFrame.append was removed in pandas 2.0.
year_frames = []
for i in range(2017, 2019):
    # read the txt file at url where i is the year in range
    temp_df = pd.read_csv(
        f"https://www2.census.gov/econ/bps/Metro/ma{i}a.txt",
        sep=",",
        header=1,
        names=df.columns.tolist(),
    )
    year_frames.append(temp_df)

# add the new years to the global df variable
df = pd.concat([df, *year_frames], ignore_index=True)

df

Unnamed: 0,survey_date,csa_code,cbsa_code,moncov,cbsa_name,one_unit_bldgs_est,one_unit_units_est,one_unit_value_est,two_units_bldgs_est,two_units_units_est,two_units_value_est,three_to_four_units_bldgs_est,three_to_four_units_units_est,three_to_four_units_value_est,five_or_more_units_bldgs_est,five_or_more_units_units_est,five_or_more_units_value_est,one_unit_bldgs_rep,one_unit_units_rep,one_unit_value_rep,two_units_bldgs_rep,two_units_units_rep,two_units_value_rep,three_to_four_units_bldgs_rep,three_to_four_units_units_rep,three_to_four_units_value_rep,five_or_more_units_bldgs_rep,five_or_more_units_units_rep,five_or_more_units_value_rep
0,201999,104,10580,C,Albany-Schenectady-Troy NY,1120,1120,309397,20,40,7644,12,45,6074,48,665,60456,984,984,268946,18,36,6544,12,45,6074,34,580,56469
1,201999,104,24020,,Glens Falls NY,229,229,57598,0,0,0,2,6,746,1,6,781,148,148,37724,0,0,0,2,6,746,1,6,781
2,201999,106,10740,C,Albuquerque NM,1872,1872,446893,1,2,283,4,12,1296,14,262,23162,1195,1195,296615,0,0,0,0,0,0,2,188,20079
3,201999,106,42140,C,Santa Fe NM,344,344,78685,0,0,0,0,0,0,8,245,30037,344,344,78685,0,0,0,0,0,0,8,245,30037
4,201999,107,11020,,Altoona PA,111,111,24333,0,0,0,0,0,0,0,0,0,105,105,22834,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1143,201899,999,48540,,Wheeling WV-OH,9,9,1435,2,4,300,1,3,270,1,100,12500,8,8,1327,2,4,300,1,3,270,1,100,12500
1144,201899,999,48660,,Wichita Falls TX,117,117,26725,4,8,420,0,0,0,8,195,17999,117,117,26725,4,8,420,0,0,0,8,195,17999
1145,201899,999,48900,,Wilmington NC,1868,1868,521252,16,32,7005,5,20,1956,16,530,77891,1290,1290,405914,12,24,4802,5,20,1956,14,473,69781
1146,201899,999,49420,,Yakima WA,480,480,111572,16,32,4670,14,51,7210,2,15,1609,452,452,106541,16,32,4670,14,51,7210,2,15,1609


In [16]:
# Download the 2010 through 2016 annual files with df's column names and stack
# them under the rows already in df.  The frames are collected first and
# combined with one pd.concat because DataFrame.append was removed in pandas 2.0.
year_frames = []
for i in range(2010, 2017):
    # read the txt file at url where i is the year in range
    temp_df = pd.read_csv(
        f"https://www2.census.gov/econ/bps/Metro/ma{i}a.txt",
        sep=",",
        header=1,
        names=df.columns.tolist(),
    )
    year_frames.append(temp_df)

# add the new years to the global df variable
df = pd.concat([df, *year_frames], ignore_index=True)

df

Unnamed: 0,survey_date,csa_code,cbsa_code,moncov,cbsa_name,one_unit_bldgs_est,one_unit_units_est,one_unit_value_est,two_units_bldgs_est,two_units_units_est,two_units_value_est,three_to_four_units_bldgs_est,three_to_four_units_units_est,three_to_four_units_value_est,five_or_more_units_bldgs_est,five_or_more_units_units_est,five_or_more_units_value_est,one_unit_bldgs_rep,one_unit_units_rep,one_unit_value_rep,two_units_bldgs_rep,two_units_units_rep,two_units_value_rep,three_to_four_units_bldgs_rep,three_to_four_units_units_rep,three_to_four_units_value_rep,five_or_more_units_bldgs_rep,five_or_more_units_units_rep,five_or_more_units_value_rep
0,201999,104,10580,C,Albany-Schenectady-Troy NY,1120,1120,309397,20,40,7644,12,45,6074,48,665,60456,984,984,268946,18,36,6544,12,45,6074,34,580,56469
1,201999,104,24020,,Glens Falls NY,229,229,57598,0,0,0,2,6,746,1,6,781,148,148,37724,0,0,0,2,6,746,1,6,781
2,201999,106,10740,C,Albuquerque NM,1872,1872,446893,1,2,283,4,12,1296,14,262,23162,1195,1195,296615,0,0,0,0,0,0,2,188,20079
3,201999,106,42140,C,Santa Fe NM,344,344,78685,0,0,0,0,0,0,8,245,30037,344,344,78685,0,0,0,0,0,0,8,245,30037
4,201999,107,11020,,Altoona PA,111,111,24333,0,0,0,0,0,0,0,0,0,105,105,22834,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3751,201699,999,48540,,Wheeling WV-OH,18,18,3646,0,0,0,0,0,0,2,16,3064,17,17,3506,0,0,0,0,0,0,2,16,3064
3752,201699,999,48660,,Wichita Falls TX,94,94,24181,1,2,203,1,3,375,8,127,14171,94,94,24181,1,2,203,1,3,375,8,127,14171
3753,201699,999,48900,,Wilmington NC,1774,1774,430196,23,46,6378,0,0,0,39,1127,110492,1323,1323,358271,23,46,6378,0,0,0,39,1127,110492
3754,201699,999,49420,,Yakima WA,405,405,100328,5,10,1704,2,7,652,0,0,0,370,370,94514,5,10,1704,2,7,652,0,0,0


In [17]:
# Download the 2006 through 2009 annual files with df's column names and stack
# them under the rows already in df.  The frames are collected first and
# combined with one pd.concat because DataFrame.append was removed in pandas 2.0.
year_frames = []
for i in range(2006, 2010):
    # read the txt file at url where i is the year in range
    temp_df = pd.read_csv(
        f"https://www2.census.gov/econ/bps/Metro/ma{i}a.txt",
        sep=",",
        header=1,
        names=df.columns.tolist(),
    )
    year_frames.append(temp_df)

# add the new years to the global df variable
df = pd.concat([df, *year_frames], ignore_index=True)

df

Unnamed: 0,survey_date,csa_code,cbsa_code,moncov,cbsa_name,one_unit_bldgs_est,one_unit_units_est,one_unit_value_est,two_units_bldgs_est,two_units_units_est,two_units_value_est,three_to_four_units_bldgs_est,three_to_four_units_units_est,three_to_four_units_value_est,five_or_more_units_bldgs_est,five_or_more_units_units_est,five_or_more_units_value_est,one_unit_bldgs_rep,one_unit_units_rep,one_unit_value_rep,two_units_bldgs_rep,two_units_units_rep,two_units_value_rep,three_to_four_units_bldgs_rep,three_to_four_units_units_rep,three_to_four_units_value_rep,five_or_more_units_bldgs_rep,five_or_more_units_units_rep,five_or_more_units_value_rep
0,201999,104,10580,C,Albany-Schenectady-Troy NY,1120,1120,309397,20,40,7644,12,45,6074,48,665,60456,984,984,268946,18,36,6544,12,45,6074,34,580,56469
1,201999,104,24020,,Glens Falls NY,229,229,57598,0,0,0,2,6,746,1,6,781,148,148,37724,0,0,0,2,6,746,1,6,781
2,201999,106,10740,C,Albuquerque NM,1872,1872,446893,1,2,283,4,12,1296,14,262,23162,1195,1195,296615,0,0,0,0,0,0,2,188,20079
3,201999,106,42140,C,Santa Fe NM,344,344,78685,0,0,0,0,0,0,8,245,30037,344,344,78685,0,0,0,0,0,0,8,245,30037
4,201999,107,11020,,Altoona PA,111,111,24333,0,0,0,0,0,0,0,0,0,105,105,22834,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5204,200999,999,48540,,Wheeling WV-OH,17,17,3152,0,0,0,0,0,0,0,0,0,15,15,3019,0,0,0,0,0,0,0,0,0
5205,200999,999,48660,,Wichita Falls TX,172,172,29037,18,36,3504,0,0,0,0,0,0,172,172,29037,18,36,3504,0,0,0,0,0,0
5206,200999,999,48900,C,Wilmington NC,1527,1527,283658,1,2,239,1,4,145,13,280,41939,1426,1426,265803,0,0,0,1,4,145,11,238,38051
5207,200999,999,49420,,Yakima WA,426,426,74800,7,14,1439,1,3,464,3,65,6501,405,405,71715,6,12,1229,1,3,464,3,65,6501


In [18]:
# Download the 1980 through 2005 annual files with df's column names and stack
# them under the rows already in df.  The frames are collected first and
# combined with one pd.concat because DataFrame.append was removed in pandas 2.0.
year_frames = []
for i in range(1980, 2006):
    # read the txt file at url where i is the year in range
    temp_df = pd.read_csv(
        f"https://www2.census.gov/econ/bps/Metro/ma{i}a.txt",
        sep=",",
        header=1,
        names=df.columns.tolist(),
    )
    year_frames.append(temp_df)

# add the new years to the global df variable
df = pd.concat([df, *year_frames], ignore_index=True)

df

Unnamed: 0,survey_date,csa_code,cbsa_code,moncov,cbsa_name,one_unit_bldgs_est,one_unit_units_est,one_unit_value_est,two_units_bldgs_est,two_units_units_est,two_units_value_est,three_to_four_units_bldgs_est,three_to_four_units_units_est,three_to_four_units_value_est,five_or_more_units_bldgs_est,five_or_more_units_units_est,five_or_more_units_value_est,one_unit_bldgs_rep,one_unit_units_rep,one_unit_value_rep,two_units_bldgs_rep,two_units_units_rep,two_units_value_rep,three_to_four_units_bldgs_rep,three_to_four_units_units_rep,three_to_four_units_value_rep,five_or_more_units_bldgs_rep,five_or_more_units_units_rep,five_or_more_units_value_rep
0,201999,104.0,10580.0,C,Albany-Schenectady-Troy NY,1120.0,1120.0,309397.0,20.0,40.0,7644.0,12.0,45.0,6074.0,48.0,665.0,60456.0,984.0,984.0,268946.0,18.0,36.0,6544.0,12.0,45.0,6074.0,34.0,580.0,56469.0
1,201999,104.0,24020.0,,Glens Falls NY,229.0,229.0,57598.0,0.0,0.0,0.0,2.0,6.0,746.0,1.0,6.0,781.0,148.0,148.0,37724.0,0.0,0.0,0.0,2.0,6.0,746.0,1.0,6.0,781.0
2,201999,106.0,10740.0,C,Albuquerque NM,1872.0,1872.0,446893.0,1.0,2.0,283.0,4.0,12.0,1296.0,14.0,262.0,23162.0,1195.0,1195.0,296615.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,188.0,20079.0
3,201999,106.0,42140.0,C,Santa Fe NM,344.0,344.0,78685.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,245.0,30037.0,344.0,344.0,78685.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,245.0,30037.0
4,201999,107.0,11020.0,,Altoona PA,111.0,111.0,24333.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,105.0,105.0,22834.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14162,200599,999.0,48660.0,,Wichita Falls TX,315.0,315.0,48911.0,8.0,16.0,1190.0,41.0,164.0,11201.0,9.0,70.0,3476.0,279.0,279.0,45819.0,8.0,16.0,1190.0,41.0,164.0,11201.0,9.0,70.0,3476.0
14163,200599,999.0,48900.0,C,Wilmington NC,7685.0,7685.0,1277328.0,173.0,346.0,59417.0,13.0,50.0,5745.0,77.0,1125.0,122381.0,7499.0,7499.0,1259642.0,173.0,346.0,59417.0,13.0,50.0,5745.0,77.0,1125.0,122381.0
14164,200599,999.0,49420.0,,Yakima WA,739.0,739.0,135403.0,7.0,14.0,1407.0,0.0,0.0,0.0,2.0,27.0,2321.0,739.0,739.0,135403.0,7.0,14.0,1407.0,0.0,0.0,0.0,2.0,27.0,2321.0
14165,200599,999.0,49700.0,,Yuba City CA,2961.0,2961.0,511473.0,1.0,2.0,205.0,0.0,0.0,0.0,4.0,31.0,1602.0,2869.0,2869.0,495611.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,26.0,1383.0


In [19]:
# Count the missing values in each column of the combined frame.
df.isna().sum()

survey_date                         0
csa_code                           18
cbsa_code                          18
moncov                           2614
cbsa_name                          18
one_unit_bldgs_est                 18
one_unit_units_est                 18
one_unit_value_est                 18
two_units_bldgs_est                18
two_units_units_est                18
two_units_value_est                18
three_to_four_units_bldgs_est      18
three_to_four_units_units_est      18
three_to_four_units_value_est      18
five_or_more_units_bldgs_est       18
five_or_more_units_units_est       18
five_or_more_units_value_est       18
one_unit_bldgs_rep                 18
one_unit_units_rep                 18
one_unit_value_rep                 18
two_units_bldgs_rep                18
two_units_units_rep                18
two_units_value_rep                18
three_to_four_units_bldgs_rep      18
three_to_four_units_units_rep      18
three_to_four_units_value_rep      18
five_or_more

In [20]:
# Show the rows whose csa_code is missing.
df[df.csa_code.isna()]

Unnamed: 0,survey_date,csa_code,cbsa_code,moncov,cbsa_name,one_unit_bldgs_est,one_unit_units_est,one_unit_value_est,two_units_bldgs_est,two_units_units_est,two_units_value_est,three_to_four_units_bldgs_est,three_to_four_units_units_est,three_to_four_units_value_est,five_or_more_units_bldgs_est,five_or_more_units_units_est,five_or_more_units_value_est,one_unit_bldgs_rep,one_unit_units_rep,one_unit_value_rep,two_units_bldgs_rep,two_units_units_rep,two_units_value_rep,three_to_four_units_bldgs_rep,three_to_four_units_units_rep,three_to_four_units_value_rep,five_or_more_units_bldgs_rep,five_or_more_units_units_rep,five_or_more_units_value_rep
5493,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5812,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6131,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6450,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6779,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7132,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7485,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7838,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8192,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8547,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [21]:
# List the distinct raw moncov values before they are encoded.
df.moncov.unique().tolist()

['C', ' ', nan, '    ']

In [22]:
# Encode moncov as 1 where the raw flag is exactly "C" and 0 for everything else
# (blank strings and NaN included), so the column no longer contains nulls.
df["moncov"] = np.where(df.moncov == "C", 1, 0)

In [23]:
# Re-count missing values per column now that moncov has been encoded.
df.isna().sum()

survey_date                       0
csa_code                         18
cbsa_code                        18
moncov                            0
cbsa_name                        18
one_unit_bldgs_est               18
one_unit_units_est               18
one_unit_value_est               18
two_units_bldgs_est              18
two_units_units_est              18
two_units_value_est              18
three_to_four_units_bldgs_est    18
three_to_four_units_units_est    18
three_to_four_units_value_est    18
five_or_more_units_bldgs_est     18
five_or_more_units_units_est     18
five_or_more_units_value_est     18
one_unit_bldgs_rep               18
one_unit_units_rep               18
one_unit_value_rep               18
two_units_bldgs_rep              18
two_units_units_rep              18
two_units_value_rep              18
three_to_four_units_bldgs_rep    18
three_to_four_units_units_rep    18
three_to_four_units_value_rep    18
five_or_more_units_bldgs_rep     18
five_or_more_units_units_rep

In [24]:
# Drop every row that still holds a missing value (modifies df in place).
df.dropna(inplace=True)

In [25]:
# Re-count missing values per column after the dropna.
df.isna().sum()

survey_date                      0
csa_code                         0
cbsa_code                        0
moncov                           0
cbsa_name                        0
one_unit_bldgs_est               0
one_unit_units_est               0
one_unit_value_est               0
two_units_bldgs_est              0
two_units_units_est              0
two_units_value_est              0
three_to_four_units_bldgs_est    0
three_to_four_units_units_est    0
three_to_four_units_value_est    0
five_or_more_units_bldgs_est     0
five_or_more_units_units_est     0
five_or_more_units_value_est     0
one_unit_bldgs_rep               0
one_unit_units_rep               0
one_unit_value_rep               0
two_units_bldgs_rep              0
two_units_units_rep              0
two_units_value_rep              0
three_to_four_units_bldgs_rep    0
three_to_four_units_units_rep    0
three_to_four_units_value_rep    0
five_or_more_units_bldgs_rep     0
five_or_more_units_units_rep     0
five_or_more_units_v

In [26]:
# Print the (rows, columns) of the cleaned frame, then preview its first 20 rows.
print(df.shape)
df.head(20)

(14149, 29)


Unnamed: 0,survey_date,csa_code,cbsa_code,moncov,cbsa_name,one_unit_bldgs_est,one_unit_units_est,one_unit_value_est,two_units_bldgs_est,two_units_units_est,two_units_value_est,three_to_four_units_bldgs_est,three_to_four_units_units_est,three_to_four_units_value_est,five_or_more_units_bldgs_est,five_or_more_units_units_est,five_or_more_units_value_est,one_unit_bldgs_rep,one_unit_units_rep,one_unit_value_rep,two_units_bldgs_rep,two_units_units_rep,two_units_value_rep,three_to_four_units_bldgs_rep,three_to_four_units_units_rep,three_to_four_units_value_rep,five_or_more_units_bldgs_rep,five_or_more_units_units_rep,five_or_more_units_value_rep
0,201999,104.0,10580.0,1,Albany-Schenectady-Troy NY,1120.0,1120.0,309397.0,20.0,40.0,7644.0,12.0,45.0,6074.0,48.0,665.0,60456.0,984.0,984.0,268946.0,18.0,36.0,6544.0,12.0,45.0,6074.0,34.0,580.0,56469.0
1,201999,104.0,24020.0,0,Glens Falls NY,229.0,229.0,57598.0,0.0,0.0,0.0,2.0,6.0,746.0,1.0,6.0,781.0,148.0,148.0,37724.0,0.0,0.0,0.0,2.0,6.0,746.0,1.0,6.0,781.0
2,201999,106.0,10740.0,1,Albuquerque NM,1872.0,1872.0,446893.0,1.0,2.0,283.0,4.0,12.0,1296.0,14.0,262.0,23162.0,1195.0,1195.0,296615.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,188.0,20079.0
3,201999,106.0,42140.0,1,Santa Fe NM,344.0,344.0,78685.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,245.0,30037.0,344.0,344.0,78685.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,245.0,30037.0
4,201999,107.0,11020.0,0,Altoona PA,111.0,111.0,24333.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,105.0,105.0,22834.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,201999,108.0,11100.0,0,Amarillo TX,554.0,554.0,133751.0,15.0,30.0,3862.0,0.0,0.0,0.0,0.0,0.0,0.0,546.0,546.0,131860.0,15.0,30.0,3862.0,0.0,0.0,0.0,0.0,0.0,0.0
6,201999,118.0,11540.0,0,Appleton WI,490.0,490.0,144349.0,27.0,54.0,8420.0,0.0,0.0,0.0,1.0,33.0,4500.0,490.0,490.0,144349.0,27.0,54.0,8420.0,0.0,0.0,0.0,1.0,33.0,4500.0
7,201999,118.0,36780.0,0,Oshkosh-Neenah WI,246.0,246.0,73147.0,8.0,16.0,2345.0,0.0,0.0,0.0,9.0,72.0,5760.0,246.0,246.0,73147.0,8.0,16.0,2345.0,0.0,0.0,0.0,9.0,72.0,5760.0
8,201999,120.0,11700.0,0,Asheville NC,2252.0,2252.0,639697.0,16.0,32.0,3730.0,0.0,0.0,0.0,56.0,970.0,89470.0,1804.0,1804.0,548571.0,16.0,32.0,3730.0,0.0,0.0,0.0,41.0,885.0,65933.0
9,201999,122.0,12020.0,0,Athens-Clarke County GA,821.0,821.0,172420.0,41.0,82.0,10221.0,16.0,55.0,5331.0,69.0,633.0,72609.0,753.0,753.0,167977.0,41.0,82.0,10221.0,16.0,55.0,5331.0,69.0,633.0,72609.0


In [27]:
# df.to_csv("building_permits.csv")

In [28]:
# df = pd.read_csv("building_permits.csv", index_col=0)
# print(f"""Our DataFrame contains {df.shape[0]:,} observations and {df.shape[1]} features.""")
# df.head()

In [29]:
# df["survey_date"] = df.survey_date.astype(str).apply(lambda x: re.sub(r"\d\d$", "", x))
# df

In [30]:
# df.survey_date.unique()

In [31]:
# len(df.survey_date)

In [32]:
# df["survey_date"] = df.survey_date.astype(str).apply(lambda x: "19" + x if len(x) == 2 else x)
# df

In [33]:
# df.survey_date.unique().tolist()

In [34]:
# df.info()

In [35]:
# df = pd.read_csv("building_permits.csv", index_col=0)
# df

In [36]:
# df["survey_date"] = df.survey_date.astype(str).apply(lambda x: re.sub(r"\d\d$", "", x))
# df.survey_date.unique().tolist()

In [37]:
# df.info()

In [38]:
# df["survey_date"] = df.survey_date.apply(lambda x: "19" + x if len(x) == 2 else x)

In [39]:
# df.survey_date.unique().tolist()

In [40]:
# df["survey_date"] = df.survey_date.astype(int)
# df.info()

In [41]:
# df = acquire_building_permits()
# df

In [42]:
# df.reset_index(inplace=True)

In [43]:
# df.drop(columns=["index"], inplace=True)
# df

In [44]:
# Replace the exploratory frame with the result of acquire_building_permits(),
# which reads building_permits.csv when it exists and otherwise downloads,
# cleans and caches the data; then report its size and display it.
df = acquire_building_permits()
print(f"""Our DataFrame contains {df.shape[0]:,} observations and {df.shape[1]} features.""")
df

Our DataFrame contains 14,149 observations and 29 features.


Unnamed: 0,survey_date,csa_code,cbsa_code,moncov,cbsa_name,one_unit_bldgs_est,one_unit_units_est,one_unit_value_est,two_units_bldgs_est,two_units_units_est,two_units_value_est,three_to_four_units_bldgs_est,three_to_four_units_units_est,three_to_four_units_value_est,five_or_more_units_bldgs_est,five_or_more_units_units_est,five_or_more_units_value_est,one_unit_bldgs_rep,one_unit_units_rep,one_unit_value_rep,two_units_bldgs_rep,two_units_units_rep,two_units_value_rep,three_to_four_units_bldgs_rep,three_to_four_units_units_rep,three_to_four_units_value_rep,five_or_more_units_bldgs_rep,five_or_more_units_units_rep,five_or_more_units_value_rep
0,2019,104.0,10580.0,True,Albany-Schenectady-Troy NY,1120.0,1120.0,309397.0,20.0,40.0,7644.0,12.0,45.0,6074.0,48.0,665.0,60456.0,984.0,984.0,268946.0,18.0,36.0,6544.0,12.0,45.0,6074.0,34.0,580.0,56469.0
1,2019,430.0,48260.0,False,Weirton-Steubenville WV-OH,25.0,25.0,5782.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,25.0,5782.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2019,999.0,10180.0,False,Abilene TX,354.0,354.0,72824.0,8.0,16.0,2093.0,0.0,0.0,0.0,0.0,0.0,0.0,353.0,353.0,72596.0,8.0,16.0,2093.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2019,566.0,49660.0,False,Youngstown-Warren-Boardman OH-PA,323.0,323.0,73182.0,2.0,4.0,407.0,1.0,3.0,467.0,0.0,0.0,0.0,234.0,234.0,50054.0,2.0,4.0,407.0,1.0,3.0,467.0,0.0,0.0,0.0
4,2019,558.0,48700.0,False,Williamsport PA,66.0,66.0,16215.0,6.0,12.0,1610.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0,49.0,12095.0,6.0,12.0,1610.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14144,1980,5745.0,9999.0,False,NORTHEAST PENNSYLVANIA SMSA,1146.0,1146.0,42642.0,3.0,6.0,91.0,6.0,23.0,440.0,5.0,627.0,15798.0,1055.0,1055.0,39843.0,3.0,6.0,91.0,6.0,23.0,440.0,5.0,627.0,15798.0
14145,1980,5720.0,9999.0,False,NORFOLK-VIRGINIA BEACH-,2806.0,2806.0,146250.0,110.0,220.0,6606.0,61.0,231.0,7896.0,201.0,1521.0,44621.0,2806.0,2806.0,146250.0,110.0,220.0,6606.0,61.0,231.0,7896.0,201.0,1521.0,44621.0
14146,1980,5680.0,9999.0,False,NEWPORT NEWS-HAMPTON SMSA,1435.0,1435.0,65952.0,2.0,4.0,30.0,0.0,0.0,0.0,25.0,192.0,3146.0,1435.0,1435.0,65952.0,2.0,4.0,30.0,0.0,0.0,0.0,25.0,192.0,3146.0
14147,1980,5640.0,9999.0,False,NEWARK SMSA,2156.0,2156.0,137423.0,59.0,118.0,3407.0,13.0,47.0,1927.0,34.0,1353.0,40279.0,2154.0,2154.0,137349.0,59.0,118.0,3407.0,13.0,47.0,1927.0,32.0,1343.0,40208.0


In [45]:
# df.sort_values(by=["survey_date"], ascending=False, inplace=True)

In [46]:
# df

In [47]:
# df.info()

In [48]:
# df.moncov.unique().tolist()

In [49]:
# df["moncov"] = df.moncov.astype(bool)
# Summarize the acquired frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14149 entries, 0 to 14148
Data columns (total 29 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   survey_date                    14149 non-null  int64  
 1   csa_code                       14149 non-null  float64
 2   cbsa_code                      14149 non-null  float64
 3   moncov                         14149 non-null  bool   
 4   cbsa_name                      14149 non-null  object 
 5   one_unit_bldgs_est             14149 non-null  float64
 6   one_unit_units_est             14149 non-null  float64
 7   one_unit_value_est             14149 non-null  float64
 8   two_units_bldgs_est            14149 non-null  float64
 9   two_units_units_est            14149 non-null  float64
 10  two_units_value_est            14149 non-null  float64
 11  three_to_four_units_bldgs_est  14149 non-null  float64
 12  three_to_four_units_units_est  14149 non-null 

In [50]:
# df

In [51]:
# df = sort_values_and_reset_index(df)
# df