In [3]:
# imports
import numpy as np
import pandas as pd
import seaborn as sns
import re
from scipy import stats
from pathlib import Path

# directory and file paths
# Everything is resolved relative to the notebook's working directory.
data_dir = Path("../data/")
raw_data_dir = data_dir.joinpath("raw")
processed_data_dir = data_dir.joinpath("processed")
interim_data_dir = data_dir.joinpath("interim")

# raw inputs
raw_rental_vacancy_file = raw_data_dir.joinpath("rental_vacancy_rates_75.csv")
raw_homeowner_vacancy_file = raw_data_dir.joinpath("homeowner_vacancy_rates_75.csv")
raw_homeownership_file = raw_data_dir.joinpath("homeownership_rates_75.csv")
raw_covid_cases_file = raw_data_dir.joinpath("COVID-19_Case_Surveillance_Public_Use_Data.csv")

# interim files (same file names as the raw housing inputs, under the interim directory)
interim_rental_vacancy_file = interim_data_dir.joinpath("rental_vacancy_rates_75.csv")
interim_homeowner_vacancy_file = interim_data_dir.joinpath("homeowner_vacancy_rates_75.csv")
interim_homeownership_file = interim_data_dir.joinpath("homeownership_rates_75.csv")

# Housing Vacancies and Homeownership (Census Data)

## Rental Vacancy Rates for the 75 Largest MSAs (2015 - 2020)

In [4]:
# import and clean dataset
rental_vacancy_df = pd.read_csv(interim_rental_vacancy_file)

# Remove every run of '.' or '…' from the MSA names. regex=True is passed
# explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
# which would turn this replacement into a silent no-op.
rental_vacancy_df["Metropolitan Statistical Area"] = (
    rental_vacancy_df["Metropolitan Statistical Area"].str.replace(r'[.…]+', '', regex=True)
)

# Cells that are exactly '(z)' become missing values.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
rental_vacancy_df = rental_vacancy_df.replace('(z)', np.nan, regex=False)

# Shorten "<Ordinal> Quarter" in the column labels to Q1..Q4, then collapse any
# run of whitespace to a single space. Raw strings keep \s a valid regex escape
# without triggering Python's invalid-escape-sequence warning.
quarter_labels = {'First': 'Q1', 'Second': 'Q2', 'Third': 'Q3', 'Fourth': 'Q4'}
rental_vacancy_df = rental_vacancy_df.rename(
    columns=lambda name: re.sub(
        r'\s+',
        ' ',
        re.sub(
            r'(First|Second|Third|Fourth)\s*Quarter',
            lambda match: quarter_labels[match.group(1)],
            name,
        ),
    )
)

rental_vacancy_df

Unnamed: 0,Metropolitan Statistical Area,Q1 2020,Margin of Error1,Q2 2020,Margin of Error1.1,Q3 2020,Margin of Error1.2,Q4 2020,Margin of Error1.3,Q1 2019,...,Q4 2016,Margin of Error1.18,Q1 2015,Margin of Error3,Q2 2015,Margin of Error3.1,Q3 2015,Margin of Error3.2,Q4 2015,Margin of Error3.3
0,"Akron, OH",10.2,9.2,8.1,8.5,3.3,5.3,3.6,5.0,2.0,...,5.2,8.0,3.5,5.1,14.6,9.7,19.9,11.9,10.3,8.9
1,"Albany-Schenectady-Troy, NY",7.9,6.7,6.7,5.6,10.1,7.1,15.3,10.2,7.9,...,3.6,4.6,9.6,6.8,7.9,5.7,5.6,5.1,3.0,3.9
2,"Albuquerque, NM",4.3,2.8,6.9,4,6.8,3.4,4.1,2.5,7.1,...,11.6,4.3,7.3,3.3,7.7,3.3,7.4,3.6,6.4,3.5
3,"Allentown-Bethlehem-Easton, PA-NJ",4.5,5.9,5.6,6.6,2.9,4.2,2.5,4.9,7.7,...,4.6,6.7,3.5,4.6,3.4,5.1,6.0,6.3,2.4,3.7
4,"Atlanta-Sandy Springs-Roswell, GA1",6.9,2.5,5.1,2.3,5.8,2.4,7.7,2.7,7.8,...,6.4,2.6,9.5,2.5,8.4,2.3,7.9,2.3,6.8,2.3
5,"Austin-Round Rock, TX",6.4,4.1,9.8,5.3,7.8,5.4,3.0,2.9,12.3,...,6.7,4.1,6.6,3.5,4.5,3.1,9.4,4.3,3.4,2.6
6,"Baltimore-Columbia-Towson, MD2",4.7,3.3,6.1,3.8,9.6,4.7,8.0,4.5,7.7,...,7.9,4.5,9.2,3.8,7.3,3.5,7.8,3.6,6.9,3.4
7,"Baton Rouge, LA",5,4,5.6,4.6,9.3,5.7,10.3,6.1,8.2,...,4.7,4.1,7.4,6.3,8.0,6.0,7.4,5.9,8.5,6.8
8,"Birmingham-Hoover, AL",20.6,7.4,27.3,8.2,13.4,6.5,10.9,6.3,11.5,...,12.3,5.8,13.1,7.1,18.1,8.0,19.4,8.9,20.1,8.2
9,"Boston-Cambridge-Newton, MA-NH3",6.8,2.3,3,1.5,3.3,1.6,5.5,2.0,1.7,...,4.8,2.1,4.1,1.9,2.6,1.5,2.8,1.5,4.0,1.8


In [5]:
# create a slice of the dataframe without the margins of error
# Keep the MSA name plus the Q1-Q4 columns for each year, 2020 down to 2015.
rv_quarter_columns = ['Metropolitan Statistical Area'] + [
    f'Q{quarter} {year}'
    for year in range(2020, 2014, -1)
    for quarter in range(1, 5)
]
rv_quarters_df = rental_vacancy_df[rv_quarter_columns]
rv_quarters_df.head()

Unnamed: 0,Metropolitan Statistical Area,Q1 2020,Q2 2020,Q3 2020,Q4 2020,Q1 2019,Q2 2019,Q3 2019,Q4 2019,Q1 2018,...,Q3 2017,Q4 2017,Q1 2016,Q2 2016,Q3 2016,Q4 2016,Q1 2015,Q2 2015,Q3 2015,Q4 2015
0,"Akron, OH",10.2,8.1,3.3,3.6,2.0,3.6,4.0,14.2,2.3,...,3.0,1.0,12.4,10.9,8.0,5.2,3.5,14.6,19.9,10.3
1,"Albany-Schenectady-Troy, NY",7.9,6.7,10.1,15.3,7.9,10.8,15.5,15.6,9.0,...,4.0,8.5,3.9,3.3,4.4,3.6,9.6,7.9,5.6,3.0
2,"Albuquerque, NM",4.3,6.9,6.8,4.1,7.1,6.8,7.8,4.4,10.7,...,10.4,6.3,6.4,5.9,8.1,11.6,7.3,7.7,7.4,6.4
3,"Allentown-Bethlehem-Easton, PA-NJ",4.5,5.6,2.9,2.5,7.7,3.2,0.0,6.1,7.4,...,5.4,10.8,5.0,2.6,4.5,4.6,3.5,3.4,6.0,2.4
4,"Atlanta-Sandy Springs-Roswell, GA1",6.9,5.1,5.8,7.7,7.8,7.6,6.4,5.9,8.3,...,7.3,8.8,5.3,7.6,5.7,6.4,9.5,8.4,7.9,6.8


## Homeowner Vacancy Rates for the 75 Largest MSAs (2015 - 2020)

In [6]:
# import and clean dataset
homeowner_vacancy_df = pd.read_csv(interim_homeowner_vacancy_file)

# Remove every run of '.' or '…' from the MSA names. regex=True is passed
# explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
# which would turn this replacement into a silent no-op.
homeowner_vacancy_df["Metropolitan Statistical Area"] = (
    homeowner_vacancy_df["Metropolitan Statistical Area"].str.replace(r'[.…]+', '', regex=True)
)

# Cells that are exactly '(z)' become missing values.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
homeowner_vacancy_df = homeowner_vacancy_df.replace('(z)', np.nan, regex=False)

# Shorten "<Ordinal> Quarter" in the column labels to Q1..Q4, then collapse any
# run of whitespace to a single space. Raw strings keep \s a valid regex escape
# without triggering Python's invalid-escape-sequence warning.
quarter_labels = {'First': 'Q1', 'Second': 'Q2', 'Third': 'Q3', 'Fourth': 'Q4'}
homeowner_vacancy_df = homeowner_vacancy_df.rename(
    columns=lambda name: re.sub(
        r'\s+',
        ' ',
        re.sub(
            r'(First|Second|Third|Fourth)\s*Quarter',
            lambda match: quarter_labels[match.group(1)],
            name,
        ),
    )
)
homeowner_vacancy_df

Unnamed: 0,Metropolitan Statistical Area,Q1 2020,Margin of Error1,Q2 2020,Margin of Error1.1,Q3 2020,Margin of Error1.2,Q4 2020,Margin of Error1.3,Q1 2019,...,Q4 2016,Margin of Error1.18,Q1 2015,Margin of Error3,Q2 2015,Margin of Error3.1,Q3 2015,Margin of Error3.2,Q4 2015,Margin of Error3.3
0,"Akron, OH",0.5,1.4,0.5,1.4,0.5,1.5,,,0.0,...,0.5,1.6,0.0,,0.5,1.3,0.8,1.5,0.5,1.3
1,"Albany-Schenectady-Troy, NY",0.9,1.9,0.9,1.9,5.3,4.2,4.9,4,1.6,...,2.2,2.9,1.3,2,2.8,2.7,1.8,2.1,1.1,1.7
2,"Albuquerque, NM",2,1.3,1.6,1.1,1,1,0.7,0.8,1.9,...,1.5,1.3,2.5,1.6,3.9,2,2.5,1.6,2.1,1.5
3,"Allentown-Bethlehem-Easton, PA-NJ",0.6,1.4,0.5,1.4,0.5,1.4,1.3,2.2,1.2,...,1.8,2.9,0.5,1.2,2.5,2.7,2.1,2.5,2.0,2.6
4,"Atlanta-Sandy Springs-Roswell, GA1",1.2,0.8,1,0.7,0.3,0.4,0.8,0.7,0.9,...,1.5,1,2.9,1.2,2.4,1.1,1.8,0.9,1.7,0.9
5,"Austin-Round Rock, TX",2.9,2.3,2.6,2.1,1.8,1.7,0.9,1.3,1.5,...,1.7,1.9,0.9,1.2,0.2,0.6,3.2,2.3,1.1,1.3
6,"Baltimore-Columbia-Towson, MD2",2.2,1.6,0.2,0.5,0.8,1,0.7,0.9,1.9,...,5.4,2.6,2.6,1.7,4.2,2.1,2.1,1.4,2.8,1.6
7,"Baton Rouge, LA",2.6,2,1.1,1.3,0.4,0.8,2.7,2.1,2.7,...,2.0,2.1,2.3,2.8,1.1,1.9,2.2,2.5,0.9,1.7
8,"Birmingham-Hoover, AL",2.1,1.8,0.2,0.5,0.6,0.9,0.5,0.8,2.3,...,1.9,1.9,3.1,2.6,0.9,1.3,1.7,1.9,3.5,2.9
9,"Boston-Cambridge-Newton, MA-NH3",0.4,0.5,0.3,0.4,0.2,0.3,0.5,0.5,1.0,...,0.4,0.5,0.4,0.5,1.0,0.8,1.8,1,1.1,0.8


In [7]:
# create a slice of the dataframe without the margins of error
# Keep the MSA name plus the Q1-Q4 columns for each year, 2020 down to 2015.
hv_quarter_columns = ['Metropolitan Statistical Area'] + [
    f'Q{quarter} {year}'
    for year in range(2020, 2014, -1)
    for quarter in range(1, 5)
]
hv_quarters_df = homeowner_vacancy_df[hv_quarter_columns]
hv_quarters_df.head()

Unnamed: 0,Metropolitan Statistical Area,Q1 2020,Q2 2020,Q3 2020,Q4 2020,Q1 2019,Q2 2019,Q3 2019,Q4 2019,Q1 2018,...,Q3 2017,Q4 2017,Q1 2016,Q2 2016,Q3 2016,Q4 2016,Q1 2015,Q2 2015,Q3 2015,Q4 2015
0,"Akron, OH",0.5,0.5,0.5,,0.0,1.3,1.5,2.3,0.5,...,0.0,1.1,0.0,0.4,1.2,0.5,0.0,0.5,0.8,0.5
1,"Albany-Schenectady-Troy, NY",0.9,0.9,5.3,4.9,1.6,4.8,2.1,3.8,1.8,...,1.8,2.9,3.1,2.6,0.4,2.2,1.3,2.8,1.8,1.1
2,"Albuquerque, NM",2.0,1.6,1.0,0.7,1.9,1.4,1.8,2.3,1.9,...,2.6,2.1,1.9,1.9,2.4,1.5,2.5,3.9,2.5,2.1
3,"Allentown-Bethlehem-Easton, PA-NJ",0.6,0.5,0.5,1.3,1.2,0.4,3.3,0.9,0.0,...,1.6,1.6,0.4,0.4,2.0,1.8,0.5,2.5,2.1,2.0
4,"Atlanta-Sandy Springs-Roswell, GA1",1.2,1.0,0.3,0.8,0.9,1.6,1.6,1.2,1.2,...,1.2,1.3,1.6,1.6,1.8,1.5,2.9,2.4,1.8,1.7


## Homeownership Rates for the 75 Largest MSAs (2015 - 2020)

In [8]:
# import and clean dataset
homeownership_df = pd.read_csv(interim_homeownership_file)

# Remove every run of '.' or '…' from the MSA names. regex=True is passed
# explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
# which would turn this replacement into a silent no-op.
homeownership_df["Metropolitan Statistical Area"] = (
    homeownership_df["Metropolitan Statistical Area"].str.replace(r'[.…]+', '', regex=True)
)

# Cells that are exactly '(z)' become missing values.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
homeownership_df = homeownership_df.replace('(z)', np.nan, regex=False)

# Shorten "<Ordinal> Quarter" in the column labels to Q1..Q4, then collapse any
# run of whitespace to a single space. Raw strings keep \s a valid regex escape
# without triggering Python's invalid-escape-sequence warning.
quarter_labels = {'First': 'Q1', 'Second': 'Q2', 'Third': 'Q3', 'Fourth': 'Q4'}
homeownership_df = homeownership_df.rename(
    columns=lambda name: re.sub(
        r'\s+',
        ' ',
        re.sub(
            r'(First|Second|Third|Fourth)\s*Quarter',
            lambda match: quarter_labels[match.group(1)],
            name,
        ),
    )
)
homeownership_df

Unnamed: 0,Metropolitan Statistical Area,Q1 2020,Margin of Error1,Q2 2020,Margin of Error1.1,Q3 2020,Margin of Error1.2,Q4 2020,Margin of Error1.3,Q1 2019,...,Q4 2016,Margin of Error1.18,Q1 2015,Margin of Error3,Q2 2015,Margin of Error3.1,Q3 2015,Margin of Error3.2,Q4 2015,Margin of Error3.3
0,"Akron, OH",71.9,9.7,71.7,9.9,68.5,10.1,66.4,9.6,68.3,...,73.2,10.2,68.6,8.7,73.9,8.0,79.9,7.3,73.4,8.4
1,"Albany-Schenectady-Troy, NY",62.9,9.7,57.7,9.6,62.3,9.5,73.1,9.2,60.9,...,60.9,9.2,65.9,8.2,63.5,7.7,66.6,7.5,67.5,7.4
2,"Albuquerque, NM",70.9,4.4,77.2,4.1,66.9,4.7,63.1,4.7,72.1,...,64.7,4.8,63.8,4.6,60.5,4.8,65.9,4.7,67.4,4.7
3,"Allentown-Bethlehem-Easton, PA-NJ",73.1,8.7,69.3,9.5,61.4,9.6,71.9,9.6,70.5,...,68.4,10.2,69.2,7.7,72.7,8.0,71.5,8.0,63.5,8.6
4,"Atlanta-Sandy Springs-Roswell, GA1",63.7,3.8,68.3,3.6,66.6,3.6,67.0,3.7,62.6,...,61.6,3.9,60.5,3.3,59.9,3.2,60.9,3.2,65.5,3.1
5,"Austin-Round Rock, TX",60.2,6.8,66.3,6.5,73.4,6.0,61.6,6.6,58.8,...,55.7,6.7,57.5,5.7,59.0,5.7,56.8,6.0,56.9,5.6
6,"Baltimore-Columbia-Towson, MD2",68.3,5.3,71.0,5.1,70.6,5.2,72.8,5.1,62.4,...,68.6,5.4,62.0,4.9,64.6,4.8,66.6,4.6,67.2,4.5
7,"Baton Rouge, LA",68.4,6.3,75.4,5.8,72.9,6.2,71.6,6.4,66.1,...,64.1,6.9,62.7,8.8,61.1,8.4,64.8,7.9,68.4,8.1
8,"Birmingham-Hoover, AL",73.7,5.9,77.3,5.5,75.9,5.6,77.1,5.6,63.0,...,66.0,6.5,68.7,7.1,72.5,6.5,75.5,6.5,68.1,7.3
9,"Boston-Cambridge-Newton, MA-NH3",62.8,3.5,61.5,3.5,60.7,3.5,59.7,3.5,60.9,...,60.6,3.6,60.5,3.5,58.2,3.5,58.7,3.4,59.5,3.5


In [9]:
# create a slice of the dataframe without the margins of error
# Keep the MSA name plus the Q1-Q4 columns for each year, 2020 down to 2015.
h_quarter_columns = ['Metropolitan Statistical Area'] + [
    f'Q{quarter} {year}'
    for year in range(2020, 2014, -1)
    for quarter in range(1, 5)
]
h_quarters_df = homeownership_df[h_quarter_columns]

# COVID-19 Case Surveillance Public Use Data (CDC Data)

In [10]:
# import dataset
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed inferred types.
# NOTE(review): the leftover warning text in this cell's saved output looks like
# a mixed-dtype warning from read_csv — confirm, and consider passing an explicit
# dtype= mapping once the column types are settled.
covid_cases_df = pd.read_csv(raw_covid_cases_file, low_memory=False)
covid_cases_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,cdc_case_earliest_dt,cdc_report_dt,pos_spec_dt,onset_dt,current_status,sex,age_group,race_ethnicity_combined,hosp_yn,icu_yn,death_yn,medcond_yn
0,2020/01/01,2021/01/31,2020/01/01,,Laboratory-confirmed case,Female,0 - 9 Years,Unknown,Missing,Missing,No,Missing
1,2020/01/01,2021/02/02,2020/01/01,,Laboratory-confirmed case,Male,0 - 9 Years,Unknown,Missing,Missing,No,Missing
2,2020/01/02,2021/01/27,2020/01/02,,Laboratory-confirmed case,Male,0 - 9 Years,Unknown,Missing,Missing,No,Missing
3,2020/01/02,2021/02/02,2020/01/02,,Laboratory-confirmed case,Male,0 - 9 Years,Unknown,Missing,Missing,No,Missing
4,2021/01/01,2020/01/03,2020/01/03,2021/01/01,Laboratory-confirmed case,Male,0 - 9 Years,"White, Non-Hispanic",No,Missing,No,Yes
