In [24]:
# imports
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from pathlib import Path

# --- data locations ---------------------------------------------------------
# Root of the data tree; being a relative path, it resolves against the
# working directory the notebook is run from.
data_dir = Path("../data/")

# One sub-directory per processing stage.
raw_data_dir = data_dir.joinpath("raw")
processed_data_dir = data_dir.joinpath("processed")
interim_data_dir = data_dir.joinpath("interim")

# Raw inputs: the three housing tables plus the COVID-19 case file.
raw_rental_vacancy_file = raw_data_dir.joinpath("rental_vacancy_rates_75.csv")
raw_homeowner_vacancy_file = raw_data_dir.joinpath("homeowner_vacancy_rates_75.csv")
raw_homeownership_file = raw_data_dir.joinpath("homeownership_rates_75.csv")
raw_covid_cases_file = raw_data_dir.joinpath("COVID-19_Case_Surveillance_Public_Use_Data.csv")

# Interim copies of the housing tables (same file names, "interim" directory).
interim_rental_vacancy_file = interim_data_dir.joinpath("rental_vacancy_rates_75.csv")
interim_homeowner_vacancy_file = interim_data_dir.joinpath("homeowner_vacancy_rates_75.csv")
interim_homeownership_file = interim_data_dir.joinpath("homeownership_rates_75.csv")

# Housing Vacancies and Homeownership (Census Data)

## Rental Vacancy Rates for the 75 Largest MSAs (2015 - 2020)

In [25]:
# Read the interim rental vacancy table and preview its first five rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably a row
# index that was written into the CSV -- confirm, then consider index_col=0.
rental_vacancy_df = pd.read_csv(filepath_or_buffer=interim_rental_vacancy_file)
rental_vacancy_df.head(n=5)

Unnamed: 0,Metropolitan Statistical Area,First Quarter 2020,Margin of Error1,Second Quarter 2020,Margin of Error1.1,Third Quarter 2020,Margin of Error1.2,Fourth Quarter 2020,Margin of Error1.3,First Quarter 2019,...,Fourth Quarter 2016,Margin of Error1.18,First Quarter 2015,Margin of Error3,Second Quarter 2015,Margin of Error3.1,Third Quarter 2015,Margin of Error3.2,Fourth Quarter 2015,Margin of Error3.3
0,"Akron, OH .......................................",10.2,9.2,8.1,8.5,3.3,5.3,3.6,5.0,2.0,...,5.2,8.0,3.5,5.1,14.6,9.7,19.9,11.9,10.3,8.9
1,"Albany-Schenectady-Troy, NY .....................",7.9,6.7,6.7,5.6,10.1,7.1,15.3,10.2,7.9,...,3.6,4.6,9.6,6.8,7.9,5.7,5.6,5.1,3.0,3.9
2,"Albuquerque, NM..................................",4.3,2.8,6.9,4.0,6.8,3.4,4.1,2.5,7.1,...,11.6,4.3,7.3,3.3,7.7,3.3,7.4,3.6,6.4,3.5
3,"Allentown-Bethlehem-Easton, PA-NJ................",4.5,5.9,5.6,6.6,2.9,4.2,2.5,4.9,7.7,...,4.6,6.7,3.5,4.6,3.4,5.1,6.0,6.3,2.4,3.7
4,"Atlanta-Sandy Springs-Roswell, GA1...............",6.9,2.5,5.1,2.3,5.8,2.4,7.7,2.7,7.8,...,6.4,2.6,9.5,2.5,8.4,2.3,7.9,2.3,6.8,2.3


## Homeowner Vacancy Rates for the 75 Largest MSAs (2015 - 2020)

In [26]:
# Read the interim homeowner vacancy table and preview its first five rows.
# NOTE(review): the preview contains "(z)" placeholders, so the affected
# columns are presumably parsed as text rather than numbers -- confirm what
# "(z)" stands for in the source tables before converting them.
# NOTE(review): "Unnamed: 0" looks like a saved row index -- confirm, then
# consider index_col=0.
homeowner_vacancy_df = pd.read_csv(filepath_or_buffer=interim_homeowner_vacancy_file)
homeowner_vacancy_df.head(n=5)

Unnamed: 0,Metropolitan Statistical Area,First Quarter 2020,Margin of Error1,Second Quarter 2020,Margin of Error1.1,Third Quarter 2020,Margin of Error1.2,Fourth Quarter 2020,Margin of Error1.3,First Quarter 2019,...,Fourth Quarter 2016,Margin of Error1.18,First Quarter 2015,Margin of Error3,Second Quarter 2015,Margin of Error3.1,Third Quarter 2015,Margin of Error3.2,Fourth Quarter 2015,Margin of Error3.3
0,"Akron, OH .......................................",0.5,1.4,0.5,1.4,0.5,1.5,(z),(z),0.0,...,0.5,1.6,0.0,(z),0.5,1.3,0.8,1.5,0.5,1.3
1,"Albany-Schenectady-Troy, NY .....................",0.9,1.9,0.9,1.9,5.3,4.2,4.9,4,1.6,...,2.2,2.9,1.3,2,2.8,2.7,1.8,2.1,1.1,1.7
2,"Albuquerque, NM..................................",2.0,1.3,1.6,1.1,1.0,1.0,0.7,0.8,1.9,...,1.5,1.3,2.5,1.6,3.9,2.0,2.5,1.6,2.1,1.5
3,"Allentown-Bethlehem-Easton, PA-NJ................",0.6,1.4,0.5,1.4,0.5,1.4,1.3,2.2,1.2,...,1.8,2.9,0.5,1.2,2.5,2.7,2.1,2.5,2.0,2.6
4,"Atlanta-Sandy Springs-Roswell, GA1...............",1.2,0.8,1.0,0.7,0.3,0.4,0.8,0.7,0.9,...,1.5,1.0,2.9,1.2,2.4,1.1,1.8,0.9,1.7,0.9


## Homeownership Rates for the 75 Largest MSAs (2015 - 2020)

In [27]:
# Read the interim homeownership table and preview its first five rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably a row
# index that was written into the CSV -- confirm, then consider index_col=0.
homeownership_df = pd.read_csv(filepath_or_buffer=interim_homeownership_file)
homeownership_df.head(n=5)

Unnamed: 0,Metropolitan Statistical Area,First Quarter 2020,Margin of Error1,Second Quarter 2020,Margin of Error1.1,Third Quarter 2020,Margin of Error1.2,Fourth Quarter 2020,Margin of Error1.3,First Quarter 2019,...,Fourth Quarter 2016,Margin of Error1.18,First Quarter 2015,Margin of Error3,Second Quarter 2015,Margin of Error3.1,Third Quarter 2015,Margin of Error3.2,Fourth Quarter 2015,Margin of Error3.3
0,"Akron, OH .......................................",71.9,9.7,71.7,9.9,68.5,10.1,66.4,9.6,68.3,...,73.2,10.2,68.6,8.7,73.9,8.0,79.9,7.3,73.4,8.4
1,"Albany-Schenectady-Troy, NY .....................",62.9,9.7,57.7,9.6,62.3,9.5,73.1,9.2,60.9,...,60.9,9.2,65.9,8.2,63.5,7.7,66.6,7.5,67.5,7.4
2,"Albuquerque, NM..................................",70.9,4.4,77.2,4.1,66.9,4.7,63.1,4.7,72.1,...,64.7,4.8,63.8,4.6,60.5,4.8,65.9,4.7,67.4,4.7
3,"Allentown-Bethlehem-Easton, PA-NJ................",73.1,8.7,69.3,9.5,61.4,9.6,71.9,9.6,70.5,...,68.4,10.2,69.2,7.7,72.7,8.0,71.5,8.0,63.5,8.6
4,"Atlanta-Sandy Springs-Roswell, GA1...............",63.7,3.8,68.3,3.6,66.6,3.6,67.0,3.7,62.6,...,61.6,3.9,60.5,3.3,59.9,3.2,60.9,3.2,65.5,3.1


# COVID-19 Case Surveillance Public Use Data (CDC Data)

In [12]:
# Import the raw CDC case surveillance file and preview its first rows.
#
# The saved output of the original load includes a stray stderr line
# ("interactivity=interactivity, ..."), presumably the tail of pandas'
# mixed-dtype warning raised while parsing the file in chunks -- confirm.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so no column ends up with mixed types.
# It costs more memory during parsing; if that becomes a problem, pass an
# explicit dtype= mapping instead.
covid_cases_df = pd.read_csv(raw_covid_cases_file, low_memory=False)
covid_cases_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,cdc_case_earliest_dt,cdc_report_dt,pos_spec_dt,onset_dt,current_status,sex,age_group,race_ethnicity_combined,hosp_yn,icu_yn,death_yn,medcond_yn
0,2020/01/01,2021/01/31,2020/01/01,,Laboratory-confirmed case,Female,0 - 9 Years,Unknown,Missing,Missing,No,Missing
1,2020/01/01,2021/02/02,2020/01/01,,Laboratory-confirmed case,Male,0 - 9 Years,Unknown,Missing,Missing,No,Missing
2,2020/01/02,2021/01/27,2020/01/02,,Laboratory-confirmed case,Male,0 - 9 Years,Unknown,Missing,Missing,No,Missing
3,2020/01/02,2021/02/02,2020/01/02,,Laboratory-confirmed case,Male,0 - 9 Years,Unknown,Missing,Missing,No,Missing
4,2021/01/01,2020/01/03,2020/01/03,2021/01/01,Laboratory-confirmed case,Male,0 - 9 Years,"White, Non-Hispanic",No,Missing,No,Yes
