In [2]:
import numpy as np
import pandas as pd
from uszipcode import SearchEngine
# Shared ZIP-code lookup engine; the ZORI cell below calls search.by_zipcode(...) on it.
search = SearchEngine(simple_zipcode = True)
import us_state_abbrev
# Lookup table queried with statedict.get(<state name>) in later cells
# (presumably full state name -> postal abbreviation; confirm against us_state_abbrev).
statedict = us_state_abbrev.us_state_abbrev

## Zillow Rental Index Processing

In [2]:
# Load the Zillow rental index (one row per ZIP code) and attach each ZIP code's
# county and state, looked up through the shared `search` engine.
zori = pd.read_csv('./data/Zip_ZORI_AllHomesPlusMultifamily_SSA.csv')
zori = pd.concat([zori['RegionName'], zori.iloc[:, 4:]], axis = 1)


def _county_and_state(zipcode):
    """Return (county, state) for one ZIP code, with 'NA' standing in for missing values.

    The lookup is done once per ZIP code and the fields are read by name rather
    than by position in .values(). A trailing ' County' is removed only when it
    is actually present (the same rule the ZHVI cell applies), so names such as
    'District of Columbia' are no longer truncated.
    """
    match = search.by_zipcode(zipcode)
    county = getattr(match, 'county', None)
    state = getattr(match, 'state', None)
    if county is None:
        county = 'NA'
    elif county.endswith(' County'):
        county = county[:-len(' County')]
    return county, ('NA' if state is None else state)


_lookups = [_county_and_state(zipcode) for zipcode in zori['RegionName']]
zori['County'] = [county for county, _ in _lookups]
zori['State'] = [state for _, state in _lookups]
# NOTE(review): position 1 is the first column carried over from the CSV and the last
# three positions are the last carried-over column plus County/State, so this slice
# drops the first and the last carried-over column -- confirm that this is intended.
zori = pd.concat([zori[['RegionName', 'County', 'State']], zori.iloc[:, 2:-3]], axis = 1)
zori = zori.rename(columns = {'RegionName': 'ZipCode'})

In [3]:
# Reshape ZORI from wide (one column per month) to long (one row per ZIP code and month).
# The first three columns identify the ZIP code; everything after them is a monthly value.
zori_month_count = len(zori.columns[3:])
zori_row_positions = np.repeat(np.arange(len(zori)), zori_month_count)
zori_keys = zori.iloc[zori_row_positions, :3].reset_index(drop = True)

# Transposing first makes melt emit all months of row 0, then all months of row 1, ...,
# which lines up with the repeated key rows built above.
zori_values = (zori.iloc[:, 3:].T.reset_index()
                   .melt(id_vars = 'index')
                   .drop(columns = 'variable')
                   .rename(columns = {'index': 'Date', 'value': 'ZORI'}))
zori_values['Year'] = [int(date[:4]) for date in zori_values['Date']]
zori_values['Month'] = [int(date[-2:]) for date in zori_values['Date']]
zori_values['Year_Month'] = zori_values['Year'].map(str) + '_' + zori_values['Month'].map(str)
zori = pd.concat([zori_keys, zori_values.drop(columns = 'Date')], axis = 1)

In [58]:
# Persist the long-format ZORI frame; with the default settings to_csv also writes
# the row index as an unnamed first column.
zori.to_csv('./data/cleandata/clean_zori.csv')

## Zillow Home Value Index Processing

In [4]:
# Load the Zillow home value index (one row per ZIP code) and keep only the
# ZIP code, county, state and the value columns from position 9 onwards.
zhvi = pd.read_csv('./data/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_mon.csv')
# Remove a trailing " County" from the county names. The vectorised .str accessor
# leaves a missing county name as NaN, where the per-element slice used to raise.
zhvi['CountyName'] = zhvi['CountyName'].str.replace(r' County$', '', regex = True)
zhvi = pd.concat([zhvi[['RegionName', 'CountyName', 'State']], zhvi.iloc[:, 9:]], axis = 1)
# rename() matches whole labels, so no other column can be altered by accident.
zhvi = zhvi.rename(columns = {'RegionName': 'ZipCode', 'CountyName': 'County'})

In [5]:
# Reshape ZHVI from wide (one column per month) to long (one row per ZIP code and month).
# The first three columns identify the ZIP code; everything after them is a monthly value.
zhvi_month_count = len(zhvi.columns[3:])
zhvi_row_positions = np.repeat(np.arange(len(zhvi)), zhvi_month_count)
zhvi_keys = zhvi.iloc[zhvi_row_positions, :3].reset_index(drop = True)

# Transposing first makes melt emit all months of row 0, then all months of row 1, ...,
# which lines up with the repeated key rows built above.
zhvi_values = (zhvi.iloc[:, 3:].T.reset_index()
                   .melt(id_vars = 'index')
                   .drop(columns = 'variable')
                   .rename(columns = {'index': 'Date', 'value': 'ZHVI'}))
# Column labels are sliced as YYYY at [0:4] and MM at [5:7].
zhvi_values['Year'] = [int(date[:4]) for date in zhvi_values['Date']]
zhvi_values['Month'] = [int(date[5:7]) for date in zhvi_values['Date']]
zhvi_values['Year_Month'] = zhvi_values['Year'].map(str) + '_' + zhvi_values['Month'].map(str)
zhvi = pd.concat([zhvi_keys, zhvi_values.drop(columns = 'Date')], axis = 1)

In [59]:
# Persist the long-format ZHVI frame; with the default settings to_csv also writes
# the row index as an unnamed first column.
zhvi.to_csv('./data/cleandata/clean_zhvi.csv')

## Air Quality Index Data Processing

In [6]:
def airqcompiler(start, end):
    """Compile the daily AQI-by-county csv files for the years 'start'..'end' into
    one dataframe of monthly mean AQI per county.

    Parameters
    ----------
    start, end : int
        First and last year to read (both inclusive). One file per year is read from
        './data/daily_aqi_by_county_<year>/daily_aqi_by_county_<year>.csv'.

    Returns
    -------
    pandas.DataFrame
        Columns 'County', 'State', 'Year', 'Month', 'Year_Month', 'AQI', where 'AQI'
        is the mean of the daily values in that month. The per-year frames are
        stacked without renumbering the index. An empty frame is returned when the
        year range is empty.

    Notes
    -----
    State names are translated with ``statedict.get``; names missing from that table
    become None and their rows are left out by the groupby (NA keys are dropped).
    """
    yearly_frames = []
    for i in range(start, end + 1):
        temp = pd.read_csv(f'./data/daily_aqi_by_county_{i}/daily_aqi_by_county_{i}.csv',
                           usecols = ['county Name', 'State Name', 'Date', 'AQI'])
        temp = temp.rename(columns = {'county Name': 'County', 'State Name': 'State'})
        temp['State'] = temp['State'].map(statedict.get)
        # Parse the whole column at once instead of calling to_datetime per element.
        dates = pd.to_datetime(temp['Date'], format = '%Y-%m-%d')
        temp['Month'] = dates.dt.month.astype('int64')
        temp['Year'] = dates.dt.year.astype('int64')
        temp['Year_Month'] = temp['Year'].map(str) + '_' + temp['Month'].map(str)
        # Average only AQI: taking the mean of the whole frame would also touch the raw
        # 'Date' column, whose handling differs between pandas versions.
        temp = (temp.groupby(['County', 'State', 'Year', 'Month', 'Year_Month'])[['AQI']]
                    .mean()
                    .reset_index())
        yearly_frames.append(temp)
        print(f'Finished compiling year {i}.')
    if not yearly_frames:
        return pd.DataFrame()
    # Concatenate once at the end rather than re-copying the result on every iteration.
    return pd.concat(yearly_frames, axis = 0)

In [7]:
# Build the monthly AQI table from the yearly files for 2014 through 2020 (both inclusive).
airq = airqcompiler(2014, 2020)

Finished compiling year 2014.
Finished compiling year 2015.
Finished compiling year 2016.
Finished compiling year 2017.
Finished compiling year 2018.
Finished compiling year 2019.
Finished compiling year 2020.


In [61]:
# Persist the monthly AQI frame; with the default settings to_csv also writes
# the row index as an unnamed first column.
airq.to_csv('./data/cleandata/clean_airq.csv')

## Population Data Processing

In [8]:
def _strip_county_suffix(area):
    """Cut an area label just before its first 'County' (or, failing that, 'Municipio')."""
    for marker in ('County', 'Municipio'):
        if marker in area:
            # The extra -1 also removes the character in front of the marker.
            return area[:area.find(marker) - 1]
    return area


def popcompiler(start, end):
    """Compile the yearly population csv files from 'start' to 'end' into one dataframe.

    Parameters
    ----------
    start, end : int
        First and last year to read (both inclusive). One file per year is read from
        './data/productDownload_2020-12-09T144241/ACSDT1Y<year>.B01003_data.csv',
        using its second line as the header.

    Returns
    -------
    pandas.DataFrame
        Columns 'County', 'Population', 'State', 'Year' with one row per area and
        year. 'State' is whatever follows the first ', ' of the area name, translated
        with ``statedict.get`` (None when it is not in that table). The per-year
        frames are stacked without renumbering the index. An empty frame is returned
        when the year range is empty.
    """
    yearly_frames = []
    for i in range(start, end + 1):
        temp = pd.read_csv(f'./data/productDownload_2020-12-09T144241/ACSDT1Y{i}.B01003_data.csv', header = 1)
        # Work on an explicit copy so the column assignments below do not target a slice.
        temp = temp[['Geographic Area Name', 'Estimate!!Total']].copy()
        temp.columns = ['County', 'Population']
        # The state must be read before the county name is shortened.
        temp['State'] = temp['County'].map(lambda area: statedict.get(area[(area.find(',') + 2):]))
        temp['County'] = temp['County'].map(_strip_county_suffix)
        temp['Year'] = i
        yearly_frames.append(temp)
    if not yearly_frames:
        return pd.DataFrame()
    # Concatenate once at the end rather than re-copying the result on every iteration.
    return pd.concat(yearly_frames, axis = 0)

In [154]:
# Clean up the population dataframe further and interpolate the yearly estimates to monthly values.
# Between two consecutive years the population is assumed to grow at a constant monthly rate:
#     P(Y, m) = P[Y-1] * (P[Y] / P[Y-1]) ** (m / 12),   m = 1..12
# so December of year Y equals the annual estimate P[Y]. The interpolation always starts from
# the untouched ANNUAL estimate of the previous year; overwriting the frame year by year would
# feed already-interpolated values into the next year and make the series jump back every January.
FIRST_POPULATION_YEAR, LAST_POPULATION_YEAR = 2012, 2019
annual_population = popcompiler(FIRST_POPULATION_YEAR, LAST_POPULATION_YEAR)
month_numbers = np.arange(1, 13, dtype = 'int64')
monthly_frames = []
for (_, _), county_df in annual_population.groupby(['County', 'State']):
    years = county_df['Year'].to_numpy()
    annual = county_df['Population'].to_numpy(dtype = float)
    # Annual estimate of the preceding calendar year (NaN when the county has no such year).
    estimate_by_year = dict(zip(years, annual))
    previous = np.array([estimate_by_year.get(year - 1, np.nan) for year in years], dtype = float)
    has_previous = np.isin(years - 1, years)

    # Twelve rows per annual row, months 1..12.
    row_positions = np.arange(len(years)).repeat(12)
    county_df = county_df.iloc[row_positions].reset_index(drop = True)
    county_df['Month'] = np.tile(month_numbers, len(years))
    with np.errstate(divide = 'ignore', invalid = 'ignore'):
        growth = (annual / previous)[row_positions] ** (county_df['Month'].to_numpy() / 12)
    # Years without a preceding year (the first one, or one after a gap) keep the flat annual value.
    county_df['Population'] = np.where(has_previous[row_positions],
                                       previous[row_positions] * growth,
                                       annual[row_positions])

    # The first compiled year only serves as the starting point of the interpolation.
    county_df = county_df[county_df['Year'] != FIRST_POPULATION_YEAR].copy()
    county_df['Year_Month'] = county_df['Year'].map(str) + '_' + county_df['Month'].map(str)
    monthly_frames.append(county_df[['County', 'State', 'Year', 'Month', 'Year_Month', 'Population']])
# Concatenate once at the end rather than re-copying the result for every county.
population = pd.concat(monthly_frames, axis = 0) if monthly_frames else pd.DataFrame()

In [156]:
# Persist the monthly population frame; with the default settings to_csv also writes
# the row index as an unnamed first column.
population.to_csv('./data/cleandata/clean_population.csv')

## Unemployment Data Processing

In [151]:
# Clean up the unemployment dataframe: turn the wide table (one row per county, one column
# per period) into one row per county and month with a numeric 'Unemployment' column.
unemployment = pd.read_csv('./data/US_unemployment.csv', index_col = 0)
unemployment = unemployment[unemployment['County'].notnull()].copy()
# Fold the state into the county label so that it survives the transpose below.
unemployment['County'] = unemployment['County'] + ', ' + unemployment['State']
unemployment = unemployment.drop('State', axis = 1)

# Transpose: every "County, ST" label becomes a column, every row is one period.
# NOTE(review): using row 0 as the header assumes 'County' is the first remaining column.
unemployment = unemployment.T
unemployment.columns = unemployment.iloc[0, :]
unemployment = unemployment.iloc[1:, :].reset_index()
unemployment.columns.name = None
# The period label ends in the four-digit year.
unemployment['Year'] = unemployment['index'].map(lambda period: int(period[-4:]))
# NOTE(review): months are assigned by position (1..12 repeating), which assumes the periods
# are in calendar order, start in January and cover whole years -- confirm against the raw file.
unemployment = pd.concat([unemployment, pd.DataFrame(list(range(1, 13)) * int(unemployment.shape[0] / 12),
                                                     columns = ['Month'])], axis = 1)
unemployment['Year_Month'] = unemployment['Year'].map(str) + '_' + unemployment['Month'].map(str)
unemployment = unemployment.drop('index', axis = 1)

# Stack the county columns below each other. The pieces are collected in a list and
# concatenated once; concatenating inside the loop re-copies the growing result every time.
county_frames = []
for county in unemployment.columns[:-3].to_list():
    county_frame = unemployment[[county, 'Year', 'Month', 'Year_Month']]
    county_frame = county_frame.assign(County = county)
    county_frame.columns = ['Unemployment', 'Year', 'Month', 'Year_Month', 'County']
    county_frames.append(county_frame)
if county_frames:
    unemployment = pd.concat(county_frames, axis = 0)
else:
    unemployment = pd.DataFrame(columns = ['Unemployment', 'Year', 'Month', 'Year_Month', 'County'])

# Split the combined label back into county name and two-letter state.
unemployment['State'] = unemployment['County'].map(lambda county: county[-2:])
unemployment['County'] = unemployment['County'].map(lambda county: county[:county.find(',')])


def _parse_unemployment(raw):
    """Convert one raw cell to float: drop anything from the first '(' onwards and
    return None for blank cells and the 'No Data Available' placeholder."""
    text = str(raw)
    paren_at = text.find('(')
    if paren_at != -1:
        text = text[:paren_at]
    if text.strip() in ('', 'No Data Available'):
        return None
    return float(text)


unemployment['Unemployment'] = unemployment['Unemployment'].map(_parse_unemployment)

In [153]:
# Persist the long-format unemployment frame; with the default settings to_csv also
# writes the row index as an unnamed first column.
unemployment.to_csv('./data/cleandata/clean_unemployment.csv')

## Education Data Processing

In [11]:
def _education_county_name(label):
    """Cut a county label just before its first 'County' (or, failing that, 'Municipio')."""
    if 'County' in label:
        return label[:(label.find('County') - 1)]
    if 'Municipio' in label:
        return label[:(label.find('Municipio') - 1)]
    return label


# Load the education table, drop the national total row and normalise county / state names.
education = pd.read_csv('./data/US_education.csv').drop(columns = 'Unnamed: 0')
education = education[education['County'] != 'United States']
education['County'] = education['County'].map(_education_county_name)
# The state table is queried with 'District Of Columbia' (capital O), so align the
# spelling before translating the state names.
education['State'] = (education['State']
                      .replace('District of Columbia', 'District Of Columbia')
                      .map(statedict.get))

In [65]:
# Persist the cleaned education frame; with the default settings to_csv also writes
# the row index as an unnamed first column.
education.to_csv('./data/cleandata/clean_education.csv')

## Permits Data Processing

In [19]:
# Load the permits table, convert month names to numbers and build the Year_Month key.
permits = pd.read_csv('./data/US_permits.csv').drop('Unnamed: 0', axis = 1)
month_to_number = {'January' : 1, 'February' : 2, 'March' : 3, 'April' : 4, 'May' : 5, 'June' : 6, 'July' : 7, 
                   'August' : 8, 'September' : 9, 'October' : 10, 'November' : 11, 'December' : 12}
permits['Month'] = permits['Month'].map(month_to_number.get)
permits['Year_Month'] = permits['Year'].map(str) + '_' + permits['Month'].map(str)

# Minnesota rows are collapsed to a single row per month, summing the numeric columns.
# NOTE(review): the reason only 'MN' needs this is not visible here -- confirm against the raw file.
# numeric_only is pinned because the default for groupby sums differs between pandas versions
# (text columns are either dropped or concatenated).
mn_totals = (permits[permits['State'] == 'MN']
             .groupby(['Year', 'Month', 'Year_Month', 'State'])
             .sum(numeric_only = True)
             .reset_index())
permits = permits[permits['State'] != 'MN']
# Re-number the rows of the combined frame. The result has to be assigned back:
# calling drop(..., inplace = True) on reset_index()'s return value only changes a throwaway copy.
permits = pd.concat([permits, mn_totals], axis = 0).reset_index(drop = True)

In [20]:
# Persist the cleaned permits frame; with the default settings to_csv also writes
# the row index as an unnamed first column.
permits.to_csv('./data/cleandata/clean_permits.csv')

## Median Income and Total Households Data Processing

In [3]:
def _income_county_name(label):
    """Cut a county label just before its first 'County' (or, failing that, 'Municipio')."""
    if 'County' in label:
        return label[:(label.find('County') - 1)]
    if 'Municipio' in label:
        return label[:(label.find('Municipio') - 1)]
    return label


# Load the income / households table, shorten the county labels and add the Year_Month key.
IandH = pd.read_csv('./data/income_and_households.csv').drop(columns = 'Unnamed: 0')
IandH['County'] = IandH['County'].map(_income_county_name)
year_text = IandH['Year'].map(str)
month_text = IandH['Month'].map(str)
IandH['Year_Month'] = year_text + '_' + month_text

In [4]:
# Persist the cleaned income / households frame; with the default settings to_csv also
# writes the row index as an unnamed first column.
IandH.to_csv('./data/cleandata/clean_IandH.csv')

## Inflation Data Processing

In [14]:
# Load the 'PCE' sheet, replace its 'Date' column by Year / Month / Year_Month
# and keep only the rows that have a PCE value.
pce = pd.read_excel('./data/underlying-inflation-dashboard-data.xlsx', sheet_name = 'PCE')
pce_dates = pce.pop('Date')
pce['Year'] = [stamp.year for stamp in pce_dates]
pce['Month'] = [stamp.month for stamp in pce_dates]
pce['Year_Month'] = pce['Year'].map(str) + '_' + pce['Month'].map(str)
pce = pce[pce['PCE'].notnull()]

In [68]:
# Persist the cleaned PCE frame; with the default settings to_csv also writes
# the row index as an unnamed first column.
pce.to_csv('./data/cleandata/clean_pce.csv')

## Vacancy Data Processing

In [11]:
# Load the vacancy table, give its four columns their final names and save it.
vacancy_columns = ['Year', 'County', 'State', 'Rental Vacancy Rate']
vacancy = (pd.read_csv('./data/vacancy.csv')
             .drop(columns = 'Unnamed: 0')
             .set_axis(vacancy_columns, axis = 1))
vacancy.to_csv('./data/cleandata/clean_vacancy.csv')

## Unemployment 