In [95]:
import pandas as pd
import numpy as np
from us_state_abbrev import us_state_abbrev
import glob

In [96]:
# Directory holding one CSV export per survey year.
path = r"./data/vacancydata"

# glob returns matches in arbitrary, OS-dependent order, so sort before
# pairing files with year labels; otherwise a file can get the wrong year.
# NOTE(review): assumes the file names sort chronologically (e.g. the year is
# embedded in the name) — confirm against the contents of the data directory.
files = sorted(glob.glob(path + "/*.csv"))
years = ['2013','2014','2015','2016','2017','2018','2019']

# zip() would silently drop unmatched files or years; fail loudly instead.
if len(files) != len(years):
    raise ValueError(
        f"Expected {len(years)} CSV files in {path!r}, found {len(files)}"
    )

li = []
for f, y in zip(files, years):
    df = pd.read_csv(f, index_col=None, header=0)
    df = df.loc[1:]  # drop the first data row, which holds the column descriptions
    df['Year'] = y   # kept as a string here; converted to int further down
    li.append(df)

# Stack all years into one frame and work on a copy so `frame` stays untouched.
frame = pd.concat(li, axis=0, ignore_index=True)
framec = frame.copy()

In [97]:
# Replace the coded column labels used downstream with readable names.
framec = framec.rename(
    columns={
        'DP04_0001E': 'Total Housing Units',
        'DP04_0002E': 'Occupied Units',
        'DP04_0003E': 'Vacant Units',
        'DP04_0004E': 'Homeowner Vacancy Rate',
        'DP04_0005E': 'Rental Vacancy Rate',
        'NAME': 'County',
    }
)

In [98]:
# Display the column labels to confirm the renamed columns are present.
framec.columns

Index(['GEO_ID', 'County', 'Total Housing Units', 'DP04_0001M', 'DP04_0001PE',
       'DP04_0001PM', 'Occupied Units', 'DP04_0002M', 'DP04_0002PE',
       'DP04_0002PM',
       ...
       'DP04_0141PM', 'Year', 'DP04_0142E', 'DP04_0142M', 'DP04_0142PE',
       'DP04_0142PM', 'DP04_0143E', 'DP04_0143M', 'DP04_0143PE',
       'DP04_0143PM'],
      dtype='object', length=575)

In [99]:
# Narrow the wide frame down to the handful of columns this analysis uses.
vacantdf = framec.loc[
    :,
    [
        'Year',
        'County',
        'Total Housing Units',
        'Occupied Units',
        'Vacant Units',
        'Rental Vacancy Rate',
        'Homeowner Vacancy Rate',
    ],
]

## Fixing the 'County' format

In [100]:
def _short_county_name(full_name):
    """Reduce a '<county>, <state>' label to the bare county name.

    Cuts the label just before the first 'County' (or, if that is absent,
    'Municipio') together with the single character preceding it, then drops
    everything from the first remaining comma onward. Labels containing
    neither word are only cut at the comma.
    """
    for suffix in ('County', 'Municipio'):
        if suffix in full_name:
            full_name = full_name[:full_name.find(suffix) - 1]
            break
    comma_at = full_name.find(',')
    return full_name if comma_at == -1 else full_name[:comma_at]


# Split "<county>, <state>" once at the first comma. `n` must be passed by
# keyword: pandas 2.x made it keyword-only, so the positional form raises.
name_parts = vacantdf['County'].str.split(',', n=1, expand=True)

# The state is the text after the comma; .str.strip() tolerates missing values.
# Names absent from us_state_abbrev become NaN after the mapping.
vacantdf['state'] = name_parts[1].str.strip().map(us_state_abbrev)

# Overwrite the full label with the bare county name (state already extracted).
vacantdf['County'] = vacantdf['County'].map(_short_county_name)

In [101]:
# Summarize non-null counts and dtypes after the County/state cleanup.
vacantdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5832 entries, 0 to 5831
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Year                    5832 non-null   object
 1   County                  5832 non-null   object
 2   Total Housing Units     5832 non-null   object
 3   Occupied Units          5832 non-null   object
 4   Vacant Units            5832 non-null   object
 5   Rental Vacancy Rate     5832 non-null   object
 6   Homeowner Vacancy Rate  5832 non-null   object
 7   state                   5825 non-null   object
dtypes: object(8)
memory usage: 364.6+ KB


### Rental Vacancy Rate small sample size issue

For some counties the rental vacancy rate is reported as `'N'`, which indicates
that data for the geographic area cannot be displayed because the number of sample cases is too small.
I decided to drop those rows *(21 rows dropped)*.

In [102]:
# Keep only the rows whose rental vacancy rate is not the 'N' placeholder.
vacantdf = vacantdf.loc[vacantdf['Rental Vacancy Rate'].ne('N')]

### Converting columns to appropriate dtypes

In [103]:
# Cast every numeric column in a single pass. astype returns a new frame with
# the requested dtypes on any pandas version, whereas assigning through
# `df.loc[:, col] = ...` writes into the existing object column under
# pandas 2.x and leaves the dtype as object.
vacantdf = vacantdf.astype({
    'Year': 'int64',
    'Total Housing Units': 'int64',
    'Occupied Units': 'int64',
    'Vacant Units': 'int64',
    'Rental Vacancy Rate': 'float64',
    'Homeowner Vacancy Rate': 'float64',
})

In [104]:
# Re-check the dtypes to confirm the numeric conversion took effect.
vacantdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5811 entries, 0 to 5831
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Year                    5811 non-null   int64  
 1   County                  5811 non-null   object 
 2   Total Housing Units     5811 non-null   int64  
 3   Occupied Units          5811 non-null   int64  
 4   Vacant Units            5811 non-null   int64  
 5   Rental Vacancy Rate     5811 non-null   float64
 6   Homeowner Vacancy Rate  5811 non-null   float64
 7   state                   5804 non-null   object 
dtypes: float64(2), int64(4), object(2)
memory usage: 408.6+ KB


In [107]:
# Keep just the identifying columns plus the rental vacancy rate for export.
new_vacant = vacantdf.loc[
    :,
    ['Year', 'County', 'state', 'Rental Vacancy Rate'],
]

In [110]:
# Write the trimmed table to disk; the row index is included (to_csv default).
new_vacant.to_csv(path_or_buf='./data/vacancy.csv')