## Data Wrangling FRED Data for Zipcode
from FRED public data

In [1]:
#Import pandas, matplotlib.pyplot, and seaborn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np

In [2]:
#change directory to get data
#the raw-data folder can be overridden through the FRED_RAW_DIR environment variable,
#so the notebook is not tied to one user's machine; without it the original path is used
path = os.environ.get('FRED_RAW_DIR', '/Users/josephfrasca/Coding_Stuff/Springboard/Capstone_2/data/raw')
#later cells load files by bare name, so the working directory must be the raw-data folder
os.chdir(path)

In [3]:
#read the national FRED economic series from the CSV in the current working directory
#missing observations are stored as '.', so every column comes in as text (object dtype)
fred_file = 'NationalfredgraphForZipcodes.csv'
df_fred = pd.read_csv(fred_file)

In [4]:
#preview the raw table exactly as loaded (one row per year, '.' where a series has no value)
df_fred

Unnamed: 0,DATE,INTDSRUSM193N,MEHOINUSA672N,SPPOPGROWUSA,UNRATE,HOUST,TLRESCONS
0,1948-01-01,.,.,.,3.7500000000000000,.,.
1,1949-01-01,.,.,.,6.0500000000000000,.,.
2,1950-01-01,1.5908333333333333,.,.,5.2083333333333333,.,.
3,1951-01-01,1.7500000000000000,.,.,3.2833333333333333,.,.
4,1952-01-01,1.7500000000000000,.,.,3.0250000000000000,.,.
...,...,...,...,...,...,...,...
68,2016-01-01,1.02083333333333333333,62898,0.716669413429853,4.8750000000000000,1176.5833333333333333,485822.500000000000
69,2017-01-01,1.6250000000000000,63761,0.631007893230758,4.3416666666666667,1207.4166666666666667,546020.166666666667
70,2018-01-01,2.4583333333333333,64324,0.522337357899676,3.8916666666666667,1248.2500000000000000,564448.750000000000
71,2019-01-01,2.7500000000000000,68703,0.473953539373292,3.6666666666666667,1295.2500000000000000,550386.500000000000


### Data Definition

In [5]:
#list each column with its non-null count and dtype
df_fred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73 entries, 0 to 72
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   DATE           73 non-null     object
 1   INTDSRUSM193N  73 non-null     object
 2   MEHOINUSA672N  73 non-null     object
 3   SPPOPGROWUSA   73 non-null     object
 4   UNRATE         73 non-null     object
 5   HOUST          73 non-null     object
 6   TLRESCONS      73 non-null     object
dtypes: object(7)
memory usage: 4.1+ KB


### Data Cleaning

In [6]:
#keep the rows dated 2011 onward and renumber them from 0
#DATE holds ISO 'YYYY-MM-DD' text, so a plain string comparison follows calendar order
df_fred_2011_2019 = (
    df_fred
    .loc[df_fred['DATE'] > '2011-01']
    .reset_index(drop=True)
)

In [7]:
#drop empty 2020 row
#select by date instead of the hard-coded index label 9: the cell can then be re-run
#without a KeyError and no longer depends on how many rows the previous filter kept
#DATE is ISO 'YYYY-MM-DD' text, so the string comparison follows calendar order
df_fred_2011_2019 = df_fred_2011_2019[df_fred_2011_2019['DATE'] < '2020-01-01']

In [8]:
#check the column dtypes ahead of the numeric conversion
df_fred_2011_2019.dtypes

DATE             object
INTDSRUSM193N    object
MEHOINUSA672N    object
SPPOPGROWUSA     object
UNRATE           object
HOUST            object
TLRESCONS        object
dtype: object

In [9]:
#set the DATE column aside as text and cast every economic series to float
date = df_fred_2011_2019['DATE']
floats = df_fred_2011_2019.drop(columns='DATE').astype('float')
#show the resulting dtypes to confirm the cast
floats.dtypes

INTDSRUSM193N    float64
MEHOINUSA672N    float64
SPPOPGROWUSA     float64
UNRATE           float64
HOUST            float64
TLRESCONS        float64
dtype: object

In [10]:
#put DATE back in front of the float columns; both pieces share the same row index
df = pd.concat([date, floats], axis='columns')
df

Unnamed: 0,DATE,INTDSRUSM193N,MEHOINUSA672N,SPPOPGROWUSA,UNRATE,HOUST,TLRESCONS
0,2011-01-01,0.75,57021.0,0.720018,8.933333,611.916667,255208.583333
1,2012-01-01,0.75,56912.0,0.727269,8.075,783.75,278995.583333
2,2013-01-01,0.75,58904.0,0.686773,7.358333,928.166667,335207.333333
3,2014-01-01,0.75,58001.0,0.727518,6.158333,1000.25,382868.333333
4,2015-01-01,0.770833,60987.0,0.730641,5.275,1106.75,438118.333333
5,2016-01-01,1.020833,62898.0,0.716669,4.875,1176.583333,485822.5
6,2017-01-01,1.625,63761.0,0.631008,4.341667,1207.416667,546020.166667
7,2018-01-01,2.458333,64324.0,0.522337,3.891667,1248.25,564448.75
8,2019-01-01,2.75,68703.0,0.473954,3.666667,1295.25,550386.5


In [11]:
#count the distinct values in each column
df.nunique()

DATE             9
INTDSRUSM193N    6
MEHOINUSA672N    9
SPPOPGROWUSA     9
UNRATE           9
HOUST            9
TLRESCONS        9
dtype: int64

In [12]:
#replace the FRED series codes with readable column names
readable_names = {
    'INTDSRUSM193N': 'int_rate',
    'MEHOINUSA672N': 'med_hIncome',
    'SPPOPGROWUSA': 'uspop_growth',
    'UNRATE': 'unemplt_rate',
    'HOUST': 'newHouse_starts',
    'TLRESCONS': 'resConstruct_spending',
}
df = df.rename(columns=readable_names)
df

Unnamed: 0,DATE,int_rate,med_hIncome,uspop_growth,unemplt_rate,newHouse_starts,resConstruct_spending
0,2011-01-01,0.75,57021.0,0.720018,8.933333,611.916667,255208.583333
1,2012-01-01,0.75,56912.0,0.727269,8.075,783.75,278995.583333
2,2013-01-01,0.75,58904.0,0.686773,7.358333,928.166667,335207.333333
3,2014-01-01,0.75,58001.0,0.727518,6.158333,1000.25,382868.333333
4,2015-01-01,0.770833,60987.0,0.730641,5.275,1106.75,438118.333333
5,2016-01-01,1.020833,62898.0,0.716669,4.875,1176.583333,485822.5
6,2017-01-01,1.625,63761.0,0.631008,4.341667,1207.416667,546020.166667
7,2018-01-01,2.458333,64324.0,0.522337,3.891667,1248.25,564448.75
8,2019-01-01,2.75,68703.0,0.473954,3.666667,1295.25,550386.5


### Save Data

In [13]:
#write the cleaned annual table to the interim folder without the row index
# NOTE(review): the target name has no .csv extension — confirm downstream readers expect that
interim_file = r'/Users/josephfrasca/Coding_Stuff/Springboard/Capstone_2/data/interim/Annual_fredData_2011_2019'
df.to_csv(interim_file, index=False)

### Notes


### Summary

The original plan here was to use this national econometric data to help predict vacancy rate by zipcode.
- inspected and cleaned FRED national annual economic data
- filtered for data from 2011 onward and reset index
- dropped the empty 2020 row
- changed dtypes to floats and renamed columns