## Data Wrangling - Zipcode Aggregation of All Datas
from FRED public data and Zillow public data

In [1]:
#Import pandas, matplotlib.pyplot, and seaborn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np

In [2]:
#change directory to get data
path= '/Users/josephfrasca/Coding_Stuff/Springboard/Capstone_2/data/interim'
os.chdir(path)

In [3]:
os.listdir()

['.gitkeep',
 '.ipynb_checkpoints',
 'Annual_fredData_2011_2019',
 'df3_1956',
 'df_homePrices_2011_2018',
 'df_homePrices_2011_2020',
 'df_homePrices_2011_2020.csv',
 'df_rentPrices_2014_2020',
 'df_rentPrices_2014_2020.csv',
 'VacancyRate_Zipcode_2011_2018',
 'VacancyRate_Zipcode_2011_2018.csv']

In [4]:
#load rental data
df_fred = pd.read_csv('Annual_fredData_2011_2019')
df_homes = pd.read_csv("df_homePrices_2011_2020.csv", converters={'RegionName': lambda x: str(x)})
df_rents = pd.read_csv('df_rentPrices_2014_2020.csv', dtype={'RegionID': object, 'RegionName': object})
df_vacancy = pd.read_csv('VacancyRate_Zipcode_2011_2018.csv', dtype={'Zipcode': object})

### Data Definition

In [5]:
df_vacancy

Unnamed: 0,Zipcode,Vacancy_Rate%,MOE-VacancyRate%,Year
0,02333,3.024027,2.199925,2011
1,02338,3.116343,2.948791,2011
2,02339,4.464646,2.066438,2011
3,02341,3.586322,2.340722,2011
4,02343,3.732901,2.926524,2011
...,...,...,...,...
264955,98279,51.219512,10.993457,2018
264956,98280,51.329243,12.777549,2018
264957,98311,6.540162,1.960476,2018
264958,98326,28.537736,14.679524,2018


In [6]:
df_homes

Unnamed: 0,SizeRank,RegionName,State,City,Metro,CountyName,HomePrice,Year
0,0,10025,NY,New York,New York-Newark-Jersey City,New York County,784477.42,2011
1,1,60657,IL,Chicago,Chicago-Naperville-Elgin,Cook County,400695.83,2011
2,2,10023,NY,New York,New York-Newark-Jersey City,New York County,821072.58,2011
3,3,77494,TX,Katy,Houston-The Woodlands-Sugar Land,Harris County,265425.17,2011
4,4,60614,IL,Chicago,Chicago-Naperville-Elgin,Cook County,507804.42,2011
...,...,...,...,...,...,...,...,...
303665,34430,43523,OH,Malinta,,Henry County,84331.11,2020
303666,34430,7703,NJ,Eatontown,New York-Newark-Jersey City,Monmouth County,579932.56,2020
303667,34430,47865,IN,Carlisle,Terre Haute,Sullivan County,42896.56,2020
303668,35187,822,LA,Choudrant,Ruston,Lincoln Parish,195888.67,2020


### Data Cleaning 1 - HomePrices

In [7]:
#add leading zeros to zipcode in home dataframe
df_homes['RegionName'] = df_homes['RegionName'].apply(lambda x: '{0:0>5}'.format(x))

In [34]:
df_homes

Unnamed: 0,SizeRank,Zipcode,State,City,Metro,CountyName,HomePrice,Year
0,0,10025,NY,New York,New York-Newark-Jersey City,New York County,784477.42,2011
1,1,60657,IL,Chicago,Chicago-Naperville-Elgin,Cook County,400695.83,2011
2,2,10023,NY,New York,New York-Newark-Jersey City,New York County,821072.58,2011
3,3,77494,TX,Katy,Houston-The Woodlands-Sugar Land,Harris County,265425.17,2011
4,4,60614,IL,Chicago,Chicago-Naperville-Elgin,Cook County,507804.42,2011
...,...,...,...,...,...,...,...,...
303665,34430,43523,OH,Malinta,,Henry County,84331.11,2020
303666,34430,07703,NJ,Eatontown,New York-Newark-Jersey City,Monmouth County,579932.56,2020
303667,34430,47865,IN,Carlisle,Terre Haute,Sullivan County,42896.56,2020
303668,35187,00822,LA,Choudrant,Ruston,Lincoln Parish,195888.67,2020


In [9]:
#rename Zipcode column
df_homes = df_homes.rename(columns = {'RegionName':'Zipcode'})
#check for NaNs
df_homes.isna().sum()

SizeRank          0
Zipcode           0
State             0
City              0
Metro         71180
CountyName        0
HomePrice     11066
Year              0
dtype: int64

In [41]:
#subset df_homes data for year 2011-2018 (for merging with FRED data)
df_homes_2011_2018 = df_homes.loc[df_homes['Year'] < 2019]

In [42]:
#subset df_homes data for year 2014-2020 (for merging with rent data)
df_homes_2014_2020 = df_homes.loc[df_homes['Year'] > 2013]

In [48]:
df_homes_2014_2020[df_homes_2014_2020.Year == 2014]

Unnamed: 0,SizeRank,Zipcode,State,City,Metro,CountyName,HomePrice,Year
91101,0,10025,NY,New York,New York-Newark-Jersey City,New York County,968761.75,2014
91102,1,60657,IL,Chicago,Chicago-Naperville-Elgin,Cook County,450755.75,2014
91103,2,10023,NY,New York,New York-Newark-Jersey City,New York County,1024543.17,2014
91104,3,77494,TX,Katy,Houston-The Woodlands-Sugar Land,Harris County,322032.00,2014
91105,4,60614,IL,Chicago,Chicago-Naperville-Elgin,Cook County,580250.92,2014
...,...,...,...,...,...,...,...,...
121463,34430,43523,OH,Malinta,,Henry County,67959.33,2014
121464,34430,07703,NJ,Eatontown,New York-Newark-Jersey City,Monmouth County,,2014
121465,34430,47865,IN,Carlisle,Terre Haute,Sullivan County,,2014
121466,35187,00822,LA,Choudrant,Ruston,Lincoln Parish,140783.33,2014


### Data Cleaning 2 - RentPrices

In [47]:
df_rents[df_rents.Year == 2014]

Unnamed: 0,Zipcode,RentPrice,Year
0,10025,3041.83,2014
1,60657,1589.42,2014
2,10023,3186.67,2014
3,77494,1807.33,2014
4,60614,1786.25,2014
...,...,...,...
3238,02110,4250.33,2014
3239,20004,2289.80,2014
3240,80951,1254.36,2014
3241,11964,,2014


In [12]:
#add leading zeros to zipcode in rent dataframe
df_rents['RegionName'] = df_rents['RegionName'].apply(lambda x: '{0:0>5}'.format(x))

In [13]:
df_rents

Unnamed: 0,RegionName,RentPrice,Year
0,10025,3041.83,2014
1,60657,1589.42,2014
2,10023,3186.67,2014
3,77494,1807.33,2014
4,60614,1786.25,2014
...,...,...,...
22696,02110,4408.57,2020
22697,20004,2505.56,2020
22698,80951,1647.88,2020
22699,11964,15800.50,2020


In [14]:
#rename Zipcode column
df_rents = df_rents.rename(columns={'RegionName': 'Zipcode'})

In [15]:
#check for NaNs
df_rents.isna().sum()

Zipcode      0
RentPrice    3
Year         0
dtype: int64

In [16]:
#replace home price NaNs with first:
    #diff = (price of year you have) - (mean of a year you have) 
    #then replace NaN with 'diff' + (mean of NaN year)

### Data Cleaning 3 - FRED Economic Data

In [17]:
df_fred.head()

Unnamed: 0,DATE,int_rate,med_hIncome,uspop_growth,unemplt_rate,newHouse_starts,resConstruct_spending
0,2011-01-01,0.75,57021.0,0.720018,8.933333,611.916667,255208.583333
1,2012-01-01,0.75,56912.0,0.727269,8.075,783.75,278995.583333
2,2013-01-01,0.75,58904.0,0.686773,7.358333,928.166667,335207.333333
3,2014-01-01,0.75,58001.0,0.727518,6.158333,1000.25,382868.333333
4,2015-01-01,0.770833,60987.0,0.730641,5.275,1106.75,438118.333333


In [18]:
#change year to match df_vacancy format
df_fred.DATE = df_fred.DATE.str.replace('-01-01','')

In [19]:
#change DATE column name to Year
df_fred = df_fred.rename(columns = {'DATE':'Year'})

In [20]:
#subset FRED data to 2011-2018 to prep for merge
df_fred_2011_2018 = df_fred.iloc[0:8]
df_fred_2011_2018 = df_fred_2011_2018.astype({'Year': 'int32'})

In [40]:
df_fred_2011_2018.dtypes

Year                       int32
int_rate                 float64
med_hIncome              float64
uspop_growth             float64
unemplt_rate             float64
newHouse_starts          float64
resConstruct_spending    float64
dtype: object

In [22]:
df_vacancy.dtypes

Zipcode              object
Vacancy_Rate%       float64
MOE-VacancyRate%    float64
Year                  int64
dtype: object

### Data Joining

In [46]:
#merge rent prices and home prices (2014-2020)
df_rent_merged_homes = pd.merge(df_homes_2014_2020, df_rents, how='left', on=['Year', 'Zipcode'])
df_rent_merged_homes

Unnamed: 0,SizeRank,Zipcode,State,City,Metro,CountyName,HomePrice,Year,RentPrice
0,0,10025,NY,New York,New York-Newark-Jersey City,New York County,968761.75,2014,3041.83
1,1,60657,IL,Chicago,Chicago-Naperville-Elgin,Cook County,450755.75,2014,1589.42
2,2,10023,NY,New York,New York-Newark-Jersey City,New York County,1024543.17,2014,3186.67
3,3,77494,TX,Katy,Houston-The Woodlands-Sugar Land,Harris County,322032.00,2014,1807.33
4,4,60614,IL,Chicago,Chicago-Naperville-Elgin,Cook County,580250.92,2014,1786.25
...,...,...,...,...,...,...,...,...,...
212564,34430,43523,OH,Malinta,,Henry County,84331.11,2020,
212565,34430,07703,NJ,Eatontown,New York-Newark-Jersey City,Monmouth County,579932.56,2020,
212566,34430,47865,IN,Carlisle,Terre Haute,Sullivan County,42896.56,2020,
212567,35187,00822,LA,Choudrant,Ruston,Lincoln Parish,195888.67,2020,


In [24]:
#merge df_vacancy + FRED data 2011-2018
df_vacany_merged_fred = pd.merge(df_vacancy, df_fred_2011_2018, how='left', on='Year')
df_vacany_merged_fred

Unnamed: 0,Zipcode,Vacancy_Rate%,MOE-VacancyRate%,Year,int_rate,med_hIncome,uspop_growth,unemplt_rate,newHouse_starts,resConstruct_spending
0,02333,3.024027,2.199925,2011,0.750000,57021.0,0.720018,8.933333,611.916667,255208.583333
1,02338,3.116343,2.948791,2011,0.750000,57021.0,0.720018,8.933333,611.916667,255208.583333
2,02339,4.464646,2.066438,2011,0.750000,57021.0,0.720018,8.933333,611.916667,255208.583333
3,02341,3.586322,2.340722,2011,0.750000,57021.0,0.720018,8.933333,611.916667,255208.583333
4,02343,3.732901,2.926524,2011,0.750000,57021.0,0.720018,8.933333,611.916667,255208.583333
...,...,...,...,...,...,...,...,...,...,...
264955,98279,51.219512,10.993457,2018,2.458333,64324.0,0.522337,3.891667,1248.250000,564448.750000
264956,98280,51.329243,12.777549,2018,2.458333,64324.0,0.522337,3.891667,1248.250000,564448.750000
264957,98311,6.540162,1.960476,2018,2.458333,64324.0,0.522337,3.891667,1248.250000,564448.750000
264958,98326,28.537736,14.679524,2018,2.458333,64324.0,0.522337,3.891667,1248.250000,564448.750000


In [25]:
df_vacany_merged_fred_homes = pd.merge(df_homes_2011_2018, df_vacany_merged_fred, how='left', on=['Year', 'Zipcode'])
df_vacany_merged_fred_homes

Unnamed: 0,SizeRank,Zipcode,State,City,Metro,CountyName,HomePrice,Year,Vacancy_Rate%,MOE-VacancyRate%,int_rate,med_hIncome,uspop_growth,unemplt_rate,newHouse_starts,resConstruct_spending
0,0,10025,NY,New York,New York-Newark-Jersey City,New York County,784477.42,2011,8.950188,1.326307,0.750000,57021.0,0.720018,8.933333,611.916667,255208.583333
1,1,60657,IL,Chicago,Chicago-Naperville-Elgin,Cook County,400695.83,2011,8.188111,1.002378,0.750000,57021.0,0.720018,8.933333,611.916667,255208.583333
2,2,10023,NY,New York,New York-Newark-Jersey City,New York County,821072.58,2011,18.388871,2.067466,0.750000,57021.0,0.720018,8.933333,611.916667,255208.583333
3,3,77494,TX,Katy,Houston-The Woodlands-Sugar Land,Harris County,265425.17,2011,6.140681,1.962907,0.750000,57021.0,0.720018,8.933333,611.916667,255208.583333
4,4,60614,IL,Chicago,Chicago-Naperville-Elgin,Cook County,507804.42,2011,9.950141,1.562555,0.750000,57021.0,0.720018,8.933333,611.916667,255208.583333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242931,34430,43523,OH,Malinta,,Henry County,81699.58,2018,0.000000,0.000000,2.458333,64324.0,0.522337,3.891667,1248.250000,564448.750000
242932,34430,07703,NJ,Eatontown,New York-Newark-Jersey City,Monmouth County,543333.30,2018,91.457286,60.918520,2.458333,64324.0,0.522337,3.891667,1248.250000,564448.750000
242933,34430,47865,IN,Carlisle,Terre Haute,Sullivan County,40215.62,2018,17.924528,17.924528,2.458333,64324.0,0.522337,3.891667,1248.250000,564448.750000
242934,35187,00822,LA,Choudrant,Ruston,Lincoln Parish,178884.17,2018,,,,,,,,


In [26]:
df_vacany_merged_fred.isna().sum()

Zipcode                  0
Vacancy_Rate%            0
MOE-VacancyRate%         0
Year                     0
int_rate                 0
med_hIncome              0
uspop_growth             0
unemplt_rate             0
newHouse_starts          0
resConstruct_spending    0
dtype: int64

In [27]:
df_vacany_merged_fred_homes.isna().sum()

SizeRank                     0
Zipcode                      0
State                        0
City                         0
Metro                    56944
CountyName                   0
HomePrice                11066
Year                         0
Vacancy_Rate%             5032
MOE-VacancyRate%          5032
int_rate                  5032
med_hIncome               5032
uspop_growth              5032
unemplt_rate              5032
newHouse_starts           5032
resConstruct_spending     5032
dtype: int64

In [28]:
#replace home price NaNs with first:
    #diff = (price of year you have) - (mean of a year you have) 
#then with NaN do 'diff' + (mean of NaN year)

### Save Data

### Notes


### Summary