In [18]:
import os
import pandas as pd
import numpy as np

## Join Zip and MSA Data

### Load Zip data into DataFrame

In [19]:
# Load the zip-level table: a tab-delimited text file parsed with '\n' as the
# explicit line terminator.
filename = '../../data/external/zip_data.tab'
df = pd.read_csv(
    filename,
    delimiter='\t',
    lineterminator='\n',
)
# Print the column / dtype / non-null summary of what was loaded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 824770 entries, 0 to 824769
Data columns (total 8 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   zipcode  824770 non-null  int64  
 1   year     824770 non-null  int64  
 2   state    824770 non-null  object 
 3   EQI      824770 non-null  float64
 4   SFR      824770 non-null  int64  
 5   growth   824770 non-null  int64  
 6   RECPI    824770 non-null  float64
 7   REAI     703845 non-null  float64
dtypes: float64(3), int64(4), object(1)
memory usage: 50.3+ MB


##### Index zip data by zipcode and year

In [20]:
# Move 'zipcode' and 'year' out of the columns into a two-level index.
# NOTE: this rebinds `df`; re-running the cell on the already-indexed frame
# fails because the key columns are no longer regular columns.
df = df.set_index(keys=['zipcode', 'year'])
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 824770 entries, (1001, 1988) to (99929, 2016)
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   state   824770 non-null  object 
 1   EQI     824770 non-null  float64
 2   SFR     824770 non-null  int64  
 3   growth  824770 non-null  int64  
 4   RECPI   824770 non-null  float64
 5   REAI    703845 non-null  float64
dtypes: float64(3), int64(2), object(1)
memory usage: 42.0+ MB


### Load MSA data into DataFrame

In [21]:
# Load the MSA-level table from the interim CSV (default comma separator).
filename = '../../data/interim/msa-data-with-zipcode.csv'
msa = pd.read_csv(filepath_or_buffer=filename)
# Print the column / dtype / non-null summary of what was loaded.
msa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 688481 entries, 0 to 688480
Data columns (total 10 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   zipcode  688481 non-null  int64  
 1   CBSA     688481 non-null  int64  
 2   MSA      688481 non-null  object 
 3   state    688481 non-null  object 
 4   year     688481 non-null  int64  
 5   growth   688481 non-null  int64  
 6   SFR      688481 non-null  int64  
 7   EQI      688481 non-null  float64
 8   RECPI    688481 non-null  float64
 9   REAI     595961 non-null  float64
dtypes: float64(3), int64(5), object(2)
memory usage: 52.5+ MB


##### Index MSA data by zipcode and year

In [22]:
# Move 'zipcode' and 'year' out of the columns into a two-level index.
# NOTE: this rebinds `msa`; re-running the cell on the already-indexed frame
# fails because the key columns are no longer regular columns.
msa = msa.set_index(keys=['zipcode', 'year'])
msa.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 688481 entries, (1001, 1988) to (57031, 1991)
Data columns (total 8 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   CBSA    688481 non-null  int64  
 1   MSA     688481 non-null  object 
 2   state   688481 non-null  object 
 3   growth  688481 non-null  int64  
 4   SFR     688481 non-null  int64  
 5   EQI     688481 non-null  float64
 6   RECPI   688481 non-null  float64
 7   REAI    595961 non-null  float64
dtypes: float64(3), int64(3), object(2)
memory usage: 44.2+ MB


### Join DataFrames

In [23]:
# Outer-join the zip-level frame with the MSA-level frame on the
# ('zipcode', 'year') key. Column names present in both frames are
# disambiguated with the '_zip' (left) and '_MSA' (right) suffixes.
# NOTE(review): rows without a partner on the other side get NaN, so the
# previously-int64 columns (SFR, growth, CBSA) come out as float64.
# NOTE(review): the recorded output shows more non-null MSA rows than `msa`
# has rows, which suggests some keys are not unique - TODO confirm.
df = df.join(
    msa,
    how='outer',
    on=['zipcode', 'year'],
    lsuffix='_zip',
    rsuffix='_MSA',
)
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 944065 entries, (1001, 1988) to (57031, 1991)
Data columns (total 14 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   state_zip   824770 non-null  object 
 1   EQI_zip     824770 non-null  float64
 2   SFR_zip     824770 non-null  float64
 3   growth_zip  824770 non-null  float64
 4   RECPI_zip   824770 non-null  float64
 5   REAI_zip    703845 non-null  float64
 6   CBSA        688518 non-null  float64
 7   MSA         688518 non-null  object 
 8   state_MSA   688518 non-null  object 
 9   growth_MSA  688518 non-null  float64
 10  SFR_MSA     688518 non-null  float64
 11  EQI_MSA     688518 non-null  float64
 12  RECPI_MSA   688518 non-null  float64
 13  REAI_MSA    595992 non-null  float64
dtypes: float64(11), object(3)
memory usage: 105.6+ MB


### Clean DataFrame

##### Reset Index

In [24]:
# Turn the ('zipcode', 'year') index levels back into ordinary columns and
# fall back to a plain 0..n-1 RangeIndex.
df = df.reset_index(drop=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 944065 entries, 0 to 944064
Data columns (total 16 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   zipcode     944065 non-null  int64  
 1   year        944065 non-null  int64  
 2   state_zip   824770 non-null  object 
 3   EQI_zip     824770 non-null  float64
 4   SFR_zip     824770 non-null  float64
 5   growth_zip  824770 non-null  float64
 6   RECPI_zip   824770 non-null  float64
 7   REAI_zip    703845 non-null  float64
 8   CBSA        688518 non-null  float64
 9   MSA         688518 non-null  object 
 10  state_MSA   688518 non-null  object 
 11  growth_MSA  688518 non-null  float64
 12  SFR_MSA     688518 non-null  float64
 13  EQI_MSA     688518 non-null  float64
 14  RECPI_MSA   688518 non-null  float64
 15  REAI_MSA    595992 non-null  float64
dtypes: float64(11), int64(2), object(3)
memory usage: 115.2+ MB


##### Reorder columns

In [25]:
# Select the columns in presentation order: keys first, then MSA identifiers,
# then the state columns, then the zip-level and MSA-level measures.
df = df[[
    'year', 'zipcode',
    'MSA', 'CBSA',
    'state_zip', 'state_MSA',
    'EQI_zip', 'SFR_zip', 'growth_zip', 'RECPI_zip', 'REAI_zip',
    'EQI_MSA', 'SFR_MSA', 'growth_MSA', 'RECPI_MSA', 'REAI_MSA',
]]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 944065 entries, 0 to 944064
Data columns (total 16 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   year        944065 non-null  int64  
 1   zipcode     944065 non-null  int64  
 2   MSA         688518 non-null  object 
 3   CBSA        688518 non-null  float64
 4   state_zip   824770 non-null  object 
 5   state_MSA   688518 non-null  object 
 6   EQI_zip     824770 non-null  float64
 7   SFR_zip     824770 non-null  float64
 8   growth_zip  824770 non-null  float64
 9   RECPI_zip   824770 non-null  float64
 10  REAI_zip    703845 non-null  float64
 11  EQI_MSA     688518 non-null  float64
 12  SFR_MSA     688518 non-null  float64
 13  growth_MSA  688518 non-null  float64
 14  RECPI_MSA   688518 non-null  float64
 15  REAI_MSA    595992 non-null  float64
dtypes: float64(11), int64(2), object(3)
memory usage: 115.2+ MB


### Output DataFrame to file

In [26]:
# Persist the joined frame as CSV; the RangeIndex is not written because it
# carries no information.
filename = '../../data/interim/1.2-output.csv'
df.to_csv(path_or_buf=filename, index=False)