In [1]:
import os
import pandas as pd
import numpy as np

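## Join Zip and MSA Data

Joins the shaped zip-level data with the MSA-level data on `zipcode` and `year`, merges the two state columns into a single `state` column, and writes the combined table to `../../data/interim/1.2-output.csv`.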

### Load Shaped Zip data into DataFrame

In [2]:
# Read the shaped zip-level table from the interim data directory.
# NOTE(review): zipcode is parsed as int64 (see the info output below), so any
# leading zeros in ZIP codes are not kept — confirm that is what downstream steps expect.
filename = '../../data/interim/shaped_zip.csv'
df = pd.read_csv(filename)
# Print column names, dtypes and non-null counts as a quick check on the load.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1109656 entries, 0 to 1109655
Data columns (total 8 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   year     1109656 non-null  int64  
 1   zipcode  1109656 non-null  int64  
 2   state    824710 non-null   object 
 3   EQI      824710 non-null   float64
 4   SFR      824710 non-null   float64
 5   growth   824710 non-null   float64
 6   RECPI    824710 non-null   float64
 7   REAI     703797 non-null   float64
dtypes: float64(5), int64(2), object(1)
memory usage: 67.7+ MB


##### Reindex zip data by zipcode and year

In [3]:
# Move the join keys out of the columns and into a (zipcode, year) MultiIndex,
# so the zip-level rows can be matched against the MSA table on the same keys.
df = df.set_index(keys=['zipcode', 'year'])
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1109656 entries, (1001, 1988) to (99626, 2016)
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   state   824710 non-null  object 
 1   EQI     824710 non-null  float64
 2   SFR     824710 non-null  float64
 3   growth  824710 non-null  float64
 4   RECPI   824710 non-null  float64
 5   REAI    703797 non-null  float64
dtypes: float64(5), object(1)
memory usage: 56.4+ MB


### Load MSA data into DataFrame

In [4]:
# Read the MSA-level table (already keyed by zipcode) from the interim data directory.
filename = '../../data/interim/msa-data-with-zipcode.csv'
msa = pd.read_csv(filename)
# Print column names, dtypes and non-null counts as a quick check on the load.
msa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 688481 entries, 0 to 688480
Data columns (total 10 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   zipcode  688481 non-null  int64  
 1   CBSA     688481 non-null  int64  
 2   MSA      688481 non-null  object 
 3   state    688481 non-null  object 
 4   year     688481 non-null  int64  
 5   growth   688481 non-null  int64  
 6   SFR      688481 non-null  int64  
 7   EQI      688481 non-null  float64
 8   RECPI    688481 non-null  float64
 9   REAI     595961 non-null  float64
dtypes: float64(3), int64(5), object(2)
memory usage: 52.5+ MB


##### Reindex MSA data by zipcode and year

In [5]:
# Give the MSA table the same (zipcode, year) MultiIndex as the zip-level table,
# so the two can be joined on their shared keys.
msa = msa.set_index(keys=['zipcode', 'year'])
msa.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 688481 entries, (1001, 1988) to (57031, 1991)
Data columns (total 8 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   CBSA    688481 non-null  int64  
 1   MSA     688481 non-null  object 
 2   state   688481 non-null  object 
 3   growth  688481 non-null  int64  
 4   SFR     688481 non-null  int64  
 5   EQI     688481 non-null  float64
 6   RECPI   688481 non-null  float64
 7   REAI    595961 non-null  float64
dtypes: float64(3), int64(3), object(2)
memory usage: 44.2+ MB


### Join DataFrames

In [6]:
# Guard: the join below matches rows against the (zipcode, year) index of `msa`.
# If a key appeared more than once there, every matching zip-level row would be
# repeated in the result without any error, so fail loudly instead.
assert msa.index.is_unique, 'msa has duplicate (zipcode, year) keys; the join would duplicate rows'

# Outer join keeps every (zipcode, year) found in either table. Columns present in
# both tables get a `_zip` or `_MSA` suffix; CBSA and MSA exist only on the MSA
# side and keep their names. Rows with no MSA match get NaN in the MSA columns,
# which is why the integer MSA columns show up as float64 in the info output.
df = df.join(msa, on=['zipcode', 'year'], lsuffix='_zip', rsuffix='_MSA', how='outer')
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1109656 entries, (1001, 1988) to (99626, 2016)
Data columns (total 14 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   state_zip   824710 non-null  object 
 1   EQI_zip     824710 non-null  float64
 2   SFR_zip     824710 non-null  float64
 3   growth_zip  824710 non-null  float64
 4   RECPI_zip   824710 non-null  float64
 5   REAI_zip    703797 non-null  float64
 6   CBSA        688481 non-null  float64
 7   MSA         688481 non-null  object 
 8   state_MSA   688481 non-null  object 
 9   growth_MSA  688481 non-null  float64
 10  SFR_MSA     688481 non-null  float64
 11  EQI_MSA     688481 non-null  float64
 12  RECPI_MSA   688481 non-null  float64
 13  REAI_MSA    595961 non-null  float64
dtypes: float64(11), object(3)
memory usage: 124.1+ MB


### Clean DataFrame

##### Reset Index

In [7]:
df = df.reset_index()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1109656 entries, 0 to 1109655
Data columns (total 16 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   zipcode     1109656 non-null  int64  
 1   year        1109656 non-null  int64  
 2   state_zip   824710 non-null   object 
 3   EQI_zip     824710 non-null   float64
 4   SFR_zip     824710 non-null   float64
 5   growth_zip  824710 non-null   float64
 6   RECPI_zip   824710 non-null   float64
 7   REAI_zip    703797 non-null   float64
 8   CBSA        688481 non-null   float64
 9   MSA         688481 non-null   object 
 10  state_MSA   688481 non-null   object 
 11  growth_MSA  688481 non-null   float64
 12  SFR_MSA     688481 non-null   float64
 13  EQI_MSA     688481 non-null   float64
 14  RECPI_MSA   688481 non-null   float64
 15  REAI_MSA    595961 non-null   float64
dtypes: float64(11), int64(2), object(3)
memory usage: 135.5+ MB


##### Extract state columns and combine (zip-level state first, MSA state as fallback)

In [8]:
# One-column frame holding the zip-side state, relabelled to the common name
# 'state' so it lines up column-wise with the MSA-side frame when combined.
state_zip = df[['state_zip']].rename(columns={'state_zip': 'state'})

In [9]:
# One-column frame holding the MSA-side state, relabelled to the common name
# 'state' so it lines up column-wise with the zip-side frame when combined.
state_MSA = df[['state_MSA']].rename(columns={'state_MSA': 'state'})

In [10]:
# combine_first keeps the zip-side value wherever it is present and falls back to
# the MSA-side value only where the zip-side value is missing. Rows missing on
# both sides stay null.
state = state_zip.combine_first(state_MSA)
state.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1109656 entries, 0 to 1109655
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   state   944005 non-null  object
dtypes: object(1)
memory usage: 8.5+ MB


##### Reinsert state column

In [11]:
# Index-aligned join: `state` has a single column named 'state', which is added
# to df as a new column.
df = df.join(state)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1109656 entries, 0 to 1109655
Data columns (total 17 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   zipcode     1109656 non-null  int64  
 1   year        1109656 non-null  int64  
 2   state_zip   824710 non-null   object 
 3   EQI_zip     824710 non-null   float64
 4   SFR_zip     824710 non-null   float64
 5   growth_zip  824710 non-null   float64
 6   RECPI_zip   824710 non-null   float64
 7   REAI_zip    703797 non-null   float64
 8   CBSA        688481 non-null   float64
 9   MSA         688481 non-null   object 
 10  state_MSA   688481 non-null   object 
 11  growth_MSA  688481 non-null   float64
 12  SFR_MSA     688481 non-null   float64
 13  EQI_MSA     688481 non-null   float64
 14  RECPI_MSA   688481 non-null   float64
 15  REAI_MSA    595961 non-null   float64
 16  state       944005 non-null   object 
dtypes: float64(11), int64(2), object(4)
memory usage: 143.9+ MB


##### Drop unnecessary columns

In [12]:
# Remove the two per-source state columns named below.
df = df.drop(columns=['state_zip', 'state_MSA'])

##### Reorder columns

In [13]:
# Final column layout: keys and identifiers first, then the zip-level measures,
# then the MSA-level measures in the same order.
df = df[[
    'year', 'zipcode', 'MSA', 'CBSA', 'state',
    'EQI_zip', 'SFR_zip', 'growth_zip', 'RECPI_zip', 'REAI_zip',
    'EQI_MSA', 'SFR_MSA', 'growth_MSA', 'RECPI_MSA', 'REAI_MSA',
]]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1109656 entries, 0 to 1109655
Data columns (total 15 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   year        1109656 non-null  int64  
 1   zipcode     1109656 non-null  int64  
 2   MSA         688481 non-null   object 
 3   CBSA        688481 non-null   float64
 4   state       944005 non-null   object 
 5   EQI_zip     824710 non-null   float64
 6   SFR_zip     824710 non-null   float64
 7   growth_zip  824710 non-null   float64
 8   RECPI_zip   824710 non-null   float64
 9   REAI_zip    703797 non-null   float64
 10  EQI_MSA     688481 non-null   float64
 11  SFR_MSA     688481 non-null   float64
 12  growth_MSA  688481 non-null   float64
 13  RECPI_MSA   688481 non-null   float64
 14  REAI_MSA    595961 non-null   float64
dtypes: float64(11), int64(2), object(2)
memory usage: 127.0+ MB


### Output DataFrame to file

In [14]:
# Write the joined table to the interim data directory. index=False leaves the
# RangeIndex out of the file, so only the data columns are written.
filename = '../../data/interim/1.2-output.csv'
df.to_csv(filename, index=False)