In [1]:
import os
import pandas as pd
import numpy as np

## Merge zipcode-to-cbsa.csv with MSA Data

### Load MSA data into DataFrame

In [2]:
# Read the tab-separated MSA table from the external data directory.
# The line terminator is pinned to '\n' explicitly rather than left to
# the parser's auto-detection.
filename = '../../data/external/msa_data.tab'
msa = pd.read_csv(
    filename,
    delimiter='\t',
    lineterminator='\n',
)
# Summarise columns, dtypes and non-null counts.
msa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29661 entries, 0 to 29660
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   MSA     28235 non-null  object 
 1   CBSA    29661 non-null  int64  
 2   state   29661 non-null  object 
 3   year    29661 non-null  int64  
 4   growth  29661 non-null  int64  
 5   SFR     29661 non-null  int64  
 6   EQI     29661 non-null  float64
 7   RECPI   29661 non-null  float64
 8   REAI    25707 non-null  float64
dtypes: float64(3), int64(4), object(2)
memory usage: 2.0+ MB


### Load zipcode-to-cbsa.csv into DataFrame

In [4]:
# Read the zipcode -> CBSA crosswalk (comma-separated) from the interim
# data directory.
filename = '../../data/interim/zipcode-to-cbsa.csv'
z2c = pd.read_csv(filename, sep=',')
# Summarise columns, dtypes and non-null counts.
z2c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28236 entries, 0 to 28235
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   zipcode  28236 non-null  int64
 1   CBSA     28236 non-null  int64
dtypes: int64(2)
memory usage: 441.3 KB


### Merge DataFrames

In [5]:
# Inner-join the crosswalk to the MSA table on the shared CBSA code.
# Every crosswalk row is paired with every MSA row carrying the same CBSA,
# so the result has far more rows than either input; rows whose CBSA is
# absent from the other frame are dropped.
merged = z2c.merge(msa, on='CBSA', how='inner')
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1007659 entries, 0 to 1007658
Data columns (total 10 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   zipcode  1007659 non-null  int64  
 1   CBSA     1007659 non-null  int64  
 2   MSA      1007659 non-null  object 
 3   state    1007659 non-null  object 
 4   year     1007659 non-null  int64  
 5   growth   1007659 non-null  int64  
 6   SFR      1007659 non-null  int64  
 7   EQI      1007659 non-null  float64
 8   RECPI    1007659 non-null  float64
 9   REAI     872604 non-null   float64
dtypes: float64(3), int64(5), object(2)
memory usage: 84.6+ MB


#### Drop duplicate (zipcode, year) rows

In [6]:
# Keep a single row per (zipcode, year) pair; the first occurrence wins.
# NOTE(review): which row survives depends on the row order produced by
# the merge -- confirm that keeping the first match is the intended rule.
merged = merged.drop_duplicates(
    subset=['zipcode', 'year'],
    keep='first',
)
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 688481 entries, 0 to 1007635
Data columns (total 10 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   zipcode  688481 non-null  int64  
 1   CBSA     688481 non-null  int64  
 2   MSA      688481 non-null  object 
 3   state    688481 non-null  object 
 4   year     688481 non-null  int64  
 5   growth   688481 non-null  int64  
 6   SFR      688481 non-null  int64  
 7   EQI      688481 non-null  float64
 8   RECPI    688481 non-null  float64
 9   REAI     595961 non-null  float64
dtypes: float64(3), int64(5), object(2)
memory usage: 57.8+ MB


In [7]:
# Number of distinct zipcodes remaining after de-duplication.
merged['zipcode'].nunique()

23872

### Output merged DataFrame to file

In [8]:
# Persist the de-duplicated merge result as CSV in the interim data
# directory; the DataFrame index is not written.
filename = '../../data/interim/msa-data-with-zipcode.csv'
merged.to_csv(path_or_buf=filename, index=False)