In [None]:
import numpy as np
import pandas as pd
import json

In [None]:
# Read the notebook configuration from the shared JSON config file.
with open('../../config/config.json', 'r') as fh:
    params = json.load(fh)

# Load one raw ZBP detail table per configured year, keyed by year.
zbp_by_year = {}
for year in params['years']:
    shortened_year = str(year)[2:]
    detail_path = f'../../src/data/raw/zbp_data/zbp{shortened_year}detail/zbp{shortened_year}detail.txt'
    # Files after 2018 are read as latin-1; earlier ones use read_csv's default encoding.
    read_kwargs = {} if year <= 2018 else {'encoding': 'latin-1'}
    zbp_by_year[year] = pd.read_csv(detail_path, **read_kwargs)

In [None]:
def is_2dig_naics(naics_code):
    """Return True if ``naics_code`` is a 2-digit code, e.g. ``'11----'``.

    A code qualifies when its first two characters are both digits and no
    digit appears anywhere after them.

    Parameters
    ----------
    naics_code : str
        Code string as it appears in the 'naics' column.

    Returns
    -------
    bool
        True for 2-digit codes; False otherwise, including strings shorter
        than two characters.
    """
    prefix, suffix = naics_code[:2], naics_code[2:]
    # The length check matters: slicing a 1-character string yields a
    # 1-character prefix, which would otherwise pass the digit test alone.
    has_two_digit_prefix = len(prefix) == 2 and prefix.isdigit()
    return has_two_digit_prefix and not any(char.isdigit() for char in suffix)

def process_zbp_data(data, year, zip_codes=None):
    """Trim one year's raw ZBP table to the wanted ZIP codes and 2-digit NAICS rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table for a single year. Must contain 'zip', 'naics', 'est',
        'n5_9' through 'n1000', and either 'n1_4' (year <= 2016) or
        'n<5' (later years).
    year : int
        Year the table belongs to. Selects which smallest-size column name
        to read and is written to the output 'year' column.
    zip_codes : iterable, optional
        ZIP codes to keep. When omitted, the notebook-level
        ``params['zip_codes']`` is used.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns zip, naics (truncated to its first two
        characters), est, n1_4, n5_9, ..., n1000 and year. The input frame
        is not modified.

    Raises
    ------
    KeyError
        If any expected column is missing from ``data``.
    """
    if zip_codes is None:
        zip_codes = params['zip_codes']

    smallest_size_col = 'n1_4' if year <= 2016 else 'n<5'
    cols = [
        'zip', 'naics', 'est', smallest_size_col,
        'n5_9', 'n10_19', 'n20_49', 'n50_99',
        'n100_249', 'n250_499', 'n500_999', 'n1000',
    ]

    # filter and standardize col names
    data = data[cols].rename(columns={'n<5': 'n1_4'})

    # filter only relevant zip codes
    data = data.loc[data['zip'].isin(list(zip_codes))]

    # keep only 2dig naics; forcing bool dtype keeps this a row mask even
    # when no rows are left after the zip filter
    is_2dig = data['naics'].apply(is_2dig_naics).astype(bool)
    data = data.loc[is_2dig]

    # .assign builds a new frame rather than writing into a filtered slice,
    # which avoids pandas' SettingWithCopyWarning
    return data.assign(naics=data['naics'].str[:2], year=year)

In [None]:
# Replace each year's raw table with its processed version.
# NOTE: this overwrites the raw frames held in zbp_by_year, so re-running this
# cell without re-running the load cell feeds already-processed frames back in.
for year, raw_frame in list(zbp_by_year.items()):
    zbp_by_year[year] = process_zbp_data(raw_frame, year)

# Stack all years into one table with a fresh 0..n-1 index.
yearly_frames = list(zbp_by_year.values())
stacked = pd.concat(yearly_frames, ignore_index=True)
# reset_index() without drop=True turns that index into an 'index' column,
# which therefore ends up in the saved CSV.
zbp_data = stacked.reset_index()

zbp_data.to_csv('../../src/data/temp/processed_zbp_data.csv', index=False)
zbp_data.head()