In [None]:
import numpy as np
import pandas as pd
import json

# Setup

In [None]:
# Load the notebook configuration into `params`; later cells read its
# 'years' and 'zip_codes' entries.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's locale-dependent default encoding.
with open('../../config/config.json', 'r', encoding='utf-8') as fh:
    params = json.load(fh)

In [None]:
# Read the raw ZBP detail and totals tables for every configured year,
# storing each DataFrame under its year.
zbp_detail_by_year = {}
zbp_totals_by_year = {}
for year in params['years']:
    # File and folder names embed the year with its first two characters
    # removed (e.g. 2016 -> '16').
    shortened_year = str(year)[2:]
    # Detail files from 2017 on and totals files from 2018 on are read as
    # latin-1; earlier files use read_csv's default encoding.
    detail_encoding = 'latin-1' if year >= 2017 else None
    totals_encoding = 'latin-1' if year >= 2018 else None

    detail_path = f'../../src/data/raw/zbp_data/zbpdetail/zbp{shortened_year}detail/zbp{shortened_year}detail.txt'
    totals_path = f'../../src/data/raw/zbp_data/zbptotals/zbp{shortened_year}totals/zbp{shortened_year}totals.txt'

    zbp_detail_by_year[year] = pd.read_csv(detail_path, encoding=detail_encoding)
    zbp_totals_by_year[year] = pd.read_csv(totals_path, encoding=totals_encoding)

# ZBP Detail

In [None]:
def is_2dig_naics(naics_code):
    """Tell whether a NAICS code string is a two-digit code.

    A code qualifies when its first two characters are numeric and none of
    the characters after them is a digit (so padded codes such as '11----'
    pass, while '111///' does not).

    Parameters
    ----------
    naics_code : str
        NAICS code to test.

    Returns
    -------
    bool
        True if the code matches the two-digit pattern, False otherwise.
    """
    prefix, remainder = naics_code[:2], naics_code[2:]
    # Guard: the leading pair must be numeric (an empty string is not).
    if not prefix.isnumeric():
        return False
    # Everything after the prefix must be non-digit filler.
    return all(not char.isdigit() for char in remainder)

def process_zbp_data(data, year, zip_codes=None):
    """Filter and standardize one year of ZBP detail data.

    Keeps the zip, NAICS, establishment-count and size-class columns,
    restricts rows to the requested zip codes and to two-digit NAICS codes,
    truncates the NAICS code to its first two characters, and adds a
    ``year`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw detail table for a single year. It is not modified.
    year : int
        Year the table belongs to. Years up to 2016 are expected to carry
        the smallest size class as 'n1_4', later years as 'n<5'.
    zip_codes : iterable, optional
        Zip codes to keep. Defaults to ``params['zip_codes']`` from the
        notebook configuration.

    Returns
    -------
    pandas.DataFrame
        New frame with columns zip, naics, est, n1_4, n5_9, ..., n1000, year.

    Raises
    ------
    KeyError
        If any expected column is missing from ``data``.
    """
    if zip_codes is None:
        zip_codes = params['zip_codes']

    # The smallest size class is named differently depending on the year.
    smallest_class = 'n1_4' if year <= 2016 else 'n<5'
    cols = [
        'zip', 'naics', 'est', smallest_class,
        'n5_9', 'n10_19', 'n20_49', 'n50_99',
        'n100_249', 'n250_499', 'n500_999', 'n1000',
    ]

    # select the columns and standardize their names
    data = data[cols].rename(columns={'n<5': 'n1_4'})
    # keep only the relevant zip codes (vectorized membership test)
    data = data[data['zip'].isin(list(zip_codes))]
    # keep only two-digit NAICS rows; the bool cast keeps the mask a boolean
    # indexer even when no rows are left
    naics_mask = data['naics'].map(is_2dig_naics).astype(bool)
    data = data[naics_mask]
    # .assign builds a new frame, so nothing is written into a filtered
    # slice (avoids SettingWithCopyWarning)
    data = data.assign(naics=data['naics'].str[:2], year=year)

    return data

In [None]:
# Clean every year's detail table, replacing the raw frame stored under that year.
# NOTE: because the raw frames are overwritten, re-running this cell requires
# re-running the loading cell first.
for year, raw_detail in zbp_detail_by_year.items():
    zbp_detail_by_year[year] = process_zbp_data(raw_detail, year)

# Stack all years into one table with a fresh 0..n-1 index and save it.
zbp_detail_data = pd.concat(list(zbp_detail_by_year.values()), ignore_index=True)
zbp_detail_data.to_csv('../../src/data/temp/processed_zbp_detail_data.csv', index=False)
zbp_detail_data.head()

# ZBP Totals

In [None]:
def process_zbp_totals(data, year, zip_codes=None):
    """Filter one year of ZBP totals data and tag it with its year.

    Drops the naming columns, restricts rows to the requested zip codes and
    adds a ``year`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw totals table for a single year. It is not modified.
    year : int
        Year the table belongs to. For years up to 2017 the 'empflag'
        column is dropped as well.
    zip_codes : iterable, optional
        Zip codes to keep. Defaults to ``params['zip_codes']`` from the
        notebook configuration.

    Returns
    -------
    pandas.DataFrame
        New frame holding the remaining columns plus ``year``; the original
        row index is preserved.

    Raises
    ------
    KeyError
        If a column that should be dropped is missing from ``data``.
    """
    if zip_codes is None:
        zip_codes = params['zip_codes']

    # drop naming columns
    name_cols = ['name', 'city', 'stabbr', 'cty_name']
    if year <= 2017:
        name_cols.append('empflag')
    data = data.drop(columns=name_cols)

    # keep only the relevant zip codes (vectorized membership test)
    data = data[data['zip'].isin(list(zip_codes))]

    # assign year variable (a scalar is broadcast to every row)
    return data.assign(year=year)

In [None]:
# Clean every year's totals table, replacing the raw frame stored under that year.
# NOTE: because the raw frames are overwritten, re-running this cell requires
# re-running the loading cell first.
for year, raw_totals in zbp_totals_by_year.items():
    zbp_totals_by_year[year] = process_zbp_totals(raw_totals, year)

# Stack all years into one table with a fresh 0..n-1 index and save it.
zbp_totals_data = pd.concat(list(zbp_totals_by_year.values()), ignore_index=True)
zbp_totals_data.to_csv('../../src/data/temp/processed_zbp_totals_data.csv', index=False)
zbp_totals_data.head()