In [1]:
import numpy as np
import pandas as pd
import json

# Setup

In [2]:
# Read the shared project configuration; later cells look up
# params['years'] and params['zip_codes'] from it.
with open('../../config/config.json') as config_file:
    params = json.load(config_file)

In [3]:
# Load one raw detail frame and one raw totals frame per configured year.
zbp_detail_by_year = {}
zbp_totals_by_year = {}
for year in params['years']:
    # folder and file names embed the year without its first two characters (2012 -> '12')
    shortened_year = str(year)[2:]
    # detail files from 2017 on and totals files from 2018 on are read as latin-1;
    # earlier years fall back to the read_csv default (encoding=None)
    detail_encoding = 'latin-1' if year >= 2017 else None
    totals_encoding = 'latin-1' if year >= 2018 else None
    zbp_detail_by_year[year] = pd.read_csv(
        f'../../src/data/raw/zbp_data/zbpdetail/zbp{shortened_year}detail/zbp{shortened_year}detail.txt',
        encoding=detail_encoding,
    )
    zbp_totals_by_year[year] = pd.read_csv(
        f'../../src/data/raw/zbp_data/zbptotals/zbp{shortened_year}totals/zbp{shortened_year}totals.txt',
        encoding=totals_encoding,
    )

# ZBP Detail

In [4]:
def is_2dig_naics(naics_code):
    """Return True when ``naics_code`` denotes a 2-digit NAICS code.

    A code qualifies when its first two characters are decimal digits and no
    digit appears anywhere after them (e.g. ``'23----'`` qualifies, while
    ``'236///'`` and ``'------'`` do not).

    Parameters
    ----------
    naics_code : str
        NAICS code as a string.

    Returns
    -------
    bool
    """
    prefix, suffix = naics_code[:2], naics_code[2:]
    # Require a full two-character decimal prefix: a lone digit such as '2'
    # is not a 2-digit code, and isdecimal() rejects numerals like '½'.
    if len(prefix) != 2 or not prefix.isdecimal():
        return False
    return not any(char.isdigit() for char in suffix)

def process_zbp_data(data, year, zip_codes=None):
    """Reduce one year of raw ZBP detail data to the standardized columns.

    Selects the zip, NAICS, establishment-count and size-class columns,
    renames ``'n<5'`` to ``'n1_4'``, keeps only rows whose zip is in
    ``zip_codes`` and whose NAICS code passes ``is_2dig_naics``, truncates the
    NAICS code to its first two characters, and adds a ``year`` column.
    The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw detail frame for a single year.
    year : int
        Year of the data; selects which smallest-size-class column name is
        expected (``'n1_4'`` through 2016, ``'n<5'`` afterwards).
    zip_codes : list-like, optional
        Zip codes to keep. Defaults to ``params['zip_codes']``.

    Returns
    -------
    pandas.DataFrame
        Filtered frame with the standardized columns plus ``year``.

    Raises
    ------
    KeyError
        If ``data`` lacks one of the expected columns.
    """
    if zip_codes is None:
        zip_codes = params['zip_codes']

    smallest_size_col = 'n1_4' if year <= 2016 else 'n<5'
    cols = ['zip', 'naics', 'est', smallest_size_col,
            'n5_9', 'n10_19', 'n20_49', 'n50_99', 'n100_249', 'n250_499', 'n500_999', 'n1000']

    # filter and standardize col names
    data = data[cols].rename(columns={'n<5': 'n1_4'})
    # filter only relevant zip codes (vectorized membership test, always a boolean mask)
    data = data[data['zip'].isin(zip_codes)]
    # keep only 2dig naics; astype(bool) keeps the mask boolean even when no rows are left
    is_2dig = data['naics'].map(is_2dig_naics).astype(bool)
    data = data[is_2dig]
    # build the new columns with assign rather than writing into a filtered slice
    # (avoids SettingWithCopyWarning); 'naics' keeps its position, 'year' is appended
    data = data.assign(
        naics=data['naics'].str[:2],
        year=np.full(data.shape[0], year),
    )

    return data

In [5]:
# Replace each year's raw detail frame with its processed version.
# NOTE: this overwrites zbp_detail_by_year in place. process_zbp_data selects the
# raw 'n<5' column for years after 2016 and renames it, so re-running this cell
# without re-running the load cell would hand it frames lacking that column.
for year, raw_detail in list(zbp_detail_by_year.items()):
    zbp_detail_by_year[year] = process_zbp_data(raw_detail, year)

# Stack all years into one frame (ignore_index gives a fresh 0..n-1 index),
# write it to the temp folder, and preview the first rows.
processed_detail_frames = list(zbp_detail_by_year.values())
zbp_detail_data = pd.concat(processed_detail_frames, ignore_index=True)
zbp_detail_data.to_csv('../../src/data/temp/processed_zbp_detail_data.csv', index=False)
zbp_detail_data.head()

Unnamed: 0,zip,naics,est,n1_4,n5_9,n10_19,n20_49,n50_99,n100_249,n250_499,n500_999,n1000,year
0,91901,23,88,68,7,6,5,1,0,1,0,0,2012
1,91901,31,2,1,1,0,0,0,0,0,0,0,2012
2,91901,42,9,9,0,0,0,0,0,0,0,0,2012
3,91901,44,67,25,21,16,4,1,0,0,0,0,2012
4,91901,48,8,8,0,0,0,0,0,0,0,0,2012


# ZBP Totals

In [6]:
def process_zbp_totals(data, year, zip_codes=None):
    """Reduce one year of raw ZBP totals data to the rows and columns of interest.

    Drops the naming columns (plus ``'empflag'`` for years up to 2017), keeps
    only rows whose zip is in ``zip_codes``, and adds a ``year`` column.
    The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw totals frame for a single year.
    year : int
        Year of the data; ``'empflag'`` is dropped only when ``year <= 2017``.
    zip_codes : list-like, optional
        Zip codes to keep. Defaults to ``params['zip_codes']``.

    Returns
    -------
    pandas.DataFrame
        Filtered frame with a trailing ``year`` column.

    Raises
    ------
    KeyError
        If one of the columns to drop, or ``'zip'``, is missing from ``data``.
    """
    if zip_codes is None:
        zip_codes = params['zip_codes']

    # drop naming columns
    cols = ['name', 'city', 'stabbr', 'cty_name']
    if year <= 2017:
        cols += ['empflag']
    data = data.drop(columns=cols)
    # filter only relevant zip codes (vectorized membership test, always a boolean mask)
    data = data[data['zip'].isin(zip_codes)]
    # assign year variable
    data = data.assign(year=np.full(data.shape[0], year))
    return data

In [7]:
# Replace each year's raw totals frame with its processed version.
# NOTE: this overwrites zbp_totals_by_year in place. process_zbp_totals drops the
# naming columns, so re-running this cell without re-running the load cell would
# hand it frames that no longer contain them.
for year, raw_totals in list(zbp_totals_by_year.items()):
    zbp_totals_by_year[year] = process_zbp_totals(raw_totals, year)

# Stack all years into one frame (ignore_index gives a fresh 0..n-1 index),
# write it to the temp folder, and preview the first rows.
processed_totals_frames = list(zbp_totals_by_year.values())
zbp_totals_data = pd.concat(processed_totals_frames, ignore_index=True)
zbp_totals_data.to_csv('../../src/data/temp/processed_zbp_totals_data.csv', index=False)
zbp_totals_data.head()

Unnamed: 0,zip,emp_nf,emp,qp1_nf,qp1,ap_nf,ap,est,year
0,91901,H,4141,H,36304,H,174786,391,2012
1,91902,G,2265,G,19111,G,81569,349,2012
2,91903,S,0,G,123,G,491,13,2012
3,91905,G,19,S,0,H,748,9,2012
4,91906,D,0,D,0,D,0,27,2012


In [13]:
# Print the number of distinct zip codes present in the totals data for each
# year, in order of each year's first appearance in the frame.
year_column = zbp_totals_data['year']
for year in year_column.unique():
    zips_in_year = zbp_totals_data.loc[year_column == year, 'zip']
    print(zips_in_year.nunique())

175
174
174
175
175
167
165
167
167
166


array([2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021])