In [None]:
import numpy as np
import pandas as pd
from matplotlib import pylab as plt

# This notebook matches the ASEC occupation codes to the BLS occupation codes actually used

In [None]:
# Directory locations, relative to the notebook's working directory
data = '../data/'                  # input files (ASEC table, crosswalk sheet, imputation tables)
data_out = '../results/data_out/'  # BLS occupation table is read from here; all outputs are written here

Read in data: 1) occupations in BLS, 2) occupations in ASEC, 3) the crosswalk between 2010 Census codes and 2010 SOC codes

In [None]:
# BLS occupation table: only the first four CSV columns are loaded, the first of them as the index
bls_used = pd.read_csv(
    data_out + 'occ_names_bls_minor_major.csv',
    usecols=[0, 1, 2, 3],
    index_col=0,
)

# ASEC occupation table (later cells use its `Code` and `EMP_2011_2019_avg` columns)
asec_used = pd.read_csv(data + 'occ_names_employment_asec_occ.csv')

# Crosswalk spreadsheet: skip the first four rows and keep spreadsheet columns B through D ...
online = pd.read_excel(
    data + '2010-occ-codes-with-crosswalk-from-2002-2011.xls',
    skiprows=4,
    usecols='B:D',
)
# ... then discard every row that has a missing value in any of those columns
online = online.dropna()

In [None]:
# Attach a BLS code (OCC_CODE) to every crosswalk row, in two attempts:
#   1. exact match of the SOC code against the BLS codes;
#   2. match of the SOC code with its last character replaced by '0'.
# The exact match wins; the zero-ended match only fills rows left empty by step 1.
online = (
    online
    # SOC codes as whitespace-stripped strings
    .assign(**{'2010 SOC Code': lambda df: df['2010 SOC Code'].astype(str).str.strip()})
    # temporary fallback key: same code with the final character swapped for '0'
    .assign(**{'2010 SOC Code zero': lambda df: df['2010 SOC Code'].str[:-1] + '0'})
    .merge(
        bls_used[['OCC_CODE']],
        how='left',
        left_on='2010 SOC Code',
        right_on='OCC_CODE',
    )
    .merge(
        bls_used[['OCC_CODE']],
        how='left',
        left_on='2010 SOC Code zero',
        right_on='OCC_CODE',
        suffixes=('', '_y'),  # the fallback match lands in OCC_CODE_y
    )
    .assign(OCC_CODE=lambda df: df['OCC_CODE'].fillna(df['OCC_CODE_y']))
    # the helper columns are no longer needed once OCC_CODE is filled
    .drop(columns=['2010 SOC Code zero', 'OCC_CODE_y'])
)

In [None]:
# Imputation table keyed by '2010 SOC Code'; its `impute` column supplies a BLS code
# for rows that are still without an OCC_CODE after the direct matches.
occ_impute = pd.read_csv(data + 'soc_imputation.csv')
online = (
    online
    .merge(occ_impute, how='left', on='2010 SOC Code')
    .assign(OCC_CODE=lambda df: df['OCC_CODE'].fillna(df['impute']))
    .drop(columns='impute')
)

In [None]:
# Attach an ASEC code (`Code`) to every crosswalk row:
#   - Census codes are converted to numbers (anything non-numeric becomes NaN);
#   - rows are matched to the ASEC table on that numeric code;
#   - rows without a match fall back to the `imputation` column of the ASEC imputation table.
asec_impute = pd.read_csv(data + 'asec_imputation.csv')
online = (
    online
    .assign(**{'2010 Census Code': pd.to_numeric(online['2010 Census Code'], errors='coerce')})
    .merge(
        asec_used[['Code']],
        how='left',
        left_on='2010 Census Code',
        right_on='Code',
    )
    .merge(asec_impute, how='left', on='2010 Census Code')
    .assign(Code=lambda df: df['Code'].fillna(df['imputation']))
    .drop(columns='imputation')
)

In [None]:
# all codes in crosswalk, except fishing/hunting workers and military
# Four set differences, one per printed line:
# crosswalk Census codes vs. ASEC codes (both directions), then crosswalk BLS codes vs. BLS table (both directions).
print(
    set(online['2010 Census Code']) - set(asec_used['Code']),
    set(asec_used['Code']) - set(online['2010 Census Code']),
    set(online['OCC_CODE']) - set(bls_used['OCC_CODE']),
    set(bls_used['OCC_CODE']) - set(online['OCC_CODE']),
    sep='\n',
)

In [None]:
# Keep only crosswalk rows that have no missing value in any column
online = online.dropna()

In [None]:
# Display the crosswalk as it stands after the rows with missing values were dropped
online

In [None]:
# all codes in crosswalk, except fishing/hunting workers and military
# Same four set differences as before, but now using the assigned ASEC `Code` column
# instead of the raw Census code; one result per printed line.
print(
    set(online['Code']) - set(asec_used['Code']),
    set(asec_used['Code']) - set(online['Code']),
    set(online['OCC_CODE']) - set(bls_used['OCC_CODE']),
    set(bls_used['OCC_CODE']) - set(online['OCC_CODE']),
    sep='\n',
)

In [None]:
# Save the distinct (BLS code, ASEC code) pairs as an edge list
(
    online
    .loc[:, ['OCC_CODE', 'Code']]
    .drop_duplicates()
    .to_csv(data_out + 'edgelist_bls_asec_2011-2019.csv')
)

In [None]:
# Indicator matrix of the crosswalk: rows are BLS codes, columns are (integer) ASEC codes,
# 1 where the pair occurs in the edge list and 0 elsewhere.
xwalk = (
    online[['OCC_CODE', 'Code']]
    .drop_duplicates()
    .astype({'Code': int})
    .assign(val=1)
    .pivot_table(values='val', index='OCC_CODE', columns='Code')
    .fillna(0)  # pairs absent from the edge list come out of the pivot as NaN
)

In [None]:
# Display the BLS-by-ASEC indicator matrix built by the pivot above
xwalk

### Proportional fitting: employment-weighted crosswalks

In [None]:
# BLS total employment, indexed by BLS code
bls_emp = bls_used.set_index('OCC_CODE')[['TOT_EMP']]

# Weight each row (BLS code) of the indicator matrix by its BLS employment ...
xwalk_asec_bls = xwalk.mul(bls_emp['TOT_EMP'], axis='index')
# ... and rescale each column (ASEC code) by its own total, turning the weights into shares
xwalk_asec_bls = xwalk_asec_bls / xwalk_asec_bls.sum()

xwalk_asec_bls.to_csv(data_out + 'xwalk_asec_bls_2011-2019.csv')

In [None]:
# ASEC employment (the EMP_2011_2019_avg column), indexed by ASEC code
asec_emp = asec_used.set_index('Code')[['EMP_2011_2019_avg']]

# Weight each column (ASEC code) of the indicator matrix by its ASEC employment ...
xwalk_bls_asec = xwalk.mul(asec_emp['EMP_2011_2019_avg'], axis='columns')
# ... and rescale each row (BLS code) by its own total, turning the weights into shares
xwalk_bls_asec = xwalk_bls_asec.div(xwalk_bls_asec.sum(axis='columns'), axis='index')

In [None]:
# Save the row-normalised (BLS rows, ASEC columns) crosswalk to the output directory
xwalk_bls_asec.to_csv(data_out + 'xwalk_bls_asec_2011-2019.csv')

#### Assert sums equal one

In [None]:
# Every BLS row of the ASEC-weighted crosswalk must sum to one (within floating-point tolerance)
np.testing.assert_array_almost_equal(
    xwalk_bls_asec.sum(axis='columns').to_numpy(),
    np.ones(xwalk_bls_asec.shape[0]),
)

In [None]:
# Every ASEC column of the BLS-weighted crosswalk must sum to one.
# The entries are floating-point quotients, so their sum can differ from 1.0 in the last
# bits; compare with a tolerance (as the row check on xwalk_bls_asec does) rather than
# demanding bit-exact equality, which can fail spuriously.
np.testing.assert_array_almost_equal(
    xwalk_asec_bls.sum(axis=0).values,
    np.ones(len(xwalk_asec_bls.columns)),
)