In [None]:
import numpy as np
import pandas as pd

# Data read and clean

In [None]:
# Edge list of ASEC occupation transitions; later cells rely on its OCCLY
# (origin) and OCC (destination) occupation-code columns.
edgelist_asec = pd.read_csv('../data/omn/edgelist_qualitycontrol_2011_2019.csv')
# Crosswalk matrix: rows are BLS occupation codes (e.g. '27-4090'), columns
# are ASEC occupation codes, cells hold the fraction linking the two.
xwalk = pd.read_csv('../results/data_out/xwalk_asec_bls_2011-2019.csv', index_col=0)

In [None]:
# The CSV header is read as text; cast the ASEC codes in the column labels to
# int so they can be compared with, and looked up by, integer codes below.
xwalk.columns = xwalk.columns.astype(int)

In [None]:
# Reconcile the occupation codes of the crosswalk and the edge list.
#
# * 2960 (misc media and comm equipment workers) is a crosswalk column that
#   is missing from the edge list. It maps on 27-4090 (employment 2018:
#   18,790). We fold it into 2900 (broadcast and sound engineering
#   technicians and radio), which maps on 27-4010 (employment 2018: 121,890):
#   2900 is split over both BLS codes in proportion to their 2018 employment
#   and the 2960 column is dropped.
# * 6100 (fishing workers) and 6110 (hunters) are in the edge list but not in
#   the crosswalk, so their rows are removed from the edge list.

crosswalk_codes = set(xwalk.columns)
edgelist_codes = set(edgelist_asec.OCC)
print(crosswalk_codes - edgelist_codes)
print(edgelist_codes - crosswalk_codes)

print(xwalk.loc['27-4090', 2960])
print(xwalk.loc['27-4010', 2960])

# 2018 employment of the two BLS occupations that 2900 is spread over
emp_27_4090 = 18790
emp_27_4010 = 121890
emp_both = emp_27_4090 + emp_27_4010

xwalk.loc['27-4090', 2900] = emp_27_4090 / emp_both
xwalk.loc['27-4010', 2900] = emp_27_4010 / emp_both

xwalk = xwalk.drop(columns=2960)

# show what is about to be removed from the edge list
display(edgelist_asec[edgelist_asec.OCC == 6100].head())
display(edgelist_asec[edgelist_asec.OCC == 6110])

edgelist_asec = edgelist_asec[~edgelist_asec.OCC.isin([6100, 6110])]


In [None]:
# Re-run the comparison after the fixes: print the codes present on one side
# only (in both directions), then the two fractions now stored for 2900.
for left, right in ((xwalk.columns, edgelist_asec.OCC),
                    (edgelist_asec.OCC, xwalk.columns)):
    print(set(left) - set(right))

for bls_code in ('27-4090', '27-4010'):
    print(xwalk.loc[bls_code, 2900])

In [None]:
# List the ASEC codes (columns) whose fractions add up to more than 1.
# 7330 maps on both 49-9041 (employment 2018: 362,440) and 49-9045
# (employment 2018: 1,120) and its total comes out larger than 1, but that
# seems to be a rounding error. Ignore.
column_totals = xwalk.sum()
print(column_totals[column_totals > 1.000].index)

for bls_code in ('49-9041', '49-9045'):
    print(xwalk.loc[bls_code, 7330])

In [None]:
# Check solar PV installers: confirm that the data use the corrected labels
# for codes 6530 / 6540 described in the IPUMS reply quoted below.

# IPUMS staff: Thank you for bringing this to our
# attention. It does appear that these labels were inadvertently
# switched. In the 2011-2019 samples, OCC = 6530 should refer to
# structural iron and steel workers while OCC = 6540 are solar
# photovoltaic installers. I have alerted the CPS team to this
# issue so that it can be addressed.

# Rows whose destination occupation (OCC) is 6540, i.e. the incoming links
# into solar PV installers.
display(edgelist_asec[edgelist_asec.OCC == 6540])

# Solar PV installers are BLS code 47-2230, so the crosswalk should link that
# row to ASEC 6540 and not to ASEC 6530.
print(xwalk.loc['47-2230', 6540]) # should be 1
print(xwalk.loc['47-2230', 6530]) # should be 0

# all good

# Crosswalk

In [None]:
# We want to crosswalk both OCCLY and OCC to the BLS format; preview the
# edge list that is about to be converted.
edgelist_asec.head()

In [None]:
# Turn the crosswalk matrix into a long edge list with one row per
# (ASEC code, BLS code) pair that has a positive fraction.
xedge = (
    xwalk.unstack()                          # Series keyed by (ASEC column, BLS row)
    .loc[lambda fractions: fractions > 0]    # keep only actual links
    .rename('frac')
    .reset_index()                           # ASEC code lands in the auto-named 'level_0' column
)

xedge.head()

In [None]:
# Merge the crosswalk on the edge list, once per end of the transition.
# Each merge brings in the BLS code ('OCC_CODE') and its fraction. Because
# 'frac' arrives twice, the second merge suffixes it: frac_x is the fraction
# of the OCCLY side, frac_y the fraction of the OCC side.
with_occly_bls = (
    edgelist_asec
    .merge(xedge, how='outer', left_on='OCCLY', right_on='level_0')
    .drop(columns='level_0')
    .rename(columns={'OCC_CODE': 'OCCLY_BLS'})
)

edgelist_bls = (
    with_occly_bls
    .merge(xedge, how='outer', left_on='OCC', right_on='level_0')
    .drop(columns='level_0')
    .rename(columns={'OCC_CODE': 'OCC_BLS'})
)

In [None]:
# Some ASEC occupations are split over several BLS occupations, so every row
# carries a fraction for its OCCLY side (frac_x) and its OCC side (frac_y).
occly_share = edgelist_bls['frac_x']
occ_share = edgelist_bls['frac_y']
edgelist_bls['frac'] = occly_share * occ_share

# The weight, the number of observations and the adjusted transitions are
# scaled row by row with the combined fraction of both ends.
to_mult_frac = ['ASECWT', 'Observations', 'transition_adj_networkers']
edgelist_bls[to_mult_frac] = edgelist_bls[to_mult_frac].mul(edgelist_bls['frac'], axis=0)

# The EMPOCCLY_... and EMPOCC_... totals are only multiplied with the
# fraction of the side they belong to.
to_mult_occly = ['EMPOCCLY_unadj']
to_mult_occ = ['EMPOCC_unadj', 'EMPOCC_incimp']
edgelist_bls[to_mult_occly] = edgelist_bls[to_mult_occly].mul(occly_share, axis=0)
edgelist_bls[to_mult_occ] = edgelist_bls[to_mult_occ].mul(occ_share, axis=0)

In [None]:
# Other times several ASEC occupations merge on one BLS occupation.
# Count the rows that share their (OCCLY_BLS, OCC_BLS) pair with another row.
bls_pair = ['OCCLY_BLS', 'OCC_BLS']
print(edgelist_bls.duplicated(subset=bls_pair, keep=False).sum())

# Employment totals per BLS occupation: keep one row per (ASEC, BLS) link so
# that each contribution is counted once, then sum within the BLS code.
tot_emp_asec = (
    edgelist_bls[['OCC', 'EMPOCC_unadj', 'EMPOCC_incimp', 'OCC_BLS']]
    .drop_duplicates(subset=['OCC', 'OCC_BLS'])
    .drop(columns='OCC')
    .groupby('OCC_BLS')
    .sum()
)
tot_emply_asec = (
    edgelist_bls[['OCCLY', 'EMPOCCLY_unadj', 'OCCLY_BLS']]
    .drop_duplicates(subset=['OCCLY', 'OCCLY_BLS'])
    .drop(columns='OCCLY')
    .groupby('OCCLY_BLS')
    .sum()
)

In [None]:
def _comma_join(values):
    """Concatenate the string values of one group, separated by commas."""
    return ','.join(values)


# Collapse the rows to one per (OCCLY_BLS, OCC_BLS) pair, which merges the
# ASEC occupations that map on the same BLS pair: the weights, observations
# and fractions are summed, the ASEC codes and labels are concatenated.
# The codes are turned into strings first so that they can be joined.
edgelist_bls = edgelist_bls.astype({'OCCLY': str, 'OCC': str})

bls_pairs = edgelist_bls.groupby(['OCCLY_BLS', 'OCC_BLS'])
edgelist_bls = bls_pairs.agg(
    OCCLY=('OCCLY', _comma_join),
    OCC=('OCC', _comma_join),
    ASECWT=('ASECWT', 'sum'),
    transition_adj_networkers=('transition_adj_networkers', 'sum'),
    Observations=('Observations', 'sum'),
    OCCLY_label=('OCCLY_label', _comma_join),
    OCC_label=('OCC_label', _comma_join),
    frac_x=('frac_x', 'sum'),
    frac_y=('frac_y', 'sum'),
    frac=('frac', 'sum'),
).reset_index()

# The employment totals were not part of the aggregation; attach the per-BLS
# totals computed beforehand to the destination and the origin side.
edgelist_bls = (
    edgelist_bls
    .merge(tot_emp_asec, how='left', left_on='OCC_BLS', right_index=True)
    .merge(tot_emply_asec, how='left', left_on='OCCLY_BLS', right_index=True)
)

In [None]:
# Read in the BLS occupation names and re-index the table by BLS code
# (OCC_CODE) so it can be merged on the *_BLS columns of the edge list.
# Earlier source of the names, kept for reference:
#bls_name = pd.read_csv('../results/data_out/scenario/annual_change/employment_effects_95% by 2035_2021-2022__3_oct_2022.csv', index_col=0, usecols=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14])
bls_name = pd.read_csv('../results/data_out/occ_names_bls_minor_major.csv', index_col=0).set_index('OCC_CODE')

In [None]:
# Add the BLS occupation attributes to the edge list twice: once describing
# the destination occupation (columns suffixed _OCC) and once describing the
# origin occupation (columns suffixed _OCCLY).
#
# add_suffix returns a relabelled copy and leaves bls_name untouched. It
# replaces set_axis(..., inplace=True), which raises a TypeError on
# pandas >= 2.0 because the `inplace` argument was removed there.
bls_name_OCCLY = bls_name.add_suffix('_OCCLY')
bls_name_OCC = bls_name.add_suffix('_OCC')

# merge on the BLS code stored in the index of the name tables; the outer
# joins also keep BLS occupations that have no row in the edge list
edgelist_bls = edgelist_bls.merge(bls_name_OCC, left_on='OCC_BLS', right_index=True, how='outer')
edgelist_bls = edgelist_bls.merge(bls_name_OCCLY, left_on='OCCLY_BLS', right_index=True, how='outer')

In [None]:
# Tag every aggregated quantity with a _BLS suffix.
bls_quantities = [
    'ASECWT', 'transition_adj_networkers', 'Observations',
    'EMPOCCLY_unadj', 'EMPOCC_unadj', 'EMPOCC_incimp',
]
mp = {name: name + '_BLS' for name in bls_quantities}
edgelist_bls = edgelist_bls.rename(columns=mp)

In [None]:
# Visual check of the merged and renamed edge list before it is written out.
edgelist_bls

In [None]:
# Reorder and save files.
#
# Two CSVs are written: a "helperdoc" file with every BLS attribute plus the
# crosswalk fractions, and a slimmer edge list with the core columns only.
# The rename is chained instead of done with inplace=True: the frame is a
# column subset of the previous edgelist_bls, and renaming such a subset in
# place is what pandas flags with SettingWithCopyWarning.

helper_columns = [
    'OCCLY_BLS', 'OCC_BLS', 'ASECWT_BLS', 'transition_adj_networkers_BLS',
    'Observations_BLS', 'OCC_TITLE_OCC', 'OCC_TITLE_OCCLY',
    'EMPOCCLY_unadj_BLS', 'EMPOCC_unadj_BLS', 'EMPOCC_incimp_BLS',
    'TOT_EMP_OCC', 'A_MEAN_OCC', 'class_minor_OCC', 'class_major_OCC',
    'OCC_CODE_minor_OCC', 'OCC_TITLE_minor_OCC', 'TOT_EMP_minor_OCC',
    'A_MEAN_minor_OCC', 'OCC_CODE_major_OCC', 'OCC_TITLE_major_OCC',
    'TOT_EMP_major_OCC', 'A_MEAN_major_OCC',
    'TOT_EMP_OCCLY', 'A_MEAN_OCCLY', 'class_minor_OCCLY', 'class_major_OCCLY',
    'OCC_CODE_minor_OCCLY', 'OCC_TITLE_minor_OCCLY', 'TOT_EMP_minor_OCCLY',
    'A_MEAN_minor_OCCLY', 'OCC_CODE_major_OCCLY', 'OCC_TITLE_major_OCCLY',
    'TOT_EMP_major_OCCLY', 'A_MEAN_major_OCCLY',
    'OCCLY', 'OCC', 'OCCLY_label', 'OCC_label', 'frac_x', 'frac_y', 'frac'
]

# the original ASEC codes and labels are kept, marked with an _ASEC suffix
asec_names = {
    'OCCLY': 'OCCLY_ASEC',
    'OCC': 'OCC_ASEC',
    'OCCLY_label': 'OCCLY_label_ASEC',
    'OCC_label': 'OCC_label_ASEC'
}

edgelist_bls = edgelist_bls[helper_columns].rename(columns=asec_names)

edgelist_bls.to_csv('../data/omn/jb/helperdoc_edgelist_qualitycontrol_2011_2019_bls-9feb.csv')


# slim version: core BLS quantities, titles, employment / wage columns and
# the ASEC provenance columns
core_columns = [
    'OCCLY_BLS', 'OCC_BLS', 'ASECWT_BLS', 'transition_adj_networkers_BLS',
    'Observations_BLS', 'OCC_TITLE_OCC', 'OCC_TITLE_OCCLY',
    'EMPOCCLY_unadj_BLS', 'EMPOCC_unadj_BLS', 'EMPOCC_incimp_BLS', 'TOT_EMP_OCC',
    'A_MEAN_OCC', 'TOT_EMP_OCCLY', 'A_MEAN_OCCLY',
    'OCCLY_ASEC', 'OCC_ASEC', 'OCCLY_label_ASEC',
    'OCC_label_ASEC'
]

edgelist_bls = edgelist_bls[core_columns]

edgelist_bls.to_csv('../data/omn/jb/edgelist_qualitycontrol_2011_2019_bls-9feb.csv')

# Employment file

In [None]:
# Employment per ASEC occupation; the first CSV column becomes the index,
# which later cells compare against the ASEC codes in xwalk.columns.
emp_asec = pd.read_csv('../data/omn/emp_asec_2011_2019_includeimputations.csv', index_col=0)

In [None]:
# Print the codes that appear on only one side (employment table versus
# crosswalk columns), then remove 6100/6110 from emp_asec.
asec_codes = set(emp_asec.index)
xwalk_codes = set(xwalk.columns)
print(asec_codes - xwalk_codes)
print(xwalk_codes - asec_codes)

emp_asec = emp_asec.drop(index=[6100, 6110])

In [None]:
# Project each ASEC employment series onto the BLS occupations: the
# crosswalk (BLS rows x ASEC columns) times the per-ASEC-code vector gives
# one value per BLS code. The two results are put side by side.
emp_bls = pd.concat(
    [xwalk @ emp_asec[weight] for weight in ('ASECWT', 'ASECWT_average_years')],
    axis=1,
)

In [None]:
# Name the columns after the series they were computed from. emp_bls is
# built as [ASECWT, ASECWT_average_years] in that fixed order, so the labels
# are spelled out rather than copied positionally from emp_asec.columns,
# which would swap them if the input CSV ever listed its columns the other
# way round.
emp_bls.columns = ['ASECWT_BLS', 'ASECWT_average_years_BLS']

In [None]:
# Write the employment totals per BLS occupation (the index is written too).
emp_bls.to_csv('../data/omn/jb/emp_asec_2011_2019_includeimputations_bls-9feb.csv')