In [None]:
import numpy as np
import pandas as pd

In [None]:
# Base directories, relative to the notebook. Both keep their trailing slash
# because later cells build file names by plain string concatenation.
data = "../data/"                  # raw inputs
data_out = "../results/data_out/"  # files written by this notebook

# This notebook reads the BLS occupation - industry data and saves a bipartite adjacency matrix between the two, using the relevant industries and occupations

Employment data per sector (the interlayer links) come from the BLS, which provides data at various levels of industry aggregation. I will first show some statistics of this dataset.

In [None]:
# BLS 2018 workbooks: occupations by industry. Every workbook read in this
# cell sits in the same folder.
folder = "BLS/oesm18in4/oesm18in4/"

# Occupations at sector level and at 3-, 4- and 5/6-digit industry detail
# (detail level as indicated by the file names).
filename_occ_sect = "natsector_M2018_dl.xlsx"
filename_occ_3d = "nat3d_M2018_dl.xlsx"
filename_occ_4d = "nat4d_M2018_dl.xlsx"
filename_occ_5d = "nat5d_6d_M2018_dl.xlsx"

# Occupations by ownership, 3- and 4-digit detail.
# The national ownership workbook ("national_owner_M2018_dl.xlsx") is not loaded here.
filename_occ_3d_owner = "nat3d_owner_M2018_dl.xlsx"
filename_occ_4d_owner = "nat4d_owner_M2018_dl.xlsx"

# Only the sector workbook is read by explicit sheet name; the other
# workbooks are read from their default (first) sheet.
tab = "natsector_dl"
df_occ_sect = pd.read_excel(
    data + folder + filename_occ_sect,
    sheet_name=tab,
    header=0,
)
df_occ_3d = pd.read_excel(data + folder + filename_occ_3d, header=0)
df_occ_4d = pd.read_excel(data + folder + filename_occ_4d, header=0)
df_occ_5d = pd.read_excel(data + folder + filename_occ_5d, header=0)

df_occ_3d_ow = pd.read_excel(data + folder + filename_occ_3d_owner, header=0)
df_occ_4d_ow = pd.read_excel(data + folder + filename_occ_4d_owner, header=0)

In [None]:
# Occupations at the national level, used for wage imputation.
# Note that this reuses (and overwrites) the name `filename_occ_sect`.
filename_occ_sect = "BLS/oesm18nat/national_M2018_dl.xlsx"
tab = "national_dl"

# Keep one row per distinct (occupation code, mean annual wage) pair.
df_occ_wage = (
    pd.read_excel(data + filename_occ_sect, sheet_name=tab, header=0)
    [['OCC_CODE', 'A_MEAN']]
    .drop_duplicates()
)

In [None]:
# Rows whose mean annual wage is the placeholder '*'. We lack wage data for
# most workers in the 27-2000 occupations
# ('Entertainers and Performers, Sports and Related Workers').
df_occ_wage[df_occ_wage['A_MEAN'].eq('*')]

In [None]:
# The annual wage is available for the higher-level group 27-2000.
df_occ_wage[df_occ_wage['OCC_CODE'].eq('27-2000')]

In [None]:
# Impute the more detailed occupations (wage marked '*') with the mean annual
# wage of the higher-level group 27-2000.
# The group wage is extracted as a scalar so the assignment does not depend on
# broadcasting a length-1 array over however many rows are masked; `.iloc[0]`
# raises IndexError if the 27-2000 row is absent.
# NOTE(review): every '*' row receives the 27-2000 wage regardless of its own
# code - confirm that only 27-2xxx occupations carry '*' in this table.
parent_wage = df_occ_wage.loc[df_occ_wage['OCC_CODE'] == '27-2000', 'A_MEAN'].iloc[0]
df_occ_wage.loc[df_occ_wage['A_MEAN'] == '*', 'A_MEAN'] = parent_wage

In [None]:
# Combine the tables into one list of industries used by BLS.
# `ignore_index=True` gives every stacked frame a fresh, unique RangeIndex;
# without it the row labels of the individual workbooks repeat, which makes
# any later label-aligned operation on these frames fragile.
df_occ = pd.concat([df_occ_sect, df_occ_3d, df_occ_4d, df_occ_5d], ignore_index=True)
df_occ_ow = pd.concat([df_occ_3d_ow, df_occ_4d_ow], ignore_index=True)

# Ownership rows get their own industry code: the NAICS code (as text) with the
# OWNERSHIP value appended. Where that concatenation yields NaN (e.g. OWNERSHIP
# is missing) the original NAICS value is kept.
df_occ_ow['NAICS'] = (
    df_occ_ow['NAICS'].astype(str) + df_occ_ow['OWNERSHIP']
).fillna(df_occ_ow['NAICS'])

df_occ = pd.concat([df_occ, df_occ_ow], ignore_index=True)

Select the data we need for the BEA industries and OMN codes

In [None]:
# Occupation codes to keep: the row labels (first column) of the BLS-IPUMS crosswalk ...
occupations_BLS_IPUMS = pd.read_csv(
    data_out + '2018_BLS_IPUMS_crosswalk_proportional.csv', header=0, index_col=0
).index.to_numpy()

# ... plus a list of additional occupations ...
additional = pd.read_csv(data + 'soc_additional.csv')
additional = additional.code.to_numpy()
occupations_BLS_IPUMS = np.concatenate((occupations_BLS_IPUMS, additional))

# ... minus a few codes that are removed again. np.setdiff1d already returns
# the sorted, unique values, so no separate sort is needed.
occupations_BLS_IPUMS = np.setdiff1d(occupations_BLS_IPUMS, ['11-2000', '23-1000', '39-4000'])

# Take an explicit copy: a later cell rewrites the NAICS column of BLS_useful,
# and writing to a filtered slice of df_occ triggers pandas'
# SettingWithCopyWarning (the write may or may not reach the intended frame).
BLS_useful = df_occ[df_occ['OCC_CODE'].isin(occupations_BLS_IPUMS)].copy()

In [None]:
# Occupations at the national level, used for naming.
filename_national = "BLS/oesm18nat/national_M2018_dl.xlsx"
tab = "national_dl"
df_bls = pd.read_excel(data + filename_national, sheet_name=tab, header=0)

# Names (plus employment and wage) of all occupations we need, one row each.
occ_names = (
    df_bls[df_bls['OCC_CODE'].isin(occupations_BLS_IPUMS)]
    .copy()
    [['OCC_CODE', 'OCC_TITLE', 'TOT_EMP', 'A_MEAN']]
    .drop_duplicates()
)

# Minor and major group codes are the detailed code with more trailing zeros:
# first four characters + '000', and first three characters + '0000'.
occ_names = occ_names.assign(
    class_minor=occ_names['OCC_CODE'].str.slice(stop=4) + '000',
    class_major=occ_names['OCC_CODE'].str.slice(stop=3) + '0000',
)

# Printing workers (51-5100) have no aggregate occupation 51-5000, and the
# same holds for 15-1000, so those two minor codes are redirected.
occ_names['class_minor'] = occ_names['class_minor'].replace(
    {'51-5000': '51-5100', '15-1000': '15-1100'}
)

# Attach title, employment and wage of the minor group, then of the major
# group; the added columns carry the suffix '_minor' / '_major'.
for level in ('minor', 'major'):
    occ_names = occ_names.merge(
        df_bls[['OCC_CODE', 'OCC_TITLE', 'TOT_EMP', 'A_MEAN']],
        how='left',
        left_on='class_' + level,
        right_on='OCC_CODE',
        suffixes=('', '_' + level),
    )

# No wage data ('*') for two entertainment occupations: use the minor-level wage.
occ_names.loc[occ_names['A_MEAN'].eq('*'), 'A_MEAN'] = \
    occ_names.loc[occ_names['A_MEAN'].eq('*'), 'A_MEAN_minor']

occ_names.to_csv(data_out + 'occ_names_bls_minor_major.csv')

In [None]:
# Industry codes (column 'BLS_NAICS') for the 389- and the 71-industry
# classification, reduced to their unique values.
industries_389 = pd.read_csv(data + 'Data_out_BLS_NAICS/others/BEA_389_BLS_NAICS_othermethod_strict.csv', header=0, index_col=1)['BLS_NAICS'].to_numpy()
industries_389 = np.unique(industries_389)

industries_71 = pd.read_csv(data + 'Data_out_BLS_NAICS/BEA_71_BLS_NAICS.csv', header=0, index_col=1)['BLS_NAICS'].to_numpy()
industries_71 = np.unique(industries_71)

# Right-pad every NAICS code with zeros to at least six characters.
# `assign` rebinds BLS_useful to a new frame instead of writing a column into
# what may be a filtered slice (which raises SettingWithCopyWarning).
BLS_useful = BLS_useful.assign(NAICS=BLS_useful['NAICS'].apply('{:0<6}'.format))

# Explicit copies: both subsets get their A_MEAN column rewritten in a later
# cell, which must not operate on slices of BLS_useful.
BLS_useful_389 = BLS_useful[BLS_useful['NAICS'].isin(industries_389.astype(str))].copy()
BLS_useful_71 = BLS_useful[BLS_useful['NAICS'].isin(industries_71.astype(str))].copy()

In [None]:
def imputewage(row, wage_table=None):
    """Return the wage of `row`, falling back to an occupation-level wage.

    Parameters
    ----------
    row : object with attributes ``A_MEAN`` and ``OCC_CODE``
        Typically one row of a DataFrame, as passed by ``DataFrame.apply(axis=1)``.
    wage_table : DataFrame with columns ``OCC_CODE`` and ``A_MEAN``, optional
        Lookup table for the fallback wage. Defaults to the notebook-level
        ``df_occ_wage``.

    Returns
    -------
    ``row.A_MEAN`` unchanged unless it equals the placeholder ``'*'``. In that
    case the ``A_MEAN`` of the first ``wage_table`` row with the same
    ``OCC_CODE`` is returned; if the table has no such row, the placeholder
    ``'*'`` is returned as-is (previously this raised IndexError).
    """
    if row.A_MEAN != '*':
        return row.A_MEAN

    if wage_table is None:
        wage_table = df_occ_wage

    matches = wage_table.loc[wage_table['OCC_CODE'] == row.OCC_CODE, 'A_MEAN']
    if matches.empty:
        # Nothing to impute from: leave the marker so the row stays visible
        # to any later check for remaining '*' values.
        return row.A_MEAN
    return matches.iloc[0]

In [None]:
# Impute the mean wage of the occupation wherever the wage of the specific
# industry-occupation pair is not known ('*').
# `assign` builds a new frame rather than writing a column into a frame that
# was created by boolean filtering (which raises SettingWithCopyWarning);
# the function is passed directly, no lambda wrapper is needed.
BLS_useful_71 = BLS_useful_71.assign(A_MEAN=BLS_useful_71.apply(imputewage, axis=1))
BLS_useful_389 = BLS_useful_389.assign(A_MEAN=BLS_useful_389.apply(imputewage, axis=1))

In [None]:
# Rows of the 71-industry table whose wage is still the placeholder '*'.
BLS_useful_71[BLS_useful_71['A_MEAN'].eq('*')]

We filter the occupations and industries we need from the entire dataset and build the industry-occupation matrices, for both the 71-industry and the 389-industry case

In [None]:
# For each industry classification, write four industry x occupation matrices:
# total employment, employment shares per industry, mean annual wage, and
# total wage (mean wage * employment).
for (size, df) in [('71', BLS_useful_71), ('389', BLS_useful_389)]:
    df = df[~df.duplicated(keep='first')]

    # remove data if total employment is not known (i.e. we assume it to be 0)
    # (.copy() so the column assignment below does not write into a slice)
    df = df[df['TOT_EMP'].astype(str).str.isnumeric()].copy()
    df['TOT_EMP'] = df['TOT_EMP'].astype(int)

    ind_occ_crosswalk = pd.crosstab(index = df['NAICS'], columns = df['OCC_CODE'], values = df['TOT_EMP'], aggfunc='sum').fillna(0)
    # note: do not use this for BEA matching directly, as the industries don't match exactly (see below)
    ind_occ_crosswalk.to_csv(data_out + f'2018_{size}_ind_occ_tot_emp.csv')

    # This one is fine, as it only looks at relative employment figures:
    ind_occ_crosswalk.div(ind_occ_crosswalk.sum(axis=1), axis=0).to_csv(data_out + f'2018_{size}_ind_occ_tot_emp_relative_ind.csv')

    # keep only rows with a numeric wage for the wage matrix
    df = df[df['A_MEAN'].astype(str).str.isnumeric()].copy()
    df['A_MEAN'] = df['A_MEAN'].astype(int)

    # Wage matrix. Unknown industry-occupation wages stay NaN at this point:
    # filling them with 0 here (as was done before) made the imputation below
    # a no-op. 'mean' rather than 'sum' so that a repeated industry-occupation
    # pair cannot add wages up; with one row per pair both give the same value.
    annual_mean_wage = pd.crosstab(index = df['NAICS'], columns = df['OCC_CODE'],\
            values = df['A_MEAN'], aggfunc='mean')

    # impute average wage: per occupation, the employment-weighted mean over
    # the industries where the wage is known. Employment in cells without a
    # known wage is excluded from the weights, otherwise the average is
    # biased towards zero.
    emp_with_wage = ind_occ_crosswalk.reindex_like(annual_mean_wage).where(annual_mean_wage.notna())
    weighted_average_wage = annual_mean_wage.mul(emp_with_wage).sum() / emp_with_wage.sum()
    # a Series passed to fillna is applied column by column (per occupation);
    # anything still missing (zero total weight) falls back to 0 as before
    annual_mean_wage = annual_mean_wage.fillna(weighted_average_wage).fillna(0)

    annual_mean_wage.to_csv(data_out + f'2018_{size}_ind_occ_avg_wage.csv')

    #total_wage; cells present in only one of the two matrices become 0
    annual_mean_wage.mul(ind_occ_crosswalk).fillna(0.0).to_csv(data_out + f'2018_{size}_ind_occ_tot_wage.csv')

In [None]:
# Also save the EMP_PRSE column as an industry x occupation matrix
# (71-industry case only).
df = BLS_useful_71.drop_duplicates()

# Rows without a numeric total employment are dropped, i.e. their employment
# is assumed to be 0.
df = df[df['TOT_EMP'].astype(str).str.isnumeric()]
df = df.assign(TOT_EMP=df['TOT_EMP'].astype(int))

prse_crosswalk = pd.crosstab(
    index=df['NAICS'],
    columns=df['OCC_CODE'],
    values=df['EMP_PRSE'],
    aggfunc='sum',
).fillna(0)
# Caveat: not to be used for BEA matching directly, because the industries
# do not match exactly.
prse_crosswalk.to_csv(data_out + '2018_71_ind_occ_tot_emp_prse.csv')

In [None]:
# Check that aggfunc does not make a difference, i.e. that every
# industry-occupation pair occurs only once: 'mean' and 'sum' must then agree.
# The comparison has to be `!= 0`: for non-negative values mean - sum is never
# positive, so the former `a > 0` filter was empty even when pairs repeated.
a = (pd.crosstab(index = df['NAICS'], columns = df['OCC_CODE'], values = df['EMP_PRSE'], aggfunc='mean').fillna(0) - \
    pd.crosstab(index = df['NAICS'], columns = df['OCC_CODE'], values = df['EMP_PRSE'], aggfunc='sum').fillna(0)).\
    stack()
# an empty result means the two aggregations agree everywhere
a[a != 0]