In [None]:
import pandas as pd
import numpy as np

In [None]:
# Folder locations relative to this notebook. Each ends with '/' because file
# names are appended to them by plain string concatenation in the cells below.
data = '../data/'  # inputs read by this notebook
data_out = '../results/data_out/'  # tables produced by the notebooks (read from and written to below)
fig_folder = '../results/figs/'  # presumably the figure output folder; not used in the cells shown here

# This notebook will combine the IPUMS OMN, BLS occupations-industries and the BEA's IO into one network

The supra-adjacency matrix will have 4 blocks A, B, C1, and C2, all of which are adjacency matrices in their own right:

In [None]:
# Sketch of the 2x2 block layout of the supra-adjacency matrix.
# Built from a plain nested list: NumPy no longer recommends np.matrix, and the
# printed table is the same.
print(pd.DataFrame([['A', 'C1'], ['C2', 'B']]))

In our case, A is the I-O matrix of industries (from BEA), B is the occupational mobility network (from IPUMS), and C1 and C2 are bipartite networks of occupations and industries (from BLS). We have already defined all of these networks in earlier notebooks. In this notebook, we stitch them all together. First, we make sure the indices and columns are the same by forcing the BLS codes into BEA and IPUMS codes. Then we normalise to the correct numbers. C1.T does not necessarily equal C2. For example, C1 can be the fraction of workers from an occupation working in a particular industry, and C2 can be the fraction of workers from an industry working in a particular occupation.

## 1) industries

1. Read in BEA IO
2. Read in BLS industries
3. Match them using a crosswalk and use additional BEA data to split and merge BLS data correctly into BEA industries

In [None]:
# Read IO: Matrix Z (block A of the supra-adjacency matrix).
# The first spreadsheet column is used as the row index.
Z_2018 = pd.read_excel(data + 'Data_out_IO/2018-Z_sum.xlsx', index_col=0)

In [None]:
# Read BLS industries: total employment and total wage tables.
# Both files are loaded the same way (first CSV column becomes the index);
# rows are BLS industries, columns presumably occupations - confirm against the
# notebook that wrote these files.
tot_emp_2018_sum, tot_wage_2018_sum = (
    pd.read_csv(data_out + csv_name, index_col=0)
    for csv_name in ('2018_71_ind_occ_tot_emp.csv', '2018_71_ind_occ_tot_wage.csv')
)

In [None]:
# Read crosswalk.
# Rows are the target (BEA) industries and columns the BLS industries: the cells
# below left-multiply the BLS tables with it and compare its columns to the BLS
# table index. Entries are presumably ownership shares (per the file name).
crosswalk_71_BLS = pd.read_csv(data + 'Data_out_BLS_NAICS/BEA_71_BLS_NAICS_proportional_ownership.csv', index_col=0)

In [None]:
# Sanity check: the crosswalk is applied to the BLS tables positionally (the
# tables are passed as plain arrays via .to_numpy()), so the row order of each
# BLS table must equal the column order of the crosswalk. The number of
# mismatching labels is printed as before, and the notebook now stops if there
# is any mismatch, because the products would otherwise silently assign
# employment/wages to the wrong industries.
emp_label_mismatches = (tot_emp_2018_sum.index.astype(str) != crosswalk_71_BLS.columns).sum()
wage_label_mismatches = (tot_wage_2018_sum.index.astype(str) != crosswalk_71_BLS.columns).sum()
print(emp_label_mismatches)
print(wage_label_mismatches)
assert emp_label_mismatches == 0, "tot_emp_2018_sum rows do not line up with crosswalk_71_BLS columns"
assert wage_label_mismatches == 0, "tot_wage_2018_sum rows do not line up with crosswalk_71_BLS columns"

In [None]:
# Total employees per occupation per BEA sector.
# The product is positional (plain arrays on both sides), so the labels are
# attached explicitly: rows keep the crosswalk's index, columns take the
# column labels of the BLS employment table.
BEA_occ_71 = pd.DataFrame(
    crosswalk_71_BLS.values @ tot_emp_2018_sum.to_numpy(),
    index=crosswalk_71_BLS.index,
    columns=tot_emp_2018_sum.columns,
)

In [None]:
# Total wage per occupation per BEA sector.
# Same positional product as for employment: rows keep the crosswalk's index,
# columns take the column labels of the BLS wage table.
BEA_wage_71 = pd.DataFrame(
    crosswalk_71_BLS.values @ tot_wage_2018_sum.to_numpy(),
    index=crosswalk_71_BLS.index,
    columns=tot_wage_2018_sum.columns,
)

Now we have BLS occupations matched on BEA industries, and the total number of employees and total wage figures per occupation per industry: Matrix A is done, and C1 and C2 are partially done (we still need to convert from BLS occupations to IPUMS occupations).

### 1.1) We can now save the bi-partite graphs BEA-BLS

In [None]:
# Write both bipartite tables (employment and wage) to the output folder.
for _table, _csv_name in (
    (BEA_occ_71, "bi-partite_emp_sum.csv"),
    (BEA_wage_71, "bi-partite_wage_sum.csv"),
):
    _table.to_csv(data_out + _csv_name)

In [None]:
# and variance
tot_emp_2018_sum_prse = pd.read_csv(data_out + '2018_71_ind_occ_tot_emp_prse.csv', index_col=0)

# The PRSE table holds relative standard errors in percent, so per cell:
#   stddev   = prse * x_mean / 100
#   variance = stddev^2
var = (tot_emp_2018_sum.multiply(tot_emp_2018_sum_prse) / 100)**2

# DataFrame.dot(DataFrame) aligns the crosswalk's columns with var's index by
# label. The crosswalk columns come from a CSV header (strings), whereas the
# BLS row labels may have been parsed as numbers, in which case the product
# raises "matrices are not aligned". Cast the row labels to strings (the same
# cast the label check uses) so the alignment works either way.
var.index = var.index.astype(str)

# variance is additive
# NOTE(review): if the crosswalk holds fractional shares w, the variance of a
# share w*X is w^2 * Var(X); using the unsquared weights overstates the
# variance wherever 0 < w < 1. Confirm whether crosswalk_71_BLS**2 is intended.
BEA_var = crosswalk_71_BLS.dot(var)

In [None]:
# Save the employment variances mapped onto BEA industries.
BEA_var.to_csv(data_out + "bi-partite_emp_sum_var.csv")

## 2) Occupations

1. Read in IPUMS OMN
2. Read in IPUMS/BLS crosswalk (with merge/split fractions)
3. convert BLS occupations into IPUMS

In [None]:
# Read OMN
# The matrix file has no header row; its row and column labels are taken from
# the 'Code' column of the occupation-name table (assumes that table lists the
# occupations in the same order as the matrix - TODO confirm).
names = pd.read_csv(data + 'occ_names_class_asec_soc_map.csv', index_col=0)
omn = pd.read_csv(data + 'asec_10_19_avg.csv', header=None)
omn.index = omn.columns = names['Code']

# Row-normalisation of the OMN, currently disabled:
#omn_rel = omn.div(omn.sum(axis=1), axis=0)
#assert(np.allclose(omn_rel.sum(axis=1), np.ones(omn_rel.shape[0])))

In [None]:
# Read crosswalk (with correct merge/split fractions).
# Column labels are normalised to integer-like strings by parsing each one as
# a float, truncating to int and converting back to str (e.g. '10.0' -> '10').
bls_to_omn = pd.read_csv(
    data_out + "2018_BLS_IPUMS_crosswalk_proportional.csv", index_col=0
).rename(columns=lambda code: str(int(float(code))))

In [None]:
# Convert (merge/split) BLS occupations into IPUMS.
# Each table is multiplied by the crosswalk re-indexed to that table's own
# occupation columns, so the label alignment inside .dot() always succeeds.
# (The wage product previously re-indexed by the employment table's columns.)
# NOTE(review): reindex() creates all-NaN rows for BLS occupations missing from
# the crosswalk, and NaN propagates through the product - confirm the crosswalk
# covers every occupation column.
C1_emp_sum = BEA_occ_71.dot(bls_to_omn.reindex(BEA_occ_71.columns))
C1_wage_sum = BEA_wage_71.dot(bls_to_omn.reindex(BEA_wage_71.columns))

### 2.1) We can now save the bi-partite graphs BEA-IPUMS

We have matrix B (`omn`), as well as two versions of matrix C1 (total employment and total wage).

In [None]:
# Write the IPUMS-coded bipartite tables (employment and wage) to the output folder.
for _table, _csv_name in (
    (C1_emp_sum, "bi-partite_emp_sum_IPUMS.csv"),
    (C1_wage_sum, "bi-partite_wage_sum_IPUMS.csv"),
):
    _table.to_csv(data_out + _csv_name)

# We do everything in BLS occupations. Comment out this cell if we require data in IPUMS/ASEC codes

In [None]:
# Point the C1 names back at the BLS-coded tables (a rebinding, not a copy),
# replacing the IPUMS-coded versions computed earlier in the notebook.
C1_emp_sum, C1_wage_sum = BEA_occ_71, BEA_wage_71

### 2.2) Change total wage to total employment compensation

In [None]:
# read in labour compensation
# A single data row is read from the '2018' sheet: the header is taken from row
# index 5, rows 6-82 are skipped, and only spreadsheet columns B:BU are kept,
# with the first of them used as the row label.
labour_comp_sum = pd.read_excel(
    data + "AllTablesIO/IOUse_After_Redefinitions_PRO_1997-2019_Summary.xlsx",
    sheet_name='2018',
    header=5,
    skiprows=range(6, 83),
    nrows=1,
    usecols="B:BU",
    index_col=0,
    engine='openpyxl',
)

In [None]:
# scale up wages to labour compensation, so it fits with IO numbers
# NOTE: this cell rebinds labour_comp_sum (one row -> one row per industry), so
# it is not idempotent: re-run the cell that reads the spreadsheet before
# running this one again.

# Wage total per industry, divided by 1e6 (presumably to bring it to the same
# unit as the compensation row for plotting - confirm the wage unit).
_wagesum = C1_wage_sum.reindex(labour_comp_sum.columns).sum(axis=1).rename('wagesum') / 1000000

# One row per industry with the compensation figure next to the wage total.
labour_comp_sum = labour_comp_sum.T.merge(_wagesum, left_index=True, right_index=True)
labour_comp_sum.plot.bar(figsize=(18, 5))

# Multiplier applied to the raw wage table: compensation / wagesum, divided by
# 1e6 again to undo the rescaling of wagesum above.
labour_comp_sum['factor'] = (
    labour_comp_sum['Compensation of employees'].div(labour_comp_sum['wagesum']).div(1000000)
)

In [None]:
# Scale every industry row of the wage table by that industry's factor.
# Cells that come out as NaN (e.g. industries without a factor) are set to 0.
_factor_by_industry = labour_comp_sum.reindex(C1_wage_sum.index)['factor']
C1_comp_sum = C1_wage_sum.multiply(_factor_by_industry, axis=0).fillna(0)

In [None]:
# Display the per-industry compensation, wage total and scaling factor for inspection.
labour_comp_sum

## 3) Merge industries and occupations into one supra-adjacency matrix and normalise correctly

1. Find correct submatrices
2. Prepare different supra-adjacency matrices as laid out in document by correct normalisation

In [None]:
# combine submatrices pieces into one matrix
# The occupation rows (transposed compensation table) are stacked below Z and
# missing cells are filled with 0. pd.concat replaces DataFrame.append, which
# was removed in pandas 2.0; the row-wise result is the same.
pd.concat([Z_2018, C1_comp_sum.T]).fillna(0).to_csv(data_out + "IO_occupations_2018_sum.csv")

In [None]:
# IO with value added and final consumption and occupations
va = pd.read_excel(data + "Data_out_IO/2018-va_sum.xlsx", index_col=0)
fc = pd.read_excel(data + "Data_out_IO/2018-f_sum.xlsx", index_col=0)

# only keep the labour compensation value added for where we don't have any occupational data (e.g. agriculture + military)
# i.e. subtract the (rounded) compensation already assigned to occupations from
# row 'V001'; industries that C1_comp_sum does not cover keep their original value.
va.loc['V001'] = (va.loc['V001'] - round(C1_comp_sum.sum(axis=1))).fillna(va.loc['V001'])

# Stack the rows: Z, then the occupation rows (gaps filled with 0), then value
# added (left as read). Final consumption is attached as extra columns.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
io_with_occupations = pd.concat([Z_2018, C1_comp_sum.T]).fillna(0)
full_IO_sum = pd.concat([pd.concat([io_with_occupations, va]), fc], axis=1)
# make sure the index codes do not overlap (the displayed frame should be empty)
full_IO_sum[full_IO_sum.index.duplicated(keep=False)]

In [None]:
# Save the combined table (Z, occupation rows, value added, final consumption).
full_IO_sum.to_csv(data_out + "full_IO_occupations_2018_sum.csv")

In [None]:
# employment per 1 million output
# Column totals of the first 71 columns of the combined table serve as output
# per industry (71 is presumably the number of industry columns that precede
# the final-consumption columns - confirm if the industry count changes).
_output_by_industry = full_IO_sum.iloc[:, :71].sum(axis=0)

# Divide each occupation row by the industry totals (aligned on column labels).
A_emp_sum = C1_emp_sum.T / _output_by_industry
A_emp_sum.to_csv(data_out + "emp_per_million_output_sum_2018_no_elec_split.csv")