#### By: Peyman Shahidi
#### Created: Oct 29, 2025
#### Last Edit: Nov 3, 2025

<br>

In [1]:
#Python
import getpass
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
import random 

# Render floats with a thousands separator and two decimals (1000 -> 1,000.00)
pd.set_option('display.float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

In [2]:
# Folder layout, expressed relative to the parent of the working directory
main_folder_path = ".."

# Inputs live under <main>/data; computed objects are written below the inputs
input_data_path = main_folder_path + "/data"
output_data_path = input_data_path + "/computed_objects"

# Plots go to the write-up folder
output_plot_path = main_folder_path + "/writeup/plots"

In [3]:
# Create the output directories if they don't exist
import os

for path in [output_data_path, output_plot_path]:
    # exist_ok=True makes this a no-op for folders that are already there and
    # avoids the check-then-create race of os.path.exists() + os.makedirs().
    os.makedirs(path, exist_ok=True)

## Create Employment Shares from BLS

In [4]:
# Read the BLS workbook from the shared input-data folder (input_data_path)
# instead of rebuilding the "<main>/data" prefix by hand.
bls_orig_df = pd.read_excel(f"{input_data_path}/oesm23all/all_data_M_2023.xlsx")

In [5]:
# Work on a filtered copy so bls_orig_df stays untouched and this cell can be re-run.
# Keep only national-level data (AREA == 99).
bls_df = bls_orig_df.loc[bls_orig_df['AREA'] == 99].copy()

# Drop rows whose TOT_EMP is the string '**' and report the row count before/after.
# (presumably '**' is the source's marker for an unavailable estimate -- confirm)
print('Length before dropping TOT_EMP == "**":', len(bls_df))
bls_df = bls_df.loc[bls_df['TOT_EMP'] != '**']
print('Length After dropping TOT_EMP == "**":', len(bls_df))

# Rows carrying the aggregate occupation code hold the per-NAICS employment totals
is_aggregate_occ = bls_df['OCC_CODE'] == '00-0000'

# Per-NAICS totals and each NAICS's share of the summed totals.
# NOTE(review): no I_GROUP filter has been applied at this point, so the sum in
# the denominator presumably spans every industry aggregation level at once --
# confirm that this is the intended base for sectorEmpShare.
sector_total_shares_df = (
    bls_df.loc[is_aggregate_occ, ['NAICS', 'TOT_EMP']]
    .rename(columns={'TOT_EMP': 'totalSectorEmp'})
)
sector_total_shares_df['sectorEmpShare'] = (
    sector_total_shares_df['totalSectorEmp'] / sector_total_shares_df['totalSectorEmp'].sum()
)

# With the totals extracted, remove the aggregate occupation rows
bls_df = bls_df.loc[~is_aggregate_occ]

# Keep only relevant columns
cols = ['NAICS', 'NAICS_TITLE', 'I_GROUP', 'OCC_CODE', 'OCC_TITLE', 'O_GROUP', 'TOT_EMP']
bls_df = bls_df[cols]

Length before dropping TOT_EMP == "**": 177501
Length After dropping TOT_EMP == "**": 169536


In [6]:
# For every (BLS industry level, O*NET occupation level) pair:
#   1. subset bls_df to that pair and clean TOT_EMP,
#   2. save percentiles of the positive TOT_EMP values,
#   3. attach sector totals, compute employment shares and save the result.

# Specify sector and ONET levels of interest
bls_sector_levels = ['sector', '3-digit', '4-digit', '5-digit', '6-digit']
onet_levels = ['major', 'minor', 'broad', 'detailed']

# Percentiles to report (in percent) and the same values as fractions,
# because pandas' quantile expects q in [0, 1]. Identical for every level pair.
pct = [0.01, 1, 5, 10, 25, 50, 75, 90, 99, 99.9, 99.99]
qs = [p / 100.0 for p in pct]

# Output folders (creating the nested percentiles folder also creates out_dir)
out_dir = f'{output_data_path}/BLS_ONET_empShares'
pctl_out_dir = f'{out_dir}/percentiles'
os.makedirs(pctl_out_dir, exist_ok=True)

# Economy-wide denominator for occ_totalEmpShare. It does not depend on the
# level pair being processed, so it is computed once. Only the totalSectorEmp
# column is summed, so the NAICS labels are not fed through .sum() as well.
# NOTE(review): sector_total_shares_df is built before any I_GROUP filter, so
# this presumably adds up the '00-0000' totals of every aggregation level at
# once (sector, 3-digit, ...), i.e. counts the same employment several times.
# Confirm whether the denominator should be restricted to a single level.
total_emp = sector_total_shares_df[['NAICS', 'totalSectorEmp']].drop_duplicates()['totalSectorEmp'].sum()

# Columns identifying an industry when looking up its total employment
group_cols = ['NAICS', 'NAICS_TITLE']

# product() iterates in the same order as the nested loops it replaces:
# all ONET levels for the first sector level, then the next sector level, ...
for my_sector, my_onet_level in itertools.product(bls_sector_levels, onet_levels):
    # Subset to BLS industry and O*NET level of interest
    sector_df = bls_df[(bls_df['I_GROUP'] == my_sector) & (bls_df['O_GROUP'] == my_onet_level)].copy()
    sector_df = sector_df.drop_duplicates()

    # Clean TOT_EMP: coerce to numeric (unparseable entries become NaN) ...
    sector_df['TOT_EMP'] = pd.to_numeric(sector_df['TOT_EMP'], errors='coerce')
    # ... set fractional (non-integer) entries to 0 ...
    non_integer_mask = sector_df['TOT_EMP'].notna() & (sector_df['TOT_EMP'] % 1 != 0)
    if non_integer_mask.any():
        num_non_int = non_integer_mask.sum()
        print(f"Setting {num_non_int} non-integer TOT_EMP values to 0")
        sector_df.loc[non_integer_mask, 'TOT_EMP'] = 0
    # ... and replace remaining NaN with 0 before casting to integer
    sector_df['TOT_EMP'] = sector_df['TOT_EMP'].fillna(0).astype(int)

    # Percentiles are taken over strictly positive employment counts only.
    # TOT_EMP is already a clean integer column here, so no further coercion is needed.
    vals = sector_df.loc[sector_df['TOT_EMP'] > 0, 'TOT_EMP']

    if len(vals) == 0:
        print("No numeric TOT_EMP values found in sector_df.")
    else:
        pct_values = vals.quantile(q=qs)

        # Save the percentiles to a per-level CSV
        out = pd.DataFrame({
            'percentile': pct,
            'TOT_EMP': pct_values.values
        })
        out_file = os.path.join(pctl_out_dir, f'TOTEMP_pctle_BLS{my_sector}_ONET{my_onet_level}.csv')
        out.to_csv(out_file, index=False)

    # Compute employment shares
    # 1) add total sector employment and sector shares
    sector_df = sector_df.merge(sector_total_shares_df, on='NAICS', how='left')

    # 2) occ_sectorEmpShare: share of employment within the NAICS / NAICS_TITLE group
    #    (rows without a matching sector total get a share of 0)
    group_total = sector_df.groupby(group_cols)['totalSectorEmp'].transform('first')
    sector_df['occ_sectorEmpShare'] = (sector_df['TOT_EMP'] / group_total).fillna(0)

    # 3) occ_totalEmpShare: share of employment relative to total_emp (see note above)
    sector_df['occ_totalEmpShare'] = sector_df['TOT_EMP'] / total_emp

    # Save sector_df for further analysis (overwrites previous file)
    sector_df.to_csv(f'{out_dir}/BLS{my_sector}_ONET{my_onet_level}_empShares.csv', index=False)
    print(f"✓ {my_sector}-level BLS data at {my_onet_level} ONET Level saved with {sector_df.shape[0]:,} rows")

✓ sector-level BLS data at major ONET Level saved with 424 rows
✓ sector-level BLS data at minor ONET Level saved with 1,597 rows
✓ sector-level BLS data at broad ONET Level saved with 5,504 rows
✓ sector-level BLS data at detailed ONET Level saved with 8,153 rows
✓ 3-digit-level BLS data at major ONET Level saved with 1,783 rows
✓ 3-digit-level BLS data at minor ONET Level saved with 5,607 rows
✓ 3-digit-level BLS data at broad ONET Level saved with 15,774 rows
✓ 3-digit-level BLS data at detailed ONET Level saved with 21,770 rows
✓ 4-digit-level BLS data at major ONET Level saved with 4,580 rows
✓ 4-digit-level BLS data at minor ONET Level saved with 13,512 rows
✓ 4-digit-level BLS data at broad ONET Level saved with 33,156 rows
✓ 4-digit-level BLS data at detailed ONET Level saved with 43,562 rows
✓ 5-digit-level BLS data at major ONET Level saved with 805 rows
✓ 5-digit-level BLS data at minor ONET Level saved with 2,133 rows
✓ 5-digit-level BLS data at broad ONET Level saved with 

### Reshuffle TOT_EMP Values 1000 Times to Randomize Weights at (sector-detailed) Level

In [7]:
# Build randomized employment weights for the (sector, detailed) slice:
# each iteration permutes the TOT_EMP column across rows and writes one CSV.
my_sector = 'sector'
my_onet_level = 'detailed'
sector_df = bls_df[(bls_df['I_GROUP'] == my_sector) & (bls_df['O_GROUP'] == my_onet_level)].copy()
sector_df = sector_df.drop_duplicates()

# Set variables
iterations = 1000

# Base seed for the reshuffles. Every iteration draws from its own generator
# seeded with (shuffle_seed, i), so the content of file i is reproducible and
# does not depend on which other iterations were skipped because their file
# already existed.
shuffle_seed = 0

output_dir = f'{output_data_path}/BLS_ONET_empShares/BLS{my_sector}_ONET{my_onet_level}_reshuffledWeights'
os.makedirs(output_dir, exist_ok=True)

# Not used in this cell; retained in case a later cell expects it to exist
results = []

# The values being permuted are the same in every iteration, so extract them once
tot_emp_values = sector_df['TOT_EMP'].values

for i in range(iterations):
    output_path = f"{output_dir}/BLS{my_sector}_ONET{my_onet_level}_empShares_iter{i+1}.csv"
    # Files already on disk are left untouched, so an interrupted run can be resumed
    if os.path.exists(output_path):
        continue

    if i % 50 == 0:
        print(f"Reshuffling iteration {i}")

    # Shuffle TOT_EMP values with this iteration's own generator
    rng = np.random.default_rng([shuffle_seed, i])
    shuffled = rng.permutation(tot_emp_values)

    # Make a copy with shuffled values
    shuffled_df = sector_df.copy()
    shuffled_df['TOT_EMP'] = shuffled

    # Save reshuffled data
    shuffled_df.to_csv(output_path, index=False)

Reshuffling iteration 0...
Reshuffling iteration 50...
Reshuffling iteration 100...
Reshuffling iteration 150...
Reshuffling iteration 200...
Reshuffling iteration 250...
Reshuffling iteration 300...
Reshuffling iteration 350...
Reshuffling iteration 400...
Reshuffling iteration 450...
Reshuffling iteration 500...
Reshuffling iteration 550...
Reshuffling iteration 600...
Reshuffling iteration 650...
Reshuffling iteration 700...
Reshuffling iteration 750...
Reshuffling iteration 800...
Reshuffling iteration 850...
Reshuffling iteration 900...
Reshuffling iteration 950...
