# Load libraries and functions

In [1]:
# Execute the shared setup scripts at the top level of this notebook, so every
# name they define (imports, helper functions) lands in the notebook's own
# namespace and is usable by the cells below.
# Each script is compiled with its real path first: errors raised inside it
# are then reported against that file rather than the anonymous "<string>".
for _script_path in ('libraries.py', 'functions/functions.py'):
    with open(_script_path) as _script_file:
        _script_source = _script_file.read()
    exec(compile(_script_source, _script_path, 'exec'))

### Read DAG creation functions

In [2]:
# Execute each DAG-creation script at the top level of this notebook, so the
# functions they define become available to the cells below.
# Each script is compiled with its real path first: errors raised inside it
# are then reported against that file rather than the anonymous "<string>".
for _script_path in ('functions/manual_DAG_function.py',
                     'functions/naive_DAG_function.py',
                     'functions/firstLastTask_DAG_function.py',
                     'functions/partitioned_DAG_function.py',
                     'functions/condition_DAG_function.py'):
    with open(_script_path) as _script_file:
        _script_source = _script_file.read()
    exec(compile(_script_source, _script_path, 'exec'))

In [3]:
def get_tasks(onet_data_path,
              occupation_code,
              year=2023):
    """Return the unique task descriptions of one occupation for one year.

    Parameters
    ----------
    onet_data_path : str
        Path to a CSV file containing at least the columns ``year``,
        ``occ_code``, ``occ_title``, ``task_id`` and ``task``.
    occupation_code : str
        Occupation code; it is converted to a string and compared for
        equality against the ``occ_code`` column.
    year : int, default 2023
        Only rows whose ``year`` column equals this value are used.

    Returns
    -------
    list of str
        Unique task strings in ``task_id`` order (after sorting by
        ``occ_title``), with every apostrophe character removed. The list is
        empty when no row matches.
    """
    # Load the data
    onet = pd.read_csv(onet_data_path)

    # Select the occupation/year once, then sort only the matching rows
    # instead of sorting the whole table and filtering on the year twice.
    is_selected = (onet['year'] == year) & (onet['occ_code'] == str(occupation_code))
    selected = onet[is_selected].sort_values(by=['year', 'occ_code', 'occ_title', 'task_id'])

    # Get list of tasks; unique() keeps first-seen order, i.e. the sorted order
    tasks = selected['task'].unique().tolist()

    # A missing task is read as NaN (a float), which has no .replace method,
    # so non-string entries are skipped rather than crashing.
    return [task.replace("'", "") for task in tasks if isinstance(task, str)]  # remove apostrophes

## Main Code

In [4]:
# Determine who is running the notebook and select that user's project folder.
import warnings

user = getpass.getuser()
if user == 'peymansh':
    main_folder_path = '/Users/peymansh/Dropbox (MIT)/Research/AI and Occupations/ai-exposure'
    data_path = f'{main_folder_path}/output'
else:
    # No folder is configured for this user, so main_folder_path and data_path
    # stay undefined and the cells below would fail with a bare NameError.
    # Report the real cause here instead of failing silently.
    warnings.warn(
        f"No project folder is configured for user '{user}': "
        "main_folder_path and data_path are undefined. "
        "Add a branch for this user before running the cells below."
    )

### Run DAGs

In [5]:
onet_data_path = f'{data_path}/data/onet_occupations_yearly.csv'

# Full list of occupations to create DAGs for
all_occupations = ['travelAgents', 'insuranceUnderwriters', 'pileDriverOperators',
                   'dredgeOperators', 'gradersAndSortersForAgriculturalProducts', 'reinforcingIronAndRebarWorkers',
                   'insuranceAppraisersForAutoDamage', 'floorSandersAndFinishers', 'dataEntryKeyer',
                   'athletesAndSportsCompetitors', 'audiovisualEquipmentInstallerAndRepairers', 'hearingAidSpecialists',
                   'personalCareAides', 'proofreadersAndCopyMarkers', 'chiropractors',
                   'shippingReceivingAndInventoryClerks', 'cooksShortOrder', 'orthodontists',
                   'subwayAndStreetcarOperators', 'packersAndPackagersHand', 'hoistAndWinchOperators',
                   'forgingMachineSettersOperatorsAndTenders', 'avionicsTechnicians', 'dishwashers',
                   'dispatchersExceptPoliceFireAndAmbulance', 'familyMedicinePhysicians', 'MachineFeedersAndOffbearers'
                   ]

# Occupations processed in this run: the first twelve entries of the full list
# ('travelAgents' through 'hearingAidSpecialists'). The full list used to be
# assigned to occupation_list and immediately overwritten by this subset; it
# now has its own name so the selection is explicit.
occupation_list = all_occupations[:12]

# Alternative selections; uncomment one to override the list above.
# occupation_list = ['packersAndPackagersHand', 'hoistAndWinchOperators',
#                    'forgingMachineSettersOperatorsAndTenders', 'dishwashers',
#                    'dispatchersExceptPoliceFireAndAmbulance', 'familyMedicinePhysicians', 'MachineFeedersAndOffbearers'
#                    ]

# occupation_list = ['dishwashers',
#                    'dispatchersExceptPoliceFireAndAmbulance', 'familyMedicinePhysicians', 'MachineFeedersAndOffbearers'
#                    ]

# occupation_list = ['travelAgents', 'insuranceUnderwriters', 'pileDriverOperators',
#                    ]

In [6]:
# Build every DAG variant for each occupation in occupation_list, printing
# progress and the runtime per occupation and for the whole batch.
import time

# Occupations that additionally get a manual DAG; create_manual_DAG is given
# their '<occupation>_AM.csv' file as input.
manual_DAG_occupations = {'travelAgents', 'insuranceUnderwriters', 'pileDriverOperators'}

# perf_counter() is a monotonic clock intended for measuring elapsed time; it
# is unaffected by system clock adjustments, unlike time.time().
start_time = time.perf_counter()

for occupation in occupation_list:
    occupation_start_time = time.perf_counter()
    print(f'--------------- Running: {occupation} ---------------')

    # Generate occupation-specific strings
    GPT_input_occupation, plot_title_occupation, occupation_code, occupation_folder = pick_occupation(occupation)

    # Get occupation tasks
    tasks = get_tasks(onet_data_path, occupation_code)
    print(f'Number of tasks: {len(tasks)}')

    # 1) Manual DAG
    if occupation in manual_DAG_occupations:
        create_manual_DAG(occupation,
                          input_filename = f'{occupation_folder}/{occupation}_AM.csv',
                          output_filename = f'{occupation_folder}/{occupation}_M_DAG_df.csv')

    # 2) Naive DAG(s)
    print('\n*** Naive DAG ***')
    naive_DAG(GPT_input_occupation,
              tasks,
              lastTask_output_filename = f'{occupation_folder}/{occupation}_N_lastTasks.csv',
              output_DAG_filename_naive = f'{occupation_folder}/{occupation}_N_GPT_DAG_df.csv',
              output_DAG_filename_naiveTwoStep = f'{occupation_folder}/{occupation}_N2_GPT_DAG_df.csv',
              conditioned_DAG_output_filename = f'{occupation_folder}/{occupation}_CN_GPT_DAG_df.csv')

    # 3) First-last-task DAG
    print('\n*** First-last-task DAG ***')
    firstLastTask_DAG(GPT_input_occupation,
                      tasks,
                      firstLastTask_output_filename = f'{occupation_folder}/{occupation}_FLT_tasks.csv',
                      firstLastTask_DAG_output_filename = f'{occupation_folder}/{occupation}_FLT_GPT_DAG_df.csv',
                      conditioned_DAG_output_filename = f'{occupation_folder}/{occupation}_CFLT_GPT_DAG_df.csv')

    # 4) Partitioned DAG
    print('\n*** Partitioned DAG ***')
    partitioned_DAG(GPT_input_occupation,
                    tasks,
                    lastTask_output_filename = f'{occupation_folder}/{occupation}_P_lastTasks.csv',
                    partitions_output_filename = f'{occupation_folder}/{occupation}_P_partitions.csv',
                    partitioned_DAG_output_filename = f'{occupation_folder}/{occupation}_P_GPT_DAG_df.csv',
                    conditioned_partitioned_DAG_output_filename = f'{occupation_folder}/{occupation}_CP_GPT_DAG_df.csv')

    # Per-occupation runtime, in seconds
    occupation_end_time = time.perf_counter()
    occupation_execution_time = occupation_end_time - occupation_start_time
    print(f"\n******** {occupation} runtime: {occupation_execution_time:.2f} seconds ********\n")

# Total runtime, converted from seconds to minutes
end_time = time.perf_counter()
execution_time = (end_time - start_time)/60
print(f"\n\nTotal Runtime: {execution_time:.2f} minutes")

--------------- Running: travelAgents ---------------
Number of tasks: 8

*** Naive DAG ***
Count of triangles: 26
Number of quadrilaterals: 1
Number of AC-DC edges to remove: 0
Number of AC edges to remove: 12
Total number of edges to remove: 12

*** First-last-task DAG ***
Count of triangles: 35
Number of quadrilaterals: 14
Number of AC-DC edges to remove: 0
Number of AC edges to remove: 10
Total number of edges to remove: 10

*** Partitioned DAG ***
Count of triangles: 43
Number of quadrilaterals: 2
Number of AC-DC edges to remove: 0
Number of AC edges to remove: 22
Total number of edges to remove: 22

******** travelAgents runtime: 32.92 seconds ********

--------------- Running: insuranceUnderwriters ---------------
Number of tasks: 7

*** Naive DAG ***
Count of triangles: 25
Number of quadrilaterals: 2
Number of AC-DC edges to remove: 0
Number of AC edges to remove: 16
Total number of edges to remove: 16

*** First-last-task DAG ***
Count of triangles: 26
Number of quadrilaterals

TypeError: cannot use a string pattern on a bytes-like object