In [55]:
# Load the shared project code into the notebook namespace.
# Each script is executed at top level (not imported), so every name it defines
# becomes a notebook global -- presumably this is where `pd`, `np`, `getpass`,
# `itertools` and `pick_occupation`, used by later cells, come from (verify
# against the two scripts).
# Compiling with the real filename makes tracebacks point at the script and
# line that failed instead of an anonymous "<string>".
with open('libraries.py') as f:
    code = f.read()
exec(compile(code, 'libraries.py', 'exec'))

with open('functions.py') as f:
    code = f.read()
exec(compile(code, 'functions.py', 'exec'))

In [56]:
# Resolve the machine-specific project folders from the logged-in account name.
# Only the account listed below is configured; for any other account this cell
# defines neither `main_folder_path` nor `data_path`.
user = getpass.getuser()
if user == 'peymansh':
    main_folder_path = '/Users/peymansh/Dropbox (MIT)/Research/AI and Occupations/ai-exposure'
    data_path = main_folder_path + '/output'

In [57]:
def get_tasks(onet_data_path,
              occupation_code,
              year=2023):
    """Return the distinct task descriptions of one occupation for one year.

    Parameters
    ----------
    onet_data_path : str or path-like
        CSV file with (at least) the columns ``year``, ``occ_code``,
        ``occ_title``, ``task_id`` and ``task``.
    occupation_code : str
        Occupation code to select; it is compared as a string against the
        ``occ_code`` column.
    year : int, default 2023
        Year of the O*NET snapshot to use.

    Returns
    -------
    list
        Unique values of the ``task`` column for the selected occupation and
        year, ordered by ``occ_title`` and then ``task_id`` (rows that tie keep
        their file order). Empty when nothing matches.
    """
    onet = pd.read_csv(onet_data_path)

    # Filter first so that only the handful of matching rows has to be sorted.
    is_selected = (onet['year'] == year) & (onet['occ_code'] == f'{occupation_code}')
    selected = onet[is_selected].sort_values(by=['year', 'occ_code', 'occ_title', 'task_id'])

    # unique() keeps first-appearance order, i.e. the sorted order above.
    return selected['task'].unique().tolist()

### Generate all possible partition schemes for the set of tasks (ignoring the structure of the DAG)

In [58]:
from itertools import combinations

def partitions(set_):
    """Yield every set partition of ``set_`` exactly once.

    Each partition is a list of blocks and each block is a list of elements.
    An empty input yields a single empty partition (``[]``).

    To avoid producing the same partition once per ordering of its blocks
    (which grows far faster than the number of distinct partitions), the first
    remaining element is always placed in the block being built; only its
    companions are chosen freely, and the leftover elements are partitioned
    recursively.

    Parameters
    ----------
    set_ : iterable of hashable
        Elements to partition. Repeated elements are treated as one.

    Yields
    ------
    list of list
        One partition; blocks and the elements inside them follow the input
        order.
    """
    # De-duplicate while keeping the caller's order.
    items = list(dict.fromkeys(set_))
    if not items:
        yield []
        return

    anchor, others = items[0], items[1:]
    for companion_count in range(len(others) + 1):
        for companions in combinations(others, companion_count):
            block = [anchor, *companions]
            taken = set(companions)
            leftover = [item for item in others if item not in taken]
            for tail in partitions(leftover):
                yield [block] + tail

def generate_unique_partitions(numbers):
    """Return the distinct partition schemes of ``numbers``.

    Every scheme produced by ``partitions`` is reduced to a frozenset of
    frozensets, so schemes that differ only in the order of their blocks (or of
    the elements inside a block) collapse into one entry.

    Returns
    -------
    list of list of list
        One entry per distinct scheme. The blocks of a scheme are sorted
        against each other, and the schemes are ordered by their number of
        blocks (fewest first). Elements inside a block are not explicitly
        sorted; they come out in frozenset iteration order.
    """
    # Hashable, order-independent fingerprint of each scheme.
    distinct_schemes = {
        frozenset(frozenset(block) for block in scheme)
        for scheme in partitions(numbers)
    }

    # Back to plain lists, with the blocks of each scheme sorted.
    schemes_as_lists = [
        sorted(list(block) for block in scheme)
        for scheme in distinct_schemes
    ]

    # Stable sort: schemes with fewer blocks come first.
    schemes_as_lists.sort(key=len)
    return schemes_as_lists

### Check if a partition scheme is "valid" (i.e., if each of its non-singleton partitions forms a connected graph)

In [59]:
def is_connected(matrix):
    """Check whether the graph described by an adjacency matrix is connected.

    Edge direction is ignored: two nodes are neighbours when either
    ``matrix[i, j] == 1`` or ``matrix[j, i] == 1``. Only entries equal to 1
    count as edges.

    The traversal is iterative, so long chains cannot hit Python's recursion
    limit, and a matrix with no nodes is handled instead of raising
    ``IndexError``.

    Parameters
    ----------
    matrix : array-like, square
        Adjacency matrix of the graph.

    Returns
    -------
    numpy.bool_
        True when every node is reachable from node 0 (trivially True for a
        matrix with no nodes), False otherwise.
    """
    adjacency = np.asarray(matrix)
    num_nodes = adjacency.shape[0]

    # Visited array to keep track of visited nodes
    visited = np.zeros(num_nodes, dtype=bool)
    if num_nodes == 0:
        # Nothing to traverse; np.all of an empty array is True.
        return np.all(visited)

    # Symmetric neighbour lookup: an edge in either direction links two nodes.
    linked = (adjacency == 1) | (adjacency.T == 1)

    # Depth-first search from node 0 using an explicit stack.
    visited[0] = True
    pending = [0]
    while pending:
        node = pending.pop()
        unseen_neighbors = np.flatnonzero(linked[node] & ~visited)
        # Mark on push so a node is never queued twice.
        visited[unseen_neighbors] = True
        pending.extend(unseen_neighbors.tolist())

    # If all nodes are visited, the graph is connected
    return np.all(visited)


def validate_partition_using_connectedness(adjacency_matrix, tasks_list):
    """Decide whether one partition (a group of task indices) is valid.

    A partition holding exactly one task is always valid. Any other partition
    is valid when the sub-matrix of ``adjacency_matrix`` restricted to its
    tasks passes ``is_connected``.

    Parameters
    ----------
    adjacency_matrix : numpy.ndarray
        Adjacency matrix of the full task graph.
    tasks_list : list of int
        Row/column indices of the tasks in the partition.

    Returns
    -------
    bool
        True for a singleton, otherwise the result of ``is_connected``.
    """
    if len(tasks_list) != 1:
        # Keep only the rows and columns that belong to this partition.
        induced_subgraph = adjacency_matrix[np.ix_(tasks_list, tasks_list)]
        return is_connected(induced_subgraph)

    # Singletons need no connectivity check.
    return True

In [60]:
def get_partition_boundary(adjacency_matrix, partition):
    """Return the tasks of ``partition`` that have an edge leaving the partition.

    Parameters
    ----------
    adjacency_matrix : numpy.ndarray
        Adjacency matrix; rows are read as edge sources and columns as edge
        targets.
    partition : list of int
        Indices of the tasks in the partition.

    Returns
    -------
    list of int
        Members of ``partition``, in their original order, whose row has a
        non-zero entry in at least one column outside the partition.
    """
    # Outgoing edges of the partition's tasks ...
    outgoing = adjacency_matrix[partition, :]
    # ... restricted to targets that are not part of the partition.
    outgoing_to_outside = np.delete(outgoing, partition, axis=1)

    # One flag per partition task: does it point at any outside task?
    reaches_outside = outgoing_to_outside.any(axis=1)
    return [task for task, flag in zip(partition, reaches_outside) if flag]


def compute_plan_cost(adjacency_matrix, M_dict, A_dict, D_dict, AI_quality, execution_plan, human_tasks):
    """Compute the cost of one execution plan.

    Each partition of the plan is costed as follows:

    * a single task listed in ``human_tasks`` costs its ``M_dict`` value and
      counts as labor cost;
    * any other single task costs ``A * AI_quality ** (-D)`` using its own
      ``A_dict`` and ``D_dict`` values, and counts as management cost;
    * a multi-task partition costs ``A * AI_quality ** (-D)`` where ``A`` sums
      ``A_dict`` over the partition's boundary tasks (see
      ``get_partition_boundary``) and ``D`` sums ``D_dict`` over all of its
      tasks; it counts as management cost.

    Parameters
    ----------
    adjacency_matrix : numpy.ndarray
        Adjacency matrix of the task graph (only used for multi-task partitions).
    M_dict, A_dict, D_dict : dict
        Per-task values keyed by task index; the caller in this file fills
        them with human cost, management cost and difficulty respectively.
    AI_quality : float
        Base of the exponential cost factor.
    execution_plan : list of list of int
        Partitions making up the plan.
    human_tasks : collection of int
        Single-task partitions that are done by a human.

    Returns
    -------
    tuple
        ``(total_cost, labor_cost, management_cost)``.
    """
    total_cost = 0
    labor_cost = 0
    management_cost = 0

    for partition in execution_plan:
        is_singleton = len(partition) == 1

        if is_singleton and partition[0] in human_tasks:
            # Task done by hand: pay the human cost.
            partition_cost = sum(M_dict[task] for task in partition)
            labor_cost += partition_cost
        else:
            # Task(s) handed to the AI. A lone task is charged for itself;
            # a chain is charged only for its boundary tasks.
            if is_singleton:
                charged_tasks = partition
            else:
                charged_tasks = get_partition_boundary(adjacency_matrix, partition)

            AI_cost = sum(A_dict[task] for task in charged_tasks)
            difficulty = sum(D_dict[task] for task in partition)
            partition_cost = AI_cost * (AI_quality ** (-1 * difficulty))
            management_cost += partition_cost

        total_cost += partition_cost

    return total_cost, labor_cost, management_cost

### Combine the steps above into a single function that can be called in a loop

In [61]:
def DAG_indiffCurve(input_path, output_path, unique_partitions, alpha_list, tasks_stats_path=None):
    """Cost every feasible execution plan of one occupation DAG and save the result.

    Steps:

    1. Read the DAG edge list and drop edges whose ``comment`` ends with
       ``TriangleRemovedFlag`` (kept in the file for plotting only).
    2. Read the per-task statistics and build the adjacency matrix.
    3. Keep the partition schemes whose partitions all pass
       ``validate_partition_using_connectedness``.
    4. For every valid scheme, enumerate every subset of its single-task
       partitions as the set of tasks done by a human.
    5. For every value in ``alpha_list`` compute the cost of every plan with
       ``compute_plan_cost`` and flag the plan(s) with the lowest total cost.
    6. Write all rows to ``output_path`` as CSV.

    Parameters
    ----------
    input_path : str
        CSV edge list with ``source`` and ``target`` columns and an optional
        ``comment`` column.
    output_path : str
        Destination CSV. Columns: ``alpha``, ``execution_plan``,
        ``human_tasks``, ``total_cost``, ``labor_cost``, ``management_cost``,
        ``min_total_cost_flag``.
    unique_partitions : list
        Candidate partition schemes over task indices ``0..n_tasks-1``.
    alpha_list : iterable of float
        AI-quality values to evaluate.
    tasks_stats_path : str, optional
        CSV with ``task``, ``human_cost``, ``management_cost`` and
        ``difficulty`` columns. When omitted, the path is built from the
        notebook globals ``occupation_folder`` and ``occupation`` (the original
        behaviour).

    Returns
    -------
    None
        The result is written to ``output_path``; a summary line is printed.
    """
    # Local import: the notebook only guarantees `combinations` via an earlier cell.
    from itertools import combinations

    # read DAG
    dag_df = pd.read_csv(input_path)

    # remove edges if comment column labeled with "TriangleRemovedFlag"
    # (edge is there for plotting purposes and is not part of the actual DAG).
    # astype(str) + na=False keep the mask strictly boolean even when some (or
    # all) comments are missing, which would otherwise break the `~` negation.
    if 'comment' in dag_df.columns:
        plotting_only = dag_df['comment'].astype(str).str.endswith('TriangleRemovedFlag', na=False)
        dag_df = dag_df[~plotting_only]

    # get task stats
    if tasks_stats_path is None:
        # Falls back to notebook-level globals set by the driver loop.
        tasks_stats_path = f'{occupation_folder}/{occupation}_taskStats.csv'
    tasks_stats = pd.read_csv(tasks_stats_path)

    # extract list of tasks and map each task to its matrix index
    tasks_list = tasks_stats['task'].unique()
    task_to_index = {task: position for position, task in enumerate(tasks_list)}

    # create numpy array of adjacency matrix (row = source, column = target)
    adjacency_matrix = np.zeros((len(tasks_list), len(tasks_list)), dtype=int)
    for source, target in zip(dag_df['source'], dag_df['target']):
        adjacency_matrix[task_to_index[source], task_to_index[target]] = 1

    # NOTE: a per-alpha break-even difficulty,
    # log(management_cost / human_cost) / log(alpha), was sketched at this
    # point in an earlier version but is not computed.

    # attach the matrix index to every task row and order rows by it
    tasks_stats['dict_index'] = [task_to_index[task] for task in tasks_stats['task']]
    tasks_stats = tasks_stats.sort_values(by='dict_index')

    # create dictionaries for human cost, management cost, and difficulty
    M_dict = dict(zip(tasks_stats['dict_index'], tasks_stats['human_cost']))
    A_dict = dict(zip(tasks_stats['dict_index'], tasks_stats['management_cost']))
    D_dict = dict(zip(tasks_stats['dict_index'], tasks_stats['difficulty']))

    # A scheme is valid only when every one of its partitions is valid.
    valid_partitions = [
        scheme for scheme in unique_partitions
        if all(validate_partition_using_connectedness(adjacency_matrix, partition)
               for partition in scheme)
    ]

    # Print stats
    print(f'Number of valid partitioning schemes given DAG structure: {len(valid_partitions)}')

    # For each valid scheme, every subset of its singleton partitions is one
    # way of assigning singleton tasks to a human (the rest go to the AI).
    # Subsets are generated smallest first, starting with the empty subset.
    execution_plans = []
    for scheme in valid_partitions:
        singleton_tasks = [partition[0] for partition in scheme if len(partition) == 1]
        for subset_size in range(len(singleton_tasks) + 1):
            for human_subset in combinations(singleton_tasks, subset_size):
                execution_plans.append((scheme, list(human_subset)))

    # calculate plan costs for each alpha; rows are collected in plain lists
    # and turned into a DataFrame once per alpha instead of concatenating
    # one-row frames (which is quadratic in the number of plans).
    result_columns = ['alpha', 'execution_plan', 'human_tasks',
                      'total_cost', 'labor_cost', 'management_cost']
    per_alpha_frames = []
    for alpha in alpha_list:
        records = []
        for execution_plan, human_tasks in execution_plans:
            total_cost, labor_cost, management_cost = compute_plan_cost(
                adjacency_matrix, M_dict, A_dict, D_dict, alpha, execution_plan, human_tasks)
            records.append({'alpha': alpha,
                            'execution_plan': execution_plan,
                            'human_tasks': human_tasks,
                            'total_cost': total_cost,
                            'labor_cost': labor_cost,
                            'management_cost': management_cost})

        alpha_costs_df = pd.DataFrame(records, columns=result_columns)

        # find optimal execution plan(s) for this alpha (ties are all flagged)
        alpha_costs_df['min_total_cost_flag'] = (
            alpha_costs_df['total_cost'] == alpha_costs_df['total_cost'].min())
        per_alpha_frames.append(alpha_costs_df)

    if per_alpha_frames:
        execution_plan_costs_df = pd.concat(per_alpha_frames, ignore_index=True)
    else:
        # No alpha values: still write a well-formed (header-only) CSV.
        execution_plan_costs_df = pd.DataFrame(columns=result_columns + ['min_total_cost_flag'])
    execution_plan_costs_df.to_csv(output_path, index=False)

## Main Code

In [62]:
import time
start_time = time.time()

# set alpha as AI quality metric
# The two end points are epsilon and 1 - epsilon rather than exactly 0 and 1
# (presumably to keep alpha ** (-difficulty) finite and non-degenerate -- confirm).
epsilon = 1e-8
alpha_list = [epsilon, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1-epsilon]

onet_data_path = f'{data_path}/data/onet_occupations_yearly.csv'

# Candidate occupation sets. Previously each of these was assigned to
# `occupation_list` in turn, so only the last assignment had any effect; they
# are now named and the active one is selected explicitly below.

# Full set of occupations
occupations_all = ['pileDriverOperators', 'dredgeOperators', 'gradersAndSortersForAgriculturalProducts',
                   'insuranceUnderwriters', 'insuranceAppraisersForAutoDamage', 'floorSandersAndFinishers', 
                   'reinforcingIronAndRebarWorkers', 'travelAgents', 'dataEntryKeyer', 
                   'athletesAndSportsCompetitors', 'audiovisualEquipmentInstallerAndRepairers', 'hearingAidSpecialists', 
                   'personalCareAides', 'proofreadersAndCopyMarkers', 'chiropractors', 
                   'shippingReceivingAndInventoryClerks', 'cooksShortOrder', 'orthodontists',
                   'subwayAndStreetcarOperators', 'packersAndPackagersHand', 'hoistAndWinchOperators', 
                   'forgingMachineSettersOperatorsAndTenders', 'avionicsTechnicians', 'dishwashers', 
                   'dispatchersExceptPoliceFireAndAmbulance', 'familyMedicinePhysicians', 'MachineFeedersAndOffbearers'
                   ]

# The three occupations for which the driver loop also runs a Manual DAG
occupations_with_manual_dag = ['travelAgents', 'insuranceUnderwriters', 'pileDriverOperators'
                               ]

# First ten entries of the full set
occupations_first_ten = ['pileDriverOperators', 'dredgeOperators', 'gradersAndSortersForAgriculturalProducts',
                         'insuranceUnderwriters', 'insuranceAppraisersForAutoDamage', 'floorSandersAndFinishers', 
                         'reinforcingIronAndRebarWorkers', 'travelAgents', 'dataEntryKeyer', 
                         'athletesAndSportsCompetitors'
                         ]

# Active selection (copied so edits to occupation_list leave the candidates intact)
occupation_list = list(occupations_first_ten)

# occupation_list = ['travelAgents']

In [63]:
# Driver: for every occupation, enumerate the partition schemes of its tasks and
# run DAG_indiffCurve once per available DAG variant, printing timing along the way.
# Note: DAG_indiffCurve reads the loop variables `occupation` and
# `occupation_folder` as globals, so they must keep these exact names.
num_tasks_current = 0
num_tasks_previous = 0
for occupation in occupation_list:
    print(f'\n---------------------- Running: {occupation} ----------------------')
    occupation_start_time = time.time()

    # generate occupation-specific strings
    # (pick_occupation is not defined in this notebook section; presumably it
    # comes from functions.py -- verify)
    GPT_input_occupation, plot_title_occupation, occupation_code, occupation_folder = pick_occupation(occupation)


    # Get occupation tasks to create all possible partitions
    tasks = get_tasks(onet_data_path, occupation_code)
    num_tasks_current = len(tasks)
    print(f'Number of non-target tasks: {num_tasks_current}')

    # NOTE(review): `n` is set here but not read anywhere in the visible code
    # (only in commented-out code inside DAG_indiffCurve); kept in case a later
    # cell uses it.
    if num_tasks_current < 10:
        n = 1000
    else: 
        n = 100

    # The partition schemes depend only on the number of tasks, so they are
    # regenerated only when that number differs from the previous occupation's;
    # otherwise the previous `unique_partitions` is reused.
    if num_tasks_current != num_tasks_previous:
        unique_partitions_start_time = time.time()

        # Generate list of numbers for non-"Target" tasks in occupation
        tasks_list_numbers = list(range(num_tasks_current))

        # Generate all possible partitioning schemes
        unique_partitions = generate_unique_partitions(tasks_list_numbers)
        unique_partitions_end_time = time.time()

        unique_partitions_execution_time = unique_partitions_end_time - unique_partitions_start_time
        print(f'Time to generate all possible partition schemes: {unique_partitions_execution_time:.2f} seconds')
    
    # update num_tasks_previous for next iteration and print stats
    num_tasks_previous = num_tasks_current
    print(f'Number of all possible partitioning schemes: {len(unique_partitions)}')


    # Input (edge list) and output (cost table) paths, one pair per DAG variant.

    # Manual DAG
    M_input_path = f'{occupation_folder}/{occupation}_M_DAG_df.csv'
    M_output_path = f'{occupation_folder}/indiffCurves/{occupation}_indiffCurves_M.csv'

    # Naive DAG
    N_input_path = f'{occupation_folder}/{occupation}_N_GPT_DAG_df.csv'
    N_output_path = f'{occupation_folder}/indiffCurves/{occupation}_indiffCurves_N.csv'

    # Conditioned Naive DAG
    CN_input_path = f'{occupation_folder}/{occupation}_CN_GPT_DAG_df.csv'
    CN_output_path = f'{occupation_folder}/indiffCurves/{occupation}_indiffCurves_CN.csv'

    # First Last Task DAG
    FLT_input_path = f'{occupation_folder}/{occupation}_FLT_GPT_DAG_df.csv'
    FLT_output_path = f'{occupation_folder}/indiffCurves/{occupation}_indiffCurves_FLT.csv'

    # Conditioned First Last Task DAG
    CFLT_input_path = f'{occupation_folder}/{occupation}_CFLT_GPT_DAG_df.csv'
    CFLT_output_path = f'{occupation_folder}/indiffCurves/{occupation}_indiffCurves_CFLT.csv'

    # Partitioned DAG
    P_input_path = f'{occupation_folder}/{occupation}_P_GPT_DAG_df.csv'
    P_output_path = f'{occupation_folder}/indiffCurves/{occupation}_indiffCurves_P.csv'

    # Conditioned Partitioned DAG
    CP_input_path = f'{occupation_folder}/{occupation}_CP_GPT_DAG_df.csv'
    CP_output_path = f'{occupation_folder}/indiffCurves/{occupation}_indiffCurves_CP.csv'
    


    # create list of all DAGs
    # Only the three occupations below also get the Manual DAG; every other
    # occupation runs the six GPT-generated variants only.
    if occupation in ['travelAgents', 'insuranceUnderwriters', 'pileDriverOperators']:
        DAG_indicator_list = ['Manual DAG', 'Naive DAG', 'Conditioned Naive DAG', 'First-Last Task DAG', 'Conditioned First-Last Task DAG', 'Partitioned DAG', 'Conditioned Partitioned DAG']
        input_paths_list = [M_input_path, N_input_path, CN_input_path, FLT_input_path, CFLT_input_path, P_input_path, CP_input_path]
        output_paths_list = [M_output_path, N_output_path, CN_output_path, FLT_output_path, CFLT_output_path, P_output_path, CP_output_path]
    else:
        DAG_indicator_list = ['Naive DAG', 'Conditioned Naive DAG', 'First-Last Task DAG', 'Conditioned First-Last Task DAG', 'Partitioned DAG', 'Conditioned Partitioned DAG']
        input_paths_list = [N_input_path, CN_input_path, FLT_input_path, CFLT_input_path, P_input_path, CP_input_path]
        output_paths_list = [N_output_path, CN_output_path, FLT_output_path, CFLT_output_path, P_output_path, CP_output_path]


    # Run the cost computation for each DAG variant and report its runtime.
    for DAG_indicator, input_path, output_path in zip(DAG_indicator_list, input_paths_list, output_paths_list):
        print(f'\n-------Running: {occupation} - {DAG_indicator}-------')
        
        DAG_start_time = time.time()
        DAG_indiffCurve(input_path, output_path, unique_partitions, alpha_list)
        DAG_end_time = time.time()

        DAG_execution_time = DAG_end_time - DAG_start_time
        print(f"\n{occupation} {DAG_indicator} runtime: {DAG_execution_time:.2f} seconds")

    # Per-occupation and cumulative runtimes, in minutes.
    occupation_end_time = time.time()
    occupation_execution_time = (occupation_end_time - occupation_start_time)/60
    print(f"\n\n************* {occupation} runtime: {occupation_execution_time:.2f} minutes *************")
    runtime_since_start = (time.time() - start_time)/60
    print(f"\nruntime since start: {runtime_since_start:.2f} minutes\n")


# Total wall-clock time since `start_time` was set in the configuration cell.
end_time = time.time()
execution_time = (end_time - start_time)/60
print(f"\n\nTotal Runtime: {execution_time:.2f} minutes")


---------------------- Running: pileDriverOperators ----------------------
Number of non-target tasks: 5
Time to generate all possible partition schemes: 0.00 seconds
Number of all possible partitioning schemes: 52

-------Running: pileDriverOperators - Manual DAG-------
Number of valid partitioning schemes given DAG structure: 26

pileDriverOperators Manual DAG runtime: 0.37 seconds

-------Running: pileDriverOperators - Naive DAG-------
Number of valid partitioning schemes given DAG structure: 30

pileDriverOperators Naive DAG runtime: 0.35 seconds

-------Running: pileDriverOperators - Conditioned Naive DAG-------
Number of valid partitioning schemes given DAG structure: 26

pileDriverOperators Conditioned Naive DAG runtime: 0.31 seconds

-------Running: pileDriverOperators - First-Last Task DAG-------
Number of valid partitioning schemes given DAG structure: 30

pileDriverOperators First-Last Task DAG runtime: 0.34 seconds

-------Running: pileDriverOperators - Conditioned First-L