In [1]:
# Execute the two helper scripts, in order, inside this notebook's namespace.
# Everything they define at top level becomes a global of the notebook.
# NOTE(review): later cells use names such as `pd`, `np`, `getpass` and
# `itertools` without importing them — presumably they come from
# 'libraries.py'; confirm.
for _script_path in ('libraries.py', 'functions.py'):
    with open(_script_path) as f:
        code = f.read()
    exec(code)

In [2]:
import subprocess

# Run caffeinate in the background to prevent sleep.
# NOTE(review): the Popen handle is not kept, so nothing in this cell ever
# terminates the child process, and every re-run of the cell starts another one.
# NOTE(review): `caffeinate` is presumably the macOS utility — if the executable
# is not on PATH, Popen raises FileNotFoundError; confirm the target platform.
subprocess.Popen(['caffeinate'])

<Popen: returncode: None args: ['caffeinate']>

In [3]:
# Determine the current OS user and pick that user's project folders.
# NOTE(review): only one user name is handled here; for any other user this
# cell leaves `main_folder_path` and `data_path` undefined.
user = getpass.getuser()
if user == 'peymanshahidi':
    # Absolute, machine-specific location of the project folder.
    main_folder_path = '/Users/peymanshahidi/Dropbox (MIT)/Research/AI and Occupations/ai-exposure'
    # Output folder inside the project folder.
    data_path = f'{main_folder_path}/output'

In [4]:
def get_tasks(onet_data_path,
              occupation_code,
              year=2023):
    """Return the unique task descriptions of one occupation for one year.

    The CSV at `onet_data_path` must contain the columns 'year', 'occ_code',
    'occ_title', 'task_id' and 'task'.  Rows are ordered by those first four
    columns before filtering, so the returned tasks follow ascending
    'task_id' within the occupation.

    Args:
        onet_data_path: path (or buffer) accepted by `pd.read_csv`.
        occupation_code: occupation identifier; it is converted to text
            before being compared with the 'occ_code' column.
        year: value of the 'year' column to keep (defaults to 2023, the
            value that used to be hard-coded).

    Returns:
        list: unique values of the 'task' column, in first-appearance order.
    """
    # Load and order the data so that task order is deterministic.
    onet = pd.read_csv(onet_data_path)
    onet = onet.sort_values(by=['year', 'occ_code', 'occ_title', 'task_id'])

    # A single combined mask replaces the former duplicated year filter.
    wanted_rows = (onet['year'] == year) & (onet['occ_code'] == f'{occupation_code}')
    return onet.loc[wanted_rows, 'task'].unique().tolist()

In [5]:
def dfs_count_paths(adj_matrix, current_node, target_node, path_length, path_lengths, visited):
    """Depth-first enumeration of path lengths from `current_node` to `target_node`.

    Every time the target is reached, the number of edges walked so far
    (`path_length`) is appended to `path_lengths` (mutated in place).
    `visited` marks the nodes on the path currently being explored so that
    no node is used twice on one path; each mark is cleared again on the way
    back up.

    An edge u -> v exists when `adj_matrix[u][v] == 1`.
    """
    # Arrived: record this path and do not extend it any further.
    if current_node == target_node:
        path_lengths.append(path_length)
        return

    visited[current_node] = True
    for candidate in range(len(adj_matrix)):
        if adj_matrix[current_node][candidate] != 1:
            continue
        if visited[candidate]:
            continue
        dfs_count_paths(adj_matrix, candidate, target_node, path_length + 1, path_lengths, visited)
    # Release the node so that other paths may pass through it.
    visited[current_node] = False

def calculate_all_path_lengths_to_target(adj_matrix):
    """Collect the lengths of all paths from every node to the target node.

    The target node is taken to be the last index of `adj_matrix`.  For each
    other node, `dfs_count_paths` enumerates the paths to the target; the
    lengths are gathered into one flat list, ordered by start node.
    """
    target_node = len(adj_matrix) - 1  # Index of the Target node
    node_count = target_node + 1

    collected_lengths = []
    # Every index below the target is a start node.
    for start_node in range(target_node):
        dfs_count_paths(adj_matrix, start_node, target_node, 0,
                        collected_lengths, [False] * node_count)

    return collected_lengths

In [6]:
def sparsity_calculator(adjacency_matrix):
    """Return the sparsity of a directed graph: 1 - edges / (n * (n - 1)).

    `n` is the number of rows of `adjacency_matrix` and the edge count is
    the sum of all its entries.
    """
    node_count = adjacency_matrix.shape[0]
    edge_count = np.sum(adjacency_matrix)

    # A directed graph without self-loops has at most n * (n - 1) edges.
    edge_capacity = node_count * (node_count - 1)

    density = edge_count / edge_capacity
    return 1 - density

In [7]:
def find_neighbors(adjacency_matrix):
    """Map every node to the nodes it shares an edge with, ignoring direction.

    Node `j` is a neighbour of node `i` when `adjacency_matrix[i, j] == 1`
    or `adjacency_matrix[j, i] == 1`.  Each neighbour list is in ascending
    node order and contains no duplicates.

    Returns:
        dict: {node index: list of neighbour indices} for all n nodes.
    """
    node_count = adjacency_matrix.shape[0]
    node_ids = range(node_count)

    def linked(a, b):
        # An edge in either direction makes the two nodes neighbours.
        return adjacency_matrix[a, b] == 1 or adjacency_matrix[b, a] == 1

    return {
        node: [other for other in node_ids if linked(node, other)]
        for node in node_ids
    }

In [8]:
def create_inactive_node_neighbor_subset_combinations(inactive_neighbors_valid_subsets_dict):
    """Build every distinct union of the subsets stored in the dictionary.

    Each dictionary value is an iterable of subsets (iterables of nodes).
    All subsets are pooled, every non-empty combination of pooled subsets is
    merged into one sorted tuple of unique nodes, and duplicate unions are
    dropped.

    Note: the number of combinations is 2**k - 1 for k pooled subsets, so
    this is only practical for small inputs.

    Args:
        inactive_neighbors_valid_subsets_dict: mapping whose values are
            lists of node lists.

    Returns:
        list[list]: the distinct unions, ordered by length and, among equal
        lengths, lexicographically (so the result no longer depends on set
        iteration order).
    """
    # Step 1: pool the subsets, removing duplicates inside each one
    all_lists = [list(set(item)) for sublist in inactive_neighbors_valid_subsets_dict.values() for item in sublist]
    print(f'Number of lists extracted: {len(all_lists)}')

    # Step 2: union every combination; the set keeps each union only once
    unique_unions = set()
    for r in range(1, len(all_lists) + 1):
        for combo in itertools.combinations(all_lists, r):
            unique_unions.add(tuple(sorted(set(itertools.chain(*combo)))))

    # Step 3: deterministic order — shortest first, ties broken by content
    ordered_unions = sorted(unique_unions, key=lambda union: (len(union), union))
    return [list(union) for union in ordered_unions]

In [9]:
def is_connected(matrix):
    """Check whether the graph is connected when edge direction is ignored.

    Starting from node 0, every node reachable through an entry equal to 1
    in either `matrix[a, b]` or `matrix[b, a]` is visited.  The traversal is
    iterative (explicit stack), so long chains do not hit Python's recursion
    limit, and the neighbours of a node are found with one vectorised
    comparison instead of a Python loop over all nodes.

    Args:
        matrix: square 2-D adjacency array with at least one node
            (an empty matrix raises IndexError, as node 0 does not exist).

    Returns:
        numpy bool: True when every node was reached from node 0.
    """
    adjacency = np.asarray(matrix)
    num_nodes = adjacency.shape[0]

    # Visited array to keep track of visited nodes
    visited = np.zeros(num_nodes, dtype=bool)

    # Start the traversal from the first node (node 0)
    visited[0] = True
    pending = [0]
    while pending:
        node = pending.pop()
        # Outgoing edges (row) or incoming edges (column) both count as links.
        linked = (adjacency[node, :] == 1) | (adjacency[:, node] == 1)
        newly_reached = np.flatnonzero(linked & ~visited)
        # Mark on discovery so that no node is pushed twice.
        visited[newly_reached] = True
        pending.extend(newly_reached.tolist())

    # If all nodes are visited, the matrix is connected
    return np.all(visited)


def validate_partition_using_connectedness(adjacency_matrix, tasks_list):
    """Tell whether the tasks in `tasks_list` form a valid partition.

    A single-task partition is always valid.  Otherwise the rows and columns
    of `adjacency_matrix` belonging to the tasks are extracted and the
    partition is valid exactly when `is_connected` accepts that sub-matrix.
    """
    # Singletons need no connectivity check.
    if len(tasks_list) == 1:
        return True

    # Restrict the graph to the partition's own nodes and test connectivity.
    induced_matrix = adjacency_matrix[np.ix_(tasks_list, tasks_list)]
    return is_connected(induced_matrix)

In [10]:
def get_partition_boundary(adjacency_matrix, partition):
    """Return the partition nodes that have an edge to a node outside the partition.

    The adjacency matrix is reduced to the partition's rows (outgoing edges
    of partition nodes) and to the columns of all non-partition nodes.  A
    partition node belongs to the boundary when its reduced row contains any
    non-zero entry.  The result keeps the order of `partition`.
    """
    # Drop the partition's own columns, then keep only the partition's rows.
    outside_columns_only = np.delete(adjacency_matrix, partition, axis=1)
    outgoing_rows = outside_columns_only[partition, :]

    boundary_nodes = []
    for node, row in zip(partition, outgoing_rows):
        if np.any(row):
            boundary_nodes.append(node)
    return boundary_nodes


def compute_plan_cost(adjacency_matrix,
                      execution_plan, 
                      human_labor_dict,
                      machine_labor_dict, machine_management_dict, 
                      management_difficulty_dict, completion_difficulty_dict,
                      AI_quality = 1e-8,
                      human_labor_wage = 100,
                      machine_management_wage = 100000,
                      machine_automation_wage = 1000):
    """Return the total cost of an execution plan (a list of partitions).

    Single-task partition: the task is done by a human when its human labor
    amount is strictly below its difficulty-adjusted management amount,
    otherwise it is managed.  The chosen amount is multiplied by the
    matching wage.

    Multi-task partition: tasks returned by `get_partition_boundary` are
    managed and the remaining tasks are automated; both amounts are
    difficulty-adjusted and multiplied by their wages.

    Difficulty adjustment multiplies an amount by
    `AI_quality ** (-difficulty)`, where the difficulty is summed over the
    tasks involved.

    NOTE(review): the human-vs-managed decision compares raw amounts, before
    wages are applied — confirm this is the intended rule.
    """
    def summed(lookup, tasks):
        # Total of the per-task values for the given tasks.
        return sum(lookup[key] for key in tasks)

    def difficulty_adjusted(amount, difficulty):
        # Scale an amount by AI quality raised to minus the difficulty.
        return amount * (AI_quality ** (-1 * difficulty))

    total_cost = 0
    for partition in execution_plan:
        if len(partition) != 1:
            # Boundary tasks are managed, interior tasks are automated.
            managed_tasks = get_partition_boundary(adjacency_matrix, partition)
            automated_tasks = [task for task in partition if task not in managed_tasks]

            management_cost = difficulty_adjusted(
                summed(machine_management_dict, managed_tasks),
                summed(management_difficulty_dict, managed_tasks))
            total_cost += management_cost * machine_management_wage

            machine_cost = difficulty_adjusted(
                summed(machine_labor_dict, automated_tasks),
                summed(completion_difficulty_dict, automated_tasks))
            total_cost += machine_cost * machine_automation_wage
            continue

        # Singleton: choose between human labor and machine management.
        human_labor_cost = summed(human_labor_dict, partition)
        management_cost = difficulty_adjusted(
            summed(machine_management_dict, partition),
            summed(management_difficulty_dict, partition))

        # Two independent checks: if neither comparison holds, nothing is added.
        if human_labor_cost < management_cost:
            total_cost += human_labor_cost * human_labor_wage
        if human_labor_cost >= management_cost:
            total_cost += management_cost * machine_management_wage

    return total_cost

In [11]:
def plan_to_active_dict(execution_plan, n):
    """Flag which of the nodes 0..n-1 appear in an execution plan.

    Every item of every partition is converted to text and reduced to its
    digit characters; the resulting number, when it is one of the keys
    0..n-1, is marked True.  Items without digits and numbers outside the
    range are ignored.

    Returns:
        dict: {node index: bool} with exactly n entries.
    """
    active_flags = dict.fromkeys(range(n), False)

    every_item = (entry for partition in execution_plan for entry in partition)
    for entry in every_item:
        # Keep only the digit characters of the item's text form.
        digits = ''.join(ch for ch in str(entry) if ch.isdigit())
        if not digits.isdigit():
            continue
        node = int(digits)
        if node in active_flags:
            active_flags[node] = True
    return active_flags

In [12]:
def my_execution_plan_sorter(execution_plan):
    """Normalise an execution plan, print it and return it.

    1. each partition is sorted in ascending order;
    2. the partitions are then ordered by their length (shortest first,
       equal lengths keep their original relative order).
    """
    normalised_plan = [sorted(partition) for partition in execution_plan]
    normalised_plan.sort(key=len)
    print(normalised_plan)
    return normalised_plan

In [13]:
def update_min_cost_vars(adjacency_matrix,
                         current_min_cost_plan, current_min_cost,
                         current_plan,
                         human_labor_dict, machine_labor_dict, machine_management_dict, 
                         management_difficulty_dict, completion_difficulty_dict,
                         AI_quality):
    """Keep whichever of the best-known plan and `current_plan` is cheaper.

    The cost of `current_plan` is obtained from `compute_plan_cost`.  The
    best-known plan is replaced only when the new cost is strictly lower,
    so ties keep the plan that was found first.

    Returns:
        tuple: (minimum-cost plan, minimum cost) after the comparison.
    """
    candidate_cost = compute_plan_cost(adjacency_matrix, current_plan,
                                       human_labor_dict, machine_labor_dict, machine_management_dict,
                                       management_difficulty_dict, completion_difficulty_dict,
                                       AI_quality)

    # Open question kept from the original author:
    # if costs are the same break tie in favor of more automation?
    if candidate_cost < current_min_cost:
        return current_plan, candidate_cost

    return current_min_cost_plan, current_min_cost

In [14]:
import copy
def get_min_cost_plan(adjacency_matrix, 
                      human_labor_dict, machine_labor_dict, machine_management_dict, 
                      management_difficulty_dict, completion_difficulty_dict,
                      AI_quality = 1e-8):
    """Search for the minimum-cost execution plan of a task DAG.

    The last row and column of `adjacency_matrix` are treated as the Target
    node and are excluded when neighbours and the node count are computed.
    The search starts at node 0 with the plan [[0]] and repeatedly extends
    the plan with inactive neighbours of the nodes already in it, either as
    a new singleton partition or by joining the last partition.  Plans are
    scored with `compute_plan_cost`.

    Args:
        adjacency_matrix: square 2-D array including the Target node as its
            last row/column; the full matrix is what gets passed to
            `compute_plan_cost` and to the connectedness check.
        human_labor_dict, machine_labor_dict, machine_management_dict,
        management_difficulty_dict, completion_difficulty_dict: per-node
            values forwarded unchanged to `compute_plan_cost`.
        AI_quality: forwarded to `compute_plan_cost`.

    Returns:
        tuple: (min_cost_plan, min_cost, memory_dict).  `memory_dict` maps
        (tuple of active nodes excluding the current node, tuple(current
        node)) to the list of plan continuations recorded for that state.
        The plan starts as [] and the cost as infinity.
    """

    def compute_min_cost_recursive(adjacency_matrix, 
                                   neighbors_dict, active_dict, memory_dict, 
                                   current_node, current_plan,
                                   current_min_cost_plan, current_min_cost,
                                   AI_quality):
        """Extend `current_plan` through its inactive neighbours, recursively.

        `current_node` is a one-element list holding the node added last.
        Returns (current_min_cost_plan, current_min_cost, list of execution
        plans produced from this state).  `memory_dict` is mutated in place.
        The cost dictionaries are read from the enclosing function's scope.
        """
        
        ############################################################################################################################
        def extended_plan_cost_check_and_append(adjacency_matrix, 
                                                human_labor_dict, machine_labor_dict, machine_management_dict, 
                                                management_difficulty_dict, completion_difficulty_dict,
                                                current_plan, extended_plan,
                                                current_node, active_nodes_list_excluding_current_node, active_nodes_tuple,
                                                neighbors_dict, active_dict, memory_dict,
                                                neighbor,
                                                current_min_cost_plan, current_min_cost,
                                                execution_plans_list,
                                                AI_quality):
            """Recurse into `extended_plan` if the cost check passes, and record results.

            Complete plans (those covering all `n` nodes) returned by the
            recursion are appended to `execution_plans_list`, and their
            continuation from `current_node` onwards is stored in
            `memory_dict`.  Returns the updated
            (current_min_cost_plan, current_min_cost, execution_plans_list).
            """
            # check if need to pursue this plan
            # NOTE(review): despite the variable name, the cost below is computed for
            # `current_plan`, not for `extended_plan` — confirm this is intended.
            extended_plan_cost = compute_plan_cost(adjacency_matrix, current_plan, 
                                                    human_labor_dict, machine_labor_dict, machine_management_dict, 
                                                    management_difficulty_dict, completion_difficulty_dict,
                                                    AI_quality)
            if extended_plan_cost < current_min_cost:
                # get active_dict for extended_plan
                # (`n` is read from the enclosing get_min_cost_plan scope; the
                # assignment only rebinds this function's local `active_dict`)
                active_dict = plan_to_active_dict(extended_plan, n)

                # calculate min cost and min cost plan for extended plan
                current_min_cost_plan , current_min_cost, execution_plan = compute_min_cost_recursive(adjacency_matrix, 
                                                                                                    neighbors_dict, active_dict, memory_dict, 
                                                                                                    [neighbor], extended_plan,
                                                                                                    current_min_cost_plan, current_min_cost,
                                                                                                    AI_quality)
                # parse the returned execution plans to update memory dict
                for plan in execution_plan:
                    # skip plans that do not cover all n nodes
                    exhausted_tasks = [item for sublist in plan for item in sublist]
                    if len(exhausted_tasks) != n:
                        continue

                    # continuation = partitions from the one containing current_node onwards,
                    # with the already-active nodes removed from its first partition
                    # (the slice is a new list, so `plan` itself is not modified)
                    current_node_index = next(i for i, sublist in enumerate(plan) if current_node[0] in sublist)
                    extension = plan[current_node_index:]
                    extension[0] = [item for item in extension[0] if item not in active_nodes_list_excluding_current_node]

                    # update memory dict
                    memory_dict[(active_nodes_tuple, tuple(current_node))].append(extension)

                    # append plan to execution_plans_list
                    execution_plans_list.append(plan)

            return current_min_cost_plan, current_min_cost, execution_plans_list
        ############################################################################################################################



        # "obvious" stopping rules: 
        # 1) if current_min_cost_plan says automate all tasks then cannot do better than this
        #    (i.e. the best plan so far consists of a single partition)
        if len(current_min_cost_plan) == 1:
            return current_min_cost_plan, current_min_cost, [current_plan]
        
        # 2) if current_plan has higher cost than current_min_cost then break (becomes important for neighbors of [0])
        current_plan_cost = compute_plan_cost(adjacency_matrix, current_plan, 
                                              human_labor_dict, machine_labor_dict, machine_management_dict, 
                                              management_difficulty_dict, completion_difficulty_dict,
                                              AI_quality)
        if current_min_cost < current_plan_cost:
            return current_min_cost_plan, current_min_cost, [current_plan]
        
        
        # get active nodes list
        active_nodes_list = [key for key, value in active_dict.items() if value == True]
        active_nodes_list_excluding_current_node = [item for item in active_nodes_list if item not in current_node]
        active_nodes_tuple = tuple(active_nodes_list_excluding_current_node)

        # get inactive neighbors of current_plan nodes
        # (dict.fromkeys removes duplicates while keeping first-seen order)
        current_plan_nodes = [item for sublist in current_plan for item in sublist]
        neighbors_list = list(dict.fromkeys([value for key in current_plan_nodes if key in neighbors_dict for value in neighbors_dict[key]]))

        inactive_neighbors_list = [neighbor for neighbor in neighbors_list if active_dict[neighbor] == False]


        # if continuation of plan already calculated return it from memory
        # NOTE(review): this try block also encloses the recursive calls below, so a
        # KeyError raised anywhere inside it (not only by the memory lookup on the next
        # line) is handled by the `except KeyError` branch as if the state were new.
        # NOTE(review): if the memory entry exists but is empty, no branch of the try
        # body returns and the function falls through, returning None — confirm that
        # this state cannot occur.
        try:
            if len(memory_dict[(active_nodes_tuple, tuple(current_node))]) > 0:

                if len(inactive_neighbors_list) == 0:
                    # if current_node is a last node, return current plan
                    if memory_dict[(active_nodes_tuple, tuple(current_node))][0] == []:
                        current_min_cost_plan, current_min_cost = update_min_cost_vars(adjacency_matrix,
                                                                                       current_min_cost_plan, current_min_cost,
                                                                                       current_plan,
                                                                                       human_labor_dict, machine_labor_dict, machine_management_dict, 
                                                                                       management_difficulty_dict, completion_difficulty_dict,
                                                                                       AI_quality)
                        return current_min_cost_plan, current_min_cost, [current_plan]
                    else:
                        # generate all possible execution plans given extensions of current node
                        execution_plans_list = []
                        for extension in memory_dict[(active_nodes_tuple, tuple(current_node))]:
                            # to generate execution plan:
                            # 1) remove current node from last partition of current plan
                            # 2) extend modified last partition of current plan with first partition of current extension
                            # 3) add remaining partitions of current extension to the modified current plan
                            modified_current_plan_last_partition = copy.deepcopy(current_plan[-1])
                            modified_current_plan_last_partition.remove(current_node[0])
                            extension_first_partition = copy.deepcopy(extension[0])
                            modified_current_plan_last_partition += extension_first_partition

                            # create execution plan
                            execution_plan = current_plan[:-1] + [modified_current_plan_last_partition] + extension[1:]
                            execution_plans_list.append(execution_plan)

                            current_min_cost_plan, current_min_cost = update_min_cost_vars(adjacency_matrix,
                                                                                           current_min_cost_plan, current_min_cost,
                                                                                           execution_plan,
                                                                                           human_labor_dict, machine_labor_dict, machine_management_dict, 
                                                                                           management_difficulty_dict, completion_difficulty_dict,
                                                                                           AI_quality)

                        return current_min_cost_plan, current_min_cost, execution_plans_list
                    
                else:
                    # print(f'execution_plans_list: {execution_plans_list}')
                    # print(f'current_plan: {current_plan}')

                    execution_plans_list = []
                    for neighbor in inactive_neighbors_list:
                        # print(f'inactive neighbor [{neighbor}] of current plan {current_plan} **********************************************')


                        #################################################################################################################################
                        #################################################################################################################################
                        
                        # extend current_plan to include inactive neighbor
                        # v1: add inactive neighbor as singleton partition to current_plan
                        extended_plan = copy.deepcopy(current_plan)
                        extended_plan += [[neighbor]]
                        # extended_plan = [sorted(sublist) for sublist in extended_plan]


            
                        current_min_cost_plan, current_min_cost, execution_plans_list = extended_plan_cost_check_and_append(adjacency_matrix, 
                                                                                                                            human_labor_dict, machine_labor_dict, machine_management_dict, 
                                                                                                                            management_difficulty_dict, completion_difficulty_dict,
                                                                                                                            current_plan, extended_plan,
                                                                                                                            current_node, active_nodes_list_excluding_current_node, active_nodes_tuple,
                                                                                                                            neighbors_dict, active_dict, memory_dict,
                                                                                                                            neighbor,
                                                                                                                            current_min_cost_plan, current_min_cost,
                                                                                                                            execution_plans_list,
                                                                                                                            AI_quality)

                            
                        # v2: extend last partition in current_plan by adding inactive neighbor to it
                        extended_plan = copy.deepcopy(current_plan)
                        extended_plan[-1].append(neighbor)
                        # extended_plan = [sorted(sublist) for sublist in extended_plan]

                        # check validity of extended_plan
                        # (v2) extensions may not form a "valid" partition; if extension not valid skip 
                        if not validate_partition_using_connectedness(adjacency_matrix, extended_plan[-1]):
                            # placeholder no-op: the invalid extension is simply skipped
                            aaa = 1
                        else:
                            current_min_cost_plan, current_min_cost, execution_plans_list = extended_plan_cost_check_and_append(adjacency_matrix, 
                                                                                                                                human_labor_dict, machine_labor_dict, machine_management_dict, 
                                                                                                                                management_difficulty_dict, completion_difficulty_dict,
                                                                                                                                current_plan, extended_plan,
                                                                                                                                current_node, active_nodes_list_excluding_current_node, active_nodes_tuple,
                                                                                                                                neighbors_dict, active_dict, memory_dict,
                                                                                                                                neighbor,
                                                                                                                                current_min_cost_plan, current_min_cost,
                                                                                                                                execution_plans_list,
                                                                                                                                AI_quality)

                    # hacky way of fixing no continuation plan for current node:
                    # store a single empty continuation so the entry is never left empty
                    if len(memory_dict[(active_nodes_tuple, tuple(current_node))]) == 0:
                        memory_dict[(active_nodes_tuple, tuple(current_node))] = [[]]
                    
                    return current_min_cost_plan , current_min_cost, execution_plans_list
                    #################################################################################################################################
                    #################################################################################################################################







            
        # if partition not in memory, get valid subsets of partition
        except KeyError:
            # initialize memory dict key for current node
            memory_dict[(active_nodes_tuple, tuple(current_node))] = []

            if len(inactive_neighbors_list) == 0:
                # populate memory dict with current extension of inactive nodes
                # (last partition of the plan, minus the current node and all active nodes)
                extension = copy.deepcopy(current_plan[-1])
                extension.remove(current_node[0])
                inactive_extension = [item for item in extension if item not in active_nodes_list]
                memory_dict[(active_nodes_tuple, tuple(current_node))].append(inactive_extension)

                # update min cost vars
                current_min_cost_plan, current_min_cost = update_min_cost_vars(adjacency_matrix,
                                                                               current_min_cost_plan, current_min_cost,
                                                                               current_plan,
                                                                               human_labor_dict, machine_labor_dict, machine_management_dict, 
                                                                               management_difficulty_dict, completion_difficulty_dict,
                                                                               AI_quality)
                
                return current_min_cost_plan, current_min_cost, [current_plan]
            
            
            execution_plans_list = []
            for neighbor in inactive_neighbors_list:

                # extend current_plan to include inactive neighbor
                # v1: add inactive neighbor as singleton partition to current_plan
                extended_plan = copy.deepcopy(current_plan)
                extended_plan += [[neighbor]]
                # extended_plan = [sorted(sublist) for sublist in extended_plan]


    
                current_min_cost_plan, current_min_cost, execution_plans_list = extended_plan_cost_check_and_append(adjacency_matrix, 
                                                                                                                    human_labor_dict, machine_labor_dict, machine_management_dict, 
                                                                                                                    management_difficulty_dict, completion_difficulty_dict,
                                                                                                                    current_plan, extended_plan,
                                                                                                                    current_node, active_nodes_list_excluding_current_node, active_nodes_tuple,
                                                                                                                    neighbors_dict, active_dict, memory_dict,
                                                                                                                    neighbor,
                                                                                                                    current_min_cost_plan, current_min_cost,
                                                                                                                    execution_plans_list,
                                                                                                                    AI_quality)

                    
                # v2: extend last partition in current_plan by adding inactive neighbor to it
                extended_plan = copy.deepcopy(current_plan)
                extended_plan[-1].append(neighbor)
                # extended_plan = [sorted(sublist) for sublist in extended_plan]

                # check validity of extended_plan
                # (v2) extensions may not form a "valid" partition; if extension not valid skip 
                if not validate_partition_using_connectedness(adjacency_matrix, extended_plan[-1]):
                    # placeholder no-op: the invalid extension is simply skipped
                    aaa = 1
                else:
                    current_min_cost_plan, current_min_cost, execution_plans_list = extended_plan_cost_check_and_append(adjacency_matrix, 
                                                                                                                        human_labor_dict, machine_labor_dict, machine_management_dict, 
                                                                                                                        management_difficulty_dict, completion_difficulty_dict,
                                                                                                                        current_plan, extended_plan,
                                                                                                                        current_node, active_nodes_list_excluding_current_node, active_nodes_tuple,
                                                                                                                        neighbors_dict, active_dict, memory_dict,
                                                                                                                        neighbor,
                                                                                                                        current_min_cost_plan, current_min_cost,
                                                                                                                        execution_plans_list,
                                                                                                                        AI_quality)

            # hacky way of fixing no continuation plan for current node:
            # store a single empty continuation so the entry is never left empty
            if len(memory_dict[(active_nodes_tuple, tuple(current_node))]) == 0:
                memory_dict[(active_nodes_tuple, tuple(current_node))] = [[]]

            return current_min_cost_plan , current_min_cost, execution_plans_list
    
    # subset adjacency matrix to exclude Target node
    non_target_adjacency_matrix = adjacency_matrix[:-1,:-1].copy()
    
    # get neighbors of nodes
    neighbors_dict = find_neighbors(non_target_adjacency_matrix)
    
    # get number of non-Target nodes
    n = non_target_adjacency_matrix.shape[0]

    # create active dictionary (only node 0 is active at the start)
    active_dict = plan_to_active_dict([[0]], n)
    
    # initialize dict for valid subsets of nodes (and also partitions) to act as memory
    memory_dict = {}

    # initialize values and run function
    # (the recursion receives the full adjacency matrix, Target node included)
    current_min_cost_plan = []
    current_min_cost = float('inf')
    min_cost_plan, min_cost, _ = compute_min_cost_recursive(adjacency_matrix, 
                                                         neighbors_dict, active_dict, memory_dict, 
                                                         [0], [[0]],
                                                         current_min_cost_plan, current_min_cost,
                                                         AI_quality)
    
    return min_cost_plan, min_cost, memory_dict

In [15]:
def DAG_costMin(input_path, alpha, num_tasks_current, tasks_stats_path=None):
    """Compute the minimum-cost execution plan and summary statistics of one DAG.

    Parameters
    ----------
    input_path : str
        Path to the DAG edge list (CSV). The file must have `source` and
        `target` columns and may have a `comment` column; rows whose comment
        ends with "TriangleRemovedFlag" are dropped before any computation.
    alpha : float
        AI quality value, forwarded to `get_min_cost_plan` as `AI_quality`.
    num_tasks_current : int
        Number of non-Target nodes; reported as `num_nodes` and used as the
        denominator of the average degree.
    tasks_stats_path : str, optional
        Path to the task statistics CSV. It must provide the columns `task`,
        `human_cost`, `machine_cost`, `management_cost`,
        `management_difficulty` and `completion_difficulty`. When omitted,
        the path is built from the notebook-level variables
        `occupation_folder` and `occupation`, so both must be defined before
        the call.

    Returns
    -------
    tuple
        `(min_cost_plan, min_cost, num_execution_plans_covered, DAG_stats)`.
        The first two values come straight from `get_min_cost_plan`.
        `num_execution_plans_covered` is the length of the first entry stored
        in the memory dictionary returned by `get_min_cost_plan`. `DAG_stats`
        is the list `[num_nodes, total_degree, average_degree,
        sum_paths_len_to_target, max_path_len_to_target,
        avg_path_len_to_target, sparsity]`.

    Raises
    ------
    KeyError
        If an edge refers to a task that is absent from the task statistics.
    ValueError
        If no path length is returned for the DAG (maximum of an empty list).
    ZeroDivisionError
        If `num_tasks_current` is 0.
    """
    # read DAG
    dag_df = pd.read_csv(input_path)

    # remove edges if comment column labeled with "TriangleRemovedFlag"
    # (edge is there for plotting purposes and is not part of the actual DAG)
    if 'comment' in dag_df.columns:
        # Rows without a comment are read as NaN. Casting to str and passing
        # na=False keeps the mask strictly boolean, so `~` cannot fail on
        # missing values and an all-empty comment column is handled as well.
        plot_only_edge = dag_df['comment'].astype(str).str.endswith('TriangleRemovedFlag', na=False)
        dag_df = dag_df[~plot_only_edge]

    # get task stats
    if tasks_stats_path is None:
        # backward-compatible default: rely on the notebook-level variables
        tasks_stats_path = f'{occupation_folder}/{occupation}_taskStats.csv'
    tasks_stats = pd.read_csv(tasks_stats_path)

    # index every distinct task by its order of first appearance in the stats file
    tasks_list = tasks_stats['task'].unique()
    task_to_index = {task: index for index, task in enumerate(tasks_list)}

    # create numpy array of adjacency matrix
    # NOTE(review): calculate_all_path_lengths_to_target uses the last matrix
    # index as the Target node, so the last distinct task of the stats file is
    # presumably the Target -- confirm against the files that are produced.
    n_tasks = len(tasks_list)
    adjacency_matrix = np.zeros((n_tasks, n_tasks), dtype=int)
    for source_task, target_task in zip(dag_df['source'], dag_df['target']):
        adjacency_matrix[task_to_index[source_task], task_to_index[target_task]] = 1

    # attach the matrix index to every task and order the rows by it
    tasks_stats['dict_index'] = tasks_stats['task'].map(task_to_index)
    tasks_stats = tasks_stats.sort_values(by='dict_index')

    ############################################################################################################
    # generate DAG stats
    num_nodes = num_tasks_current  # number of non-Target nodes in the DAG
    total_degree = len(dag_df)  # number of edges kept after filtering
    average_degree = total_degree / num_nodes  # average degree per node

    # path lengths to Target node
    all_paths_to_target_lens = calculate_all_path_lengths_to_target(adjacency_matrix)
    sum_paths_len_to_target = sum(all_paths_to_target_lens)
    max_path_len_to_target = max(all_paths_to_target_lens)
    avg_path_len_to_target = sum_paths_len_to_target / len(all_paths_to_target_lens)

    # sparsity of the DAG as computed by sparsity_calculator
    sparsity = sparsity_calculator(adjacency_matrix)

    # store DAG stats
    DAG_stats = [num_nodes, total_degree, average_degree,
                 sum_paths_len_to_target, max_path_len_to_target, avg_path_len_to_target,
                 sparsity]
    ############################################################################################################

    # create dictionaries (matrix index -> value) for the cost and difficulty columns
    def indexed_column(column_name):
        return dict(zip(tasks_stats['dict_index'], tasks_stats[column_name]))

    human_labor_dict = indexed_column('human_cost')
    machine_labor_dict = indexed_column('machine_cost')
    machine_management_dict = indexed_column('management_cost')
    management_difficulty_dict = indexed_column('management_difficulty')
    completion_difficulty_dict = indexed_column('completion_difficulty')

    min_cost_plan, min_cost, memory_dict = get_min_cost_plan(adjacency_matrix,
                                                            human_labor_dict, machine_labor_dict, machine_management_dict,
                                                            management_difficulty_dict, completion_difficulty_dict,
                                                            AI_quality = alpha)

    # get number of execution plans covered (# of plans associated with the first node in memory_dict)
    first_memory_key = next(iter(memory_dict))
    num_execution_plans_covered = len(memory_dict[first_memory_key])

    return min_cost_plan, min_cost, num_execution_plans_covered, DAG_stats

In [16]:
import time

# reference timestamp for every runtime report printed by the analysis
start_time = time.time()

# where the ONET data is read from and where the stats table is written to
onet_data_path = f'{data_path}/data/onet_occupations_yearly.csv'
stats_df_output_path = f'{data_path}/daily_tasks_occupations_analysis'

# occupations whose DAGs are analysed, in processing order
occupation_list = [
    'pileDriverOperators',
    'dredgeOperators',
    'gradersAndSortersForAgriculturalProducts',
    'insuranceUnderwriters',
    'insuranceAppraisersForAutoDamage',
    'floorSandersAndFinishers',
    'reinforcingIronAndRebarWorkers',
    'travelAgents',
    'dataEntryKeyer',
    'athletesAndSportsCompetitors',
    'audiovisualEquipmentInstallerAndRepairers',
    'hearingAidSpecialists',
    'personalCareAides',
    'proofreadersAndCopyMarkers',
    'chiropractors',
    'shippingReceivingAndInventoryClerks',
    'cooksShortOrder',
    'orthodontists',
    'subwayAndStreetcarOperators',
    'packersAndPackagersHand',
    'hoistAndWinchOperators',
    'forgingMachineSettersOperatorsAndTenders',
    'avionicsTechnicians',
    'dishwashers',
    'dispatchersExceptPoliceFireAndAmbulance',
    'familyMedicinePhysicians',
    'MachineFeedersAndOffbearers',
]

# AI quality (alpha) values to evaluate: just above 0, the midpoint 0.5,
# just below 1, and one very large value (1e5)
epsilon = 1e-8
alpha_list = [
    epsilon,
    0.5,
    1-epsilon,
    1e5,
]

In [17]:
import os

# GPT-generated DAG variants available for every occupation: (label, file-name tag)
gpt_dag_variants = [
    ('Naive DAG', 'N_GPT'),
    ('Conditioned Naive DAG', 'CN_GPT'),
    ('First-Last Task DAG', 'FLT_GPT'),
    ('Conditioned First-Last Task DAG', 'CFLT_GPT'),
    ('Partitioned DAG', 'P_GPT'),
    ('Conditioned Partitioned DAG', 'CP_GPT'),
]

# occupations that additionally have a manual DAG, which is evaluated first
manual_dag_occupations = ['travelAgents', 'insuranceUnderwriters', 'pileDriverOperators']

# make sure the checkpoint file can be written before any long computation starts
os.makedirs(stats_df_output_path, exist_ok=True)
stats_csv_path = f'{stats_df_output_path}/costMin_stats.csv'

# one dict per (occupation, DAG, alpha) run; the stats table is rebuilt from
# this list instead of being grown through repeated concatenation
stats_records = []
stats_df = pd.DataFrame()

# NOTE: the loop variables `occupation` and `occupation_folder` must keep these
# names, because DAG_costMin reads them as globals to locate the task stats file
# when it is called without an explicit stats path.
for occupation in occupation_list:
    print(f'\n---------------------- Running: {occupation} ----------------------')
    occupation_start_time = time.time()

    # generate occupation-specific strings
    GPT_input_occupation, plot_title_occupation, occupation_code, occupation_folder = pick_occupation(occupation)

    # Get occupation tasks to create all possible partitions
    tasks = get_tasks(onet_data_path, occupation_code)
    num_tasks_current = len(tasks)
    print(f'Number of non-Target tasks: {num_tasks_current}')

    # create list of all DAGs (labels and the matching edge-list files)
    if occupation in manual_dag_occupations:
        dag_variants = [('Manual DAG', 'M')] + gpt_dag_variants
    else:
        dag_variants = list(gpt_dag_variants)
    DAG_indicator_list = [label for label, _ in dag_variants]
    input_paths_list = [f'{occupation_folder}/{occupation}_{file_tag}_DAG_df.csv' for _, file_tag in dag_variants]

    for DAG_indicator, input_path in zip(DAG_indicator_list, input_paths_list):
        for alpha in alpha_list:

            print(f'\n-------Running: {occupation} - {DAG_indicator} - {alpha}-------')

            DAG_start_time = time.time()
            min_cost_plan, min_cost, num_execution_plans_covered, DAG_stats = DAG_costMin(input_path, alpha, num_tasks_current)
            DAG_end_time = time.time()

            DAG_execution_time = DAG_end_time - DAG_start_time
            print(f"\n{occupation} {DAG_indicator} (alpha = {alpha}) runtime: {DAG_execution_time:.2f} seconds")

            # unpack stats
            num_nodes, total_degree, average_degree, sum_paths_len_to_target, max_path_len_to_target, avg_path_len_to_target, sparsity = DAG_stats

            # record this run (key order defines the column order of the output)
            stats_records.append({'occupation': occupation,
                                  'DAG_type': DAG_indicator,
                                  'alpha': alpha,
                                  'num_tasks': num_nodes,
                                  'DAG_sparsity': sparsity,
                                  'num_execution_plans_covered': num_execution_plans_covered,
                                  'total_degree': total_degree,
                                  'average_degree': average_degree,
                                  'sum_paths_len_to_target': sum_paths_len_to_target,
                                  'max_path_len_to_target': max_path_len_to_target,
                                  'avg_path_len_to_target': avg_path_len_to_target,
                                  'min_cost': min_cost,
                                  'min_cost_plan': min_cost_plan,
                                  'exec_time_sec': DAG_execution_time,
                                  'exec_time_min': DAG_execution_time/60,
                                  })

            # update master dataset and checkpoint it after every run, so an
            # interrupted execution keeps everything computed so far
            stats_df = pd.DataFrame(stats_records)
            stats_df.to_csv(stats_csv_path, index=False)

    runtime_since_start = (time.time() - start_time)/60
    print(f"\nruntime since start: {runtime_since_start:.2f} minutes\n")

end_time = time.time()
execution_time = (end_time - start_time)/60
print(f"\n\nTotal Runtime: {execution_time:.2f} minutes")


---------------------- Running: pileDriverOperators ----------------------
Number of non-Target tasks: 5

-------Running: pileDriverOperators - Manual DAG - 1e-08-------

pileDriverOperators Manual DAG (alpha = 1e-08) runtime: 0.01 seconds

-------Running: pileDriverOperators - Manual DAG - 0.5-------

pileDriverOperators Manual DAG (alpha = 0.5) runtime: 0.01 seconds

-------Running: pileDriverOperators - Manual DAG - 0.99999999-------

pileDriverOperators Manual DAG (alpha = 0.99999999) runtime: 0.01 seconds

-------Running: pileDriverOperators - Manual DAG - 100000.0-------

pileDriverOperators Manual DAG (alpha = 100000.0) runtime: 0.01 seconds

-------Running: pileDriverOperators - Naive DAG - 1e-08-------

pileDriverOperators Naive DAG (alpha = 1e-08) runtime: 0.01 seconds

-------Running: pileDriverOperators - Naive DAG - 0.5-------

pileDriverOperators Naive DAG (alpha = 0.5) runtime: 0.01 seconds

-------Running: pileDriverOperators - Naive DAG - 0.99999999-------

pileDriver

KeyboardInterrupt: 