In [1]:
with open('libraries.py') as f:
    code = f.read()
exec(code)

with open('functions.py') as f:
    code = f.read()
exec(code)

In [2]:
pd.reset_option('all')
pd.set_option('display.max_rows', 100)

In [3]:
# determine user
user = getpass.getuser()
if user == 'peymansh':
    main_folder_path = '/Users/peymansh/Dropbox (MIT)/Research/AI and Occupations/ai-exposure'
    data_path = f'{main_folder_path}/output'

In [4]:
onet_data_path = f'{data_path}/data/onet_occupations_yearly.csv'

# list of occupations to create DAGs for
occupation_list = ['travelAgents', 'insuranceUnderwriters', 'pileDriverOperators', 
                   'dredgeOperators', 'gradersAndSortersForAgriculturalProducts', 'reinforcingIronAndRebarWorkers',
                   'insuranceAppraisersForAutoDamage', 'floorSandersAndFinishers', 'dataEntryKeyer', 
                   'athletesAndSportsCompetitors', 'audiovisualEquipmentInstallerAndRepairers', 'hearingAidSpecialists', 
                   'personalCareAides', 'proofreadersAndCopyMarkers', 'chiropractors', 
                   'shippingReceivingAndInventoryClerks', 'cooksShortOrder', 'orthodontists',
                   'subwayAndStreetcarOperators', 'packersAndPackagersHand', 'hoistAndWinchOperators', 
                   'forgingMachineSettersOperatorsAndTenders', 'avionicsTechnicians', 'dishwashers', 
                   'dispatchersExceptPoliceFireAndAmbulance', 'familyMedicinePhysicians', 'MachineFeedersAndOffbearers'
                   ]

occupation = 'travelAgents'
occupation = 'insuranceUnderwriters'
occupation = 'pileDriverOperators'

# Generate occupation-specific strings
GPT_input_occupation, plot_title_occupation, occupation_code, occupation_folder = pick_occupation(occupation)

In [5]:
# set alpha as AI quality metric
n = 100
epsilon = 1e-8
alpha_list = np.linspace(epsilon, 1-epsilon, n).tolist()

### Initialize input-output paths

In [6]:
# Manual DAG
input_path = f'{occupation_folder}/{occupation}_M_DAG_df.csv'
output_path = f'{occupation_folder}/{occupation}_costMin_M.csv'

# First Last Task DAG
input_path = f'{occupation_folder}/{occupation}_FLT_GPT_DAG_df.csv'
output_path = f'{occupation_folder}/{occupation}_costMin_FLT.csv'

# Conditioned First Last Task DAG
input_path = f'{occupation_folder}/{occupation}_CFLT_GPT_DAG_df.csv'
output_path = f'{occupation_folder}/{occupation}_costMin_CFLT.csv'

# Partitioned DAG
input_path = f'{occupation_folder}/{occupation}_P_GPT_DAG_df.csv'
output_path = f'{occupation_folder}/{occupation}_costMin_P.csv'

# Conditioned Partitioned DAG
input_path = f'{occupation_folder}/{occupation}_CP_GPT_DAG_df.csv'
output_path = f'{occupation_folder}/{occupation}_costMin_CP.csv'

In [7]:
# read DAG
dag_df = pd.read_csv(input_path)

# remove edges if comment column labeled with "TriangleRemovedFlag" (edge is there for plotting purposes and is not part of the actual DAG)
if 'comment' in dag_df.columns:
    dag_df = dag_df[~dag_df['comment'].str.endswith('TriangleRemovedFlag')]

# get task stats
tasks_stats = pd.read_csv(f'{occupation_folder}/{occupation}_taskStats.csv')
tasks_stats

# print stats
#tasks_stats.iloc[:,1:].sum()
dag_df

Unnamed: 0,source,target,comment
0,Move hand and foot levers of hoisting equipmen...,Drive pilings to provide support for buildings...,The worker driving the pilings needs to know t...
2,Drive pilings to provide support for buildings...,Move levers and turn valves to activate power ...,The worker operating the levers and valves to ...
3,Conduct pre-operational checks on equipment to...,"Clean, lubricate, and refill equipment.","The worker responsible for cleaning, lubricati..."
4,Conduct pre-operational checks on equipment to...,Move hand and foot levers of hoisting equipmen...,The worker moving hand and foot levers to posi...
5,"Clean, lubricate, and refill equipment.",Move hand and foot levers of hoisting equipmen...,The worker operating the hoisting equipment ne...
6,Conduct pre-operational checks on equipment to...,Drive pilings to provide support for buildings...,The worker driving pilings needs to know that ...
8,Conduct pre-operational checks on equipment to...,Move levers and turn valves to activate power ...,The worker who is moving levers and turning va...
10,"Clean, lubricate, and refill equipment.","""Target""",Job Completion Indicator
11,Move levers and turn valves to activate power ...,"""Target""",Job Completion Indicator


In [8]:
# extract list of tasks and create a dictionary for indexing tasks
tasks_list = tasks_stats['task'].unique()
tasks_dict = {i: node for i, node in enumerate(tasks_list, start=0)}

# create numpy array of adjacency matrix
adjacency_matrix = np.zeros((len(tasks_list), len(tasks_list)), dtype=int)
aux_dict = {value: key for key, value in tasks_dict.items()}
for _, row in dag_df.iterrows():
    source_index = aux_dict[row['source']]
    target_index = aux_dict[row['target']]
    adjacency_matrix[source_index, target_index] = 1

tasks_dict

{0: 'Move hand and foot levers of hoisting equipment to position piling leads, hoist piling into leads, and position hammers over pilings.',
 1: 'Conduct pre-operational checks on equipment to ensure proper functioning.',
 2: 'Drive pilings to provide support for buildings or other structures, using heavy equipment with a pile driver head.',
 3: 'Move levers and turn valves to activate power hammers, or to raise and lower drophammers that drive piles to required depths.',
 4: 'Clean, lubricate, and refill equipment.',
 5: '"Target"'}

In [9]:
def find_neighbors(adjacency_matrix):
    n = adjacency_matrix.shape[0]  # Number of nodes
    neighbors_dict = {}

    for node in range(n):
        neighbors = []
        for neighbor in range(n):
            if adjacency_matrix[node, neighbor] != 0:  # Check for an edge from node to neighbor
                neighbors.append(neighbor)
        neighbors_dict[node] = neighbors

    return neighbors_dict

In [10]:
def find_neighbors_undirected(adjacency_matrix):
    n = adjacency_matrix.shape[0]  # Number of nodes
    neighbors_dict = {}

    for node in range(n):
        neighbors = set()  # Use a set to avoid duplicate neighbors
        # Add neighbors from outgoing edges (row)
        for neighbor in range(n):
            if adjacency_matrix[node, neighbor] != 0:
                neighbors.add(neighbor)
        
        # Add neighbors from incoming edges (column)
        for neighbor in range(n):
            if adjacency_matrix[neighbor, node] != 0:
                neighbors.add(neighbor)
        
        neighbors_dict[node] = list(neighbors)

    return neighbors_dict

In [11]:
neighbors = find_neighbors(adjacency_matrix[:-1,:-1])
neighbors

{0: [2], 1: [0, 2, 3, 4], 2: [3], 3: [], 4: [0]}

In [12]:

neighbors = find_neighbors_undirected(adjacency_matrix[:-1,:-1])
neighbors

{0: [1, 2, 4], 1: [0, 2, 3, 4], 2: [0, 1, 3], 3: [1, 2], 4: [0, 1]}

In [13]:
def get_valid_DAG_subsets(adjacency_matrix):

    # get neighbors of each task (excluding Target node)
    tasks_neighbors = find_neighbors(adjacency_matrix[:-1,:-1])
    #tasks_neighbors = find_neighbors_undirected(adjacency_matrix[:-1,:-1])

    # get number of non-Target nodes
    n = adjacency_matrix.shape[0] - 1

    def valid_subsets_recursive(node):
        # if valid subsets for neighbor are already computed, use them
        if len(memory_dict[node]) > 0:
            return memory_dict[node]
        

        # if node is terminal node (i.e., has no neighbors) return node itself as singleton
        if len(tasks_neighbors[node]) == 0:
            memory_dict[node] = [[node]]
            return memory_dict[node]
        

        # initialize list for valid subsets in all neighbors
        valid_subsets = []

        # for each neighbor get valid subsets
        for neighbor in tasks_neighbors[node]:
            #print(f'\nneighbor of {node}: node {neighbor}')
            # initialize list for valid subsets in neighbor
            valid_neighbor_subsets = []

            # if neighbor is terminal node (i.e., has no neighbors) generate valid "terminal" subsets 
            if len(neighbors[neighbor]) == 0:
                #print(f'terminal node: {neighbor}')

                valid_neighbor_subsets = [[node, neighbor]]

                # add node itself as singleton to set of valid subsets
                valid_neighbor_subsets.append([node])
                
                #print(valid_neighbor_subsets)
            # if neighbor has neighbors, get valid subsets of neighbor and add node to each subset
            else:
                # get valid subsets of neighbor
                valid_neighbor_subsets = valid_subsets_recursive(neighbor)

                # add parent node to valid subsets of neighbor node
                valid_neighbor_subsets = [subset + [node] for subset in valid_neighbor_subsets]

                # add node itself as singleton to set of valid subsets
                valid_neighbor_subsets.append([node])

                #print(f'add node {node} to terminal subsets of neighbor node {neighbor}')
                #print(valid_neighbor_subsets)

            # add valid subsets of neighbor to valid subsets of all neighbors
            valid_subsets.extend(valid_neighbor_subsets)

        # update memory
        memory_dict[node] = valid_subsets
        print(f'memory dict for node {node}: {memory_dict}')
        
        # return unique elements of all_subsets
        unique_valid_subsets = [sorted(list(t)) for t in set(tuple(subset) for subset in valid_subsets)]
        unique_valid_subsets = sorted(unique_valid_subsets, key=len)
        
        return unique_valid_subsets
    

    # initialize dict for valid subsets to act as memory
    memory_dict = {key: [] for key in range(n)}

    # initialize dictionary for valid subsets origniating from each node
    valid_subsets_dict = {}
    for node in range(n):
        valid_subsets_dict[node] = valid_subsets_recursive(node)

    return valid_subsets_dict


valid_subsets_dict = get_valid_DAG_subsets(adjacency_matrix)
valid_subsets_dict

memory dict for node 2: {0: [], 1: [], 2: [[3, 2], [2]], 3: [[3]], 4: []}
memory dict for node 0: {0: [[2, 0], [2, 3, 0], [0]], 1: [], 2: [[3, 2], [2]], 3: [[3]], 4: []}
memory dict for node 4: {0: [[2, 0], [2, 3, 0], [0]], 1: [], 2: [[3, 2], [2]], 3: [[3]], 4: [[2, 0, 4], [2, 3, 0, 4], [0, 4], [4]]}
memory dict for node 1: {0: [[2, 0], [2, 3, 0], [0]], 1: [[2, 0, 1], [2, 3, 0, 1], [0, 1], [1], [3, 2, 1], [2, 1], [1], [3, 1], [1], [4, 1], [0, 4, 1], [0, 2, 4, 1], [0, 2, 3, 4, 1], [1]], 2: [[3, 2], [2]], 3: [[3]], 4: [[2, 0, 4], [2, 3, 0, 4], [0, 4], [4]]}


{0: [[0], [0, 2], [0, 2, 3]],
 1: [[1],
  [0, 1],
  [1, 2],
  [1, 3],
  [1, 4],
  [0, 1, 4],
  [0, 1, 2],
  [1, 2, 3],
  [0, 1, 2, 3],
  [0, 1, 2, 4],
  [0, 1, 2, 3, 4]],
 2: [[3, 2], [2]],
 3: [[3]],
 4: [[2, 0, 4], [2, 3, 0, 4], [0, 4], [4]]}

In [14]:
example_adjacency_matrix = np.array([[0, 1, 1, 0, 0],
                                     [0, 0, 0, 1, 0],
                                     [0, 0, 0, 1, 0],
                                     [0, 0, 0, 0, 0],
                                     [0, 0, 0, 0, 0]])
valid_subsets_dict = get_valid_DAG_subsets(example_adjacency_matrix)
valid_subsets_dict

memory dict for node 1: {0: [], 1: [[3, 1], [1]], 2: [], 3: [[3]]}
memory dict for node 2: {0: [], 1: [[3, 1], [1]], 2: [[3, 2], [2]], 3: [[3]]}
memory dict for node 0: {0: [[1, 0], [1, 3, 0], [0], [2, 0], [2, 3, 0], [0]], 1: [[3, 1], [1]], 2: [[3, 2], [2]], 3: [[3]]}


{0: [[0], [0, 2], [0, 1], [0, 1, 3], [0, 2, 3]],
 1: [[3, 1], [1]],
 2: [[3, 2], [2]],
 3: [[3]]}

In [15]:
def is_combination_valid(combination, n):
    # Flatten list of combination
    covered_tasks_list = [element for sublist in combination for element in sublist]
    
    # Create a set of the flattened list
    covered_tasks_set = set(covered_tasks_list)
    
    # Check if the flattened set has exactly n elements and contains all elements from 0 to n-1
    if len(covered_tasks_list) == n and covered_tasks_set == set(range(n)):
        return True
    else:
        return False


def generate_combinations(valid_subsets_dict, current_key=0, current_combination=None, result=None):
    if current_combination is None:
        current_combination = []
    if result is None:
        result = []

    # Base case: if convered all tasks add current combination to the result list
    if is_combination_valid(current_combination, len(valid_subsets_dict)):
        print(f'********found valid combination: {current_combination}********')
        result.append(current_combination)
        return result

    # Recursive case: iterate through the list of lists at the current key
    for subset in valid_subsets_dict[current_key]:
        # Create a new combination including the current subset
        new_combination = current_combination + [subset]
        new_combination_flattened = [element for sublist in new_combination for element in sublist]

        # Check which nodes are NOT covered by the new combination. Only need to process these nodes next
        uncovered_nodes = list(set(range(len(valid_subsets_dict))) - set(new_combination_flattened))
        print(f'new combination: {new_combination}\n uncovered nodes: {uncovered_nodes}')
        #uncovered_nodes_valid_subsets_dict = {key: value for key, value in valid_subsets_dict.items() if key in uncovered_nodes}

        if len(uncovered_nodes) == 0:
            if is_combination_valid(new_combination, len(valid_subsets_dict)):
                print(f'********found valid combination: {new_combination}********\n')
                result.append(new_combination)
                return result
        else:
            # Recursively call the function to process the next key
            for nex_key in uncovered_nodes:
                generate_combinations(valid_subsets_dict, nex_key, new_combination, result)

    
    return result




In [16]:
combinations = generate_combinations(valid_subsets_dict)

new combination: [[0]]
 uncovered nodes: [1, 2, 3]
new combination: [[0], [3, 1]]
 uncovered nodes: [2]
new combination: [[0], [3, 1], [3, 2]]
 uncovered nodes: []
new combination: [[0], [3, 1], [2]]
 uncovered nodes: []
********found valid combination: [[0], [3, 1], [2]]********

new combination: [[0], [1]]
 uncovered nodes: [2, 3]
new combination: [[0], [1], [3, 2]]
 uncovered nodes: []
********found valid combination: [[0], [1], [3, 2]]********

new combination: [[0], [1], [3]]
 uncovered nodes: [2]
new combination: [[0], [1], [3], [3, 2]]
 uncovered nodes: []
new combination: [[0], [1], [3], [2]]
 uncovered nodes: []
********found valid combination: [[0], [1], [3], [2]]********

new combination: [[0], [3, 2]]
 uncovered nodes: [1]
new combination: [[0], [3, 2], [3, 1]]
 uncovered nodes: []
new combination: [[0], [3, 2], [1]]
 uncovered nodes: []
********found valid combination: [[0], [3, 2], [1]]********

new combination: [[0], [2]]
 uncovered nodes: [1, 3]
new combination: [[0], [

In [17]:
print(len(combinations))
combinations

14


[[[0], [3, 1], [2]],
 [[0], [1], [3, 2]],
 [[0], [1], [3], [2]],
 [[0], [3, 2], [1]],
 [[0], [2], [3, 1]],
 [[0], [2], [3], [1]],
 [[0], [3], [1], [2]],
 [[0], [3], [2], [1]],
 [[0, 2], [3, 1]],
 [[0, 2], [3], [1]],
 [[0, 1], [3, 2]],
 [[0, 1], [3], [2]],
 [[0, 1, 3], [2]],
 [[0, 2, 3], [1]]]

In [18]:
def normalize_sublist(sublist):
    # Sort the elements within each inner list and then sort the entire sublist
    return tuple(sorted(tuple(sorted(inner)) for inner in sublist))

def unique_lists(input_list):
    seen = set()
    unique_combinations = []

    for sublist in input_list:
        normalized = normalize_sublist(sublist)
        if normalized not in seen:
            seen.add(normalized)
            unique_combinations.append(sublist)

    return unique_combinations

unique_combinations = unique_lists(combinations)
print(len(unique_combinations))
unique_combinations

9


[[[0], [3, 1], [2]],
 [[0], [1], [3, 2]],
 [[0], [1], [3], [2]],
 [[0, 2], [3, 1]],
 [[0, 2], [3], [1]],
 [[0, 1], [3, 2]],
 [[0, 1], [3], [2]],
 [[0, 1, 3], [2]],
 [[0, 2, 3], [1]]]

### Generate all possible partition schemes for the set of tasks (ignoring structre of the DAG)

In [19]:
from itertools import combinations

def partitions(set_):
    if not set_:
        yield []
        return
    for i in range(1, len(set_) + 1):
        for part in combinations(set_, i):
            remaining = set(set_) - set(part)
            if not remaining:
                yield [list(part)]
            else:
                for b in partitions(list(remaining)):
                    yield [list(part)] + b

def generate_unique_partitions(numbers):
    all_partitions = set()
    for partition in partitions(numbers):
        # Create a frozenset of frozensets to make each partition hashable and order-independent
        partition_set = frozenset(frozenset(part) for part in partition)
        all_partitions.add(partition_set)
    
    # Convert the frozensets back to lists for the final output
    unique_partitions = [list(map(list, partition)) for partition in all_partitions]

    # Sort elements
    unique_partitions = sorted([sorted(x) for x in unique_partitions], key=len)
    return unique_partitions

In [20]:
# Generate list of numbers for non-"Target" tasks in occupation
tasks_list_numbers = list(range(len(valid_subsets_dict)))

# Generate all possible partitioning schemes
all_partitions = generate_unique_partitions(tasks_list_numbers)


### Check if partition scheme is "valid" (i.e., if its non-singleton partitions are a connected graph)

In [21]:
def is_connected(matrix):
    # Number of nodes in the matrix
    num_nodes = matrix.shape[0]
    
    # Visited array to keep track of visited nodes
    visited = np.zeros(num_nodes, dtype=bool)
    
    # Helper function to perform DFS
    def dfs(node):
        visited[node] = True
        # Visit all the neighbors of the current node
        for neighbor in range(num_nodes):
            if matrix[node, neighbor] == 1 and not visited[neighbor]:
                dfs(neighbor)
            elif matrix[neighbor, node] == 1 and not visited[neighbor]:
                dfs(neighbor)
    
    # Start DFS from the first node (node 0)
    dfs(0)
    
    # If all nodes are visited, the matrix is connected
    return np.all(visited)


def validate_partition_using_connectedness(adjacency_matrix, tasks_list):
    # Return valid if Singleton
    if len(tasks_list) == 1:
        return True
    # Check if partition forms connected graph
    else:
        # Subset original adjacency matrix
        subset_matrix = adjacency_matrix[np.ix_(tasks_list, tasks_list)]

        # check if subset matrix is a connected graph
        subset_matrix_connected = is_connected(subset_matrix)

        # return true if connected and false otherwise
        return subset_matrix_connected

In [22]:
# Get valid partitioning schemes from all possible partitions to cut computation load
valid_partitions = []
for scheme in all_partitions:
    # Set valid partitions count to 0
    valid_partition_count = 0
    for partition in scheme:
        valid_partition = validate_partition_using_connectedness(example_adjacency_matrix, partition)
        if valid_partition:
            valid_partition_count += 1
    
    # If number of valid partitions within a partition scheme is equal to 
    # number of partitions in partition scheme then partition scheme is valid
    if valid_partition_count == len(scheme):
        valid_partitions.append(scheme)

# Print stats
print(f'Number of all possible partitioning schemes: {len(all_partitions)}')
print(f'Number of valid partitioning schemes given DAG structure: {len(valid_partitions)}')

for partition in valid_partitions:
    print(partition)

Number of all possible partitioning schemes: 15
Number of valid partitioning schemes given DAG structure: 12
[[0, 1, 2, 3]]
[[0, 2, 3], [1]]
[[0, 1], [2, 3]]
[[0, 2], [1, 3]]
[[0, 1, 2], [3]]
[[0], [1, 2, 3]]
[[0, 1, 3], [2]]
[[0, 2], [1], [3]]
[[0], [1, 3], [2]]
[[0, 1], [2], [3]]
[[0], [1], [2, 3]]
[[0], [1], [2], [3]]


## Missing in "Smart" method

In [23]:
def normalize_sublist(sublist):
    # Sort the elements within each inner list and then sort the entire sublist
    return tuple(sorted(tuple(sorted(inner)) for inner in sublist))

def list_difference(list1, list2):
    # Normalize both lists
    normalized_list1 = {normalize_sublist(sublist) for sublist in list1}
    normalized_list2 = {normalize_sublist(sublist) for sublist in list2}
    
    # Find the difference
    difference = normalized_list1 - normalized_list2
    
    # Convert the normalized tuples back to the original list format
    difference_list = []
    for norm_sublist in difference:
        original_sublist = [list(inner) for inner in norm_sublist]
        difference_list.append(original_sublist)
    
    return difference_list

result = list_difference(valid_partitions, unique_combinations)
for case in result:
    print(case)

[[0, 1, 2, 3]]
[[0, 1, 2], [3]]
[[0], [1, 2, 3]]
