In [399]:
with open('libraries.py') as f:
    code = f.read()
exec(code)

In [400]:
pd.reset_option('all')
pd.set_option('display.max_rows', 100)

In [401]:
# determine user
user = getpass.getuser()
if user == 'peymansh':
    main_folder_path = '/Users/peymansh/Dropbox (MIT)/Research/AI and Occupations/ai-exposure'
    data_path = f'{main_folder_path}/output'

In [402]:
# Pick occupation
def pick_occupation(occupation):
    if occupation == 'travelAgents':
        GPT_input_occupation = 'travel agents'
        plot_title_occupation = 'Travel Agents'
        occupation_code = '41-3041'
    elif occupation == 'insuranceUnderwriters':
        GPT_input_occupation = 'insurance underwriters'
        plot_title_occupation = 'Insurance Underwriters'
        occupation_code = '13-2053'
    
    occupation_folder = f'{data_path}/daily_tasks_occupations_analysis/{occupation}'
    return GPT_input_occupation, plot_title_occupation, occupation_code, occupation_folder


In [403]:
# set alpha as AI quality metric
alpha_list = np.linspace(0.3, 1-1e-4, 100).tolist()

In [404]:
# Pick occupation and initialize variables
occupation = 'travelAgents'
occupation = 'insuranceUnderwriters'

GPT_input_occupation, plot_title_occupation, occupation_code, occupation_folder = pick_occupation(occupation)

### Initialize input-output paths

In [405]:
suffix = 'MAX_' # for when partition cost used MAX of machine costs in partition (somewhat like a least common multiple)
suffix = ''

In [406]:
# Manual DAG
input_path = f'{occupation_folder}/{occupation}_manual_DAG_df.csv'
output_path = f'{occupation_folder}/{occupation}_costMin_{suffix}manual.csv'

# # First Last Task DAG
# input_path = f'{occupation_folder}/v1/{occupation}_firstLastTaskGPT_DAG_df.csv'
# output_path = f'{occupation_folder}/{occupation}_costMin_{suffix}firstLastTask.csv'

# Conditioned First Last Task DAG
input_path = f'{occupation_folder}/v1/{occupation}_conditionedGPT_fromFirstLastTask_DAG_df.csv'
output_path = f'{occupation_folder}/{occupation}_costMin_{suffix}firstLastTask_conditioned.csv'

# # Partitioned DAG
# input_path = f'{occupation_folder}/v1/{occupation}_partitionedGPT_DAG_df.csv'
# output_path = f'{occupation_folder}/{occupation}_costMin_{suffix}partitioned.csv'

# Conditioned Partitioned DAG
input_path = f'{occupation_folder}/v1/{occupation}_conditionedGPT_fromPartitioned_DAG_df.csv'
output_path = f'{occupation_folder}/{occupation}_costMin_{suffix}partitioned_conditioned.csv'

In [407]:
# read DAG
dag_df = pd.read_csv(input_path)

# extract list of tasks and create a dictionary for indexing tasks
tasks_list = list(set(dag_df['source']).union(set(dag_df['target'])))
tasks_dict = {i: node for i, node in enumerate(tasks_list, start=0)}

# create numpy array of adjacency matrix
adjacency_matrix = np.zeros((len(tasks_list), len(tasks_list)), dtype=int)

# Populate the adjacency matrix
aux_dict = {value: key for key, value in tasks_dict.items()}
for _, row in dag_df.iterrows():
    source_index = aux_dict[row['source']]
    target_index = aux_dict[row['target']]
    adjacency_matrix[source_index, target_index] = 1

In [408]:
# get task stats
tasks_stats = pd.read_csv(f'{occupation_folder}/{occupation}_taskStats.csv')

# define a break-even difficulty for base AI quality (alpha)
# above break-even difficulty threshold task is done manually
# as AI quality (alpha) goes up break-even difficulty goes up
for alpha in alpha_list:
    if alpha*10 % 1 != 0:
        continue
    tasks_stats[f'be_difficulty_{str(alpha)[-1]}'] = np.log(tasks_stats['machine_cost'] / tasks_stats['human_cost']) / np.log(alpha)
tasks_stats

Unnamed: 0,task,human_cost,machine_cost,difficulty,be_difficulty_3
0,Decline excessive risks.,60,30,8,0.575717
1,"Write to field representatives, medical person...",30,15,7,0.575717
2,Evaluate possibility of losses due to catastro...,240,60,10,1.151433
3,Decrease value of policy when risk is substand...,60,30,10,0.575717
4,Review company records to determine amount of ...,120,30,8,1.151433
5,Authorize reinsurance of policy when risk is h...,60,30,12,0.575717
6,Examine documents to determine degree of risk ...,60,30,10,0.575717


In [409]:
# add task_dict key for indexing purposes
aux_dict = {value: key for key, value in tasks_dict.items()}
tasks_stats['dict_index'] = tasks_stats.apply(lambda row: aux_dict[row.task], axis=1)
tasks_stats = tasks_stats.sort_values(by='dict_index')

# create dictionaries for human cost, machine cost, and difficulty
M_dict = dict(zip(tasks_stats['dict_index'], tasks_stats['human_cost']))
A_dict = dict(zip(tasks_stats['dict_index'], tasks_stats['machine_cost']))
D_dict = dict(zip(tasks_stats['dict_index'], tasks_stats['difficulty']))

# print stats
tasks_stats.iloc[:,1:-1].sum()

human_cost         630.00000
machine_cost       225.00000
difficulty          65.00000
be_difficulty_3      5.18145
dtype: float64

### Generate all possible partitions for the set of tasks (ignoring structre of the DAG)

In [410]:
from itertools import combinations

def partitions(set_):
    if not set_:
        yield []
        return
    for i in range(1, len(set_) + 1):
        for part in combinations(set_, i):
            remaining = set(set_) - set(part)
            if not remaining:
                yield [list(part)]
            else:
                for b in partitions(list(remaining)):
                    yield [list(part)] + b

def generate_unique_partitions(numbers):
    all_partitions = set()
    for partition in partitions(numbers):
        # Create a frozenset of frozensets to make each partition hashable and order-independent
        partition_set = frozenset(frozenset(part) for part in partition)
        all_partitions.add(partition_set)
    
    # Convert the frozensets back to lists for the final output
    unique_partitions = [list(map(list, partition)) for partition in all_partitions]

    # Sort elements
    unique_partitions = sorted([sorted(x) for x in unique_partitions], key=len)
    return unique_partitions

### Check if partition is "valid"
#### Partition is called valid if the partition subset of the DAG contains no singleton node and a path exists between a first node to all last nodes

In [411]:
# Function to check if a path exists using BFS (Breadth-First Search) in the subset matrix
def bfs_path_exists(matrix, start, goal):
    from collections import deque

    visited = [False] * len(matrix)
    queue = deque([start])
    
    while queue:
        current = queue.popleft()
        if current == goal:
            return True
        
        for neighbor, connected in enumerate(matrix[current]):
            if connected and not visited[neighbor]:
                visited[neighbor] = True
                queue.append(neighbor)

    return False


def validate_partition(adjacency_matrix, tasks_list):
    # Return valid if Singleton
    if len(tasks_list) == 1:
        return True

    # Subset original adjacency matrix
    subset_matrix = adjacency_matrix[np.ix_(tasks_list, tasks_list)]

    
    first_tasks = []
    last_tasks = []
    for task in tasks_list:
        # subset of original task in subsetted matrix
        subset_index = tasks_list.index(task)

        row_check = np.all(subset_matrix[subset_index, :] == 0)
        column_check = np.all(subset_matrix[:, subset_index] == 0)

        # Step 1: declare invalid if singleton task (task w/o incoming or outgoing edges in partition) present
        if row_check and column_check: # task is a singleton
            return False
        
        # Step 2: find first/last tasks (defined as tasks with no incoming/outgoing edges withing partition)
        if row_check: # no outgoing edge within partition means last task
            last_tasks.append(task)
        if column_check: # no incoming edge within partition means first task
            first_tasks.append(task)

    #print(f'First Tasks: {first_tasks}')
    #print(f'Last Tasks: {last_tasks}')

    # Step 3: ensure a path between last tasks and a first task exists
    counter = 0
    for last in last_tasks:
        subset_index_last = tasks_list.index(last)
        for first in first_tasks:
            subset_index_first = tasks_list.index(first)
            path_exists = bfs_path_exists(subset_matrix, subset_index_first, subset_index_last)
            if path_exists:
                counter += 1
                break # break inner loop (first_tasks loop)
    
    if counter == len(last_tasks): # if all last tasks have a path
        return True

In [412]:
# Generate list of numbers for tasks in occupation
tasks_list_numbers = list(range(len(tasks_list)))

# Generate all possible partitioning schemes
all_partitions = generate_unique_partitions(tasks_list_numbers)

# Get valid partitioning schemes
valid_partitions = []
for partition_scheme in all_partitions:

    # Set valid partitions count to 0
    valid_partition_count = 0
    for partition in partition_scheme:
        valid_partition = validate_partition(adjacency_matrix, partition)
        if valid_partition:
            valid_partition_count += 1
    
    # If number of valid partitions within a partition scheme is equal to 
    # number of partitions in partition scheme then partition scheme is valid
    if valid_partition_count == len(partition_scheme):
        valid_partitions.append(partition_scheme)

# Print stats
print(f'Number of all possible partitioning schemes: {len(all_partitions)}')
print(f'Number of valid partitioning schemes given DAG structure: {len(valid_partitions)}')

# print some partitions
print('\nExample partitions:')
for partition in valid_partitions[40:45]:
    print(partition)

Number of all possible partitioning schemes: 877
Number of valid partitioning schemes given DAG structure: 766

Example partitions:
[[0, 2, 3, 4, 6], [1, 5]]
[[0, 1, 3, 4], [2], [5, 6]]
[[0, 1, 2], [3, 5], [4, 6]]
[[0, 1, 2, 6], [3], [4, 5]]
[[0, 1, 3, 4], [2, 5], [6]]


### Compute minimum cost for a given partition

In [413]:
def compute_partition_cost(M_dict, A_dict, D_dict, AI_quality, partition):
    # calculate manual cost of doing partition
    manual_cost = sum(M_dict[key] for key in partition)
    #print(f'Manual cost: {manual_cost}')

    # calculate automation cost of doing partition
    AI_cost = sum(A_dict[key] for key in partition)
    #AI_cost = max(A_dict[key] for key in partition)
    difficulty = sum(D_dict[key] for key in partition)
    automation_cost = AI_cost * (AI_quality ** (-1 * difficulty))
    #print(f'Automation cost: {automation_cost}')

    # initialize partition done manually as False 
    # (only if partition is singleton and manual cost <= automated cost partition is done manually)
    partition_done_manually = False

    # if partition is a singleton do nothing
    if len(partition) == 1:
        partition_is_valid = True
        if manual_cost < automation_cost:
            partition_cost = manual_cost
            partition_done_manually = True 
        else:
            partition_cost = automation_cost
    
    # if partition not a singleton check if manual cost of doing multiple tasks lower than automating them
    else:
        # sanity check: if manual cost < automation cost partition is invalid (should not have been formed)
        if manual_cost < automation_cost:
            partition_cost = 100000000 # (value doesn't matter)
            partition_is_valid = False
        else:
            partition_cost = automation_cost
            partition_is_valid = True

    return partition_cost, partition_done_manually, partition_is_valid


### Compute costs of all "valid" plans
#### Check for new validity condition: automated cost of tasks in partition must be less than the human costs of not deploying any machines

In [414]:
def execute_plans(valid_partitions, M_dict, A_dict, D_dict, alpha):
    execution_plan = []
    execution_plan_manual_tasks = []
    execution_cost = []
    counter = 0
    for partition_scheme in valid_partitions:
        # initialize partition scheme cost
        # and partitions that are done manually
        partition_scheme_cost = 0
        manual_partitions = []
        
        for partition in partition_scheme:
            # calculate partition cost 
            partition_cost, partition_done_manually, partition_is_valid = compute_partition_cost(M_dict, A_dict, D_dict, alpha, partition)
        
            # if (automated) partition is invalid ignore partition scheme
            # and stop calculating costs of further partitions
            if not partition_is_valid:
                break

            if partition_done_manually:
                manual_partitions.append(partition)

            # if (automated) partition passes sanity check
            # add this partition's cost to partition scheme cost
            partition_scheme_cost += partition_cost
        
        # if stopped because an (automated) partition wasn't valid
        # ignore current partition scheme and continue
        if not partition_is_valid:
            continue
        
        # if partition scheme makes sense append costs
        execution_plan.append(partition_scheme)
        execution_plan_manual_tasks.append(manual_partitions)
        execution_cost.append(partition_scheme_cost)

        # if counter % (np.floor(len(valid_partitions)/3)) == 0:
        #     print(partition_scheme)
        #     print(partition_scheme_cost)
        #     print('\n')
        # counter += 1

    return execution_plan, execution_plan_manual_tasks, execution_cost


random.seed(1)
execution_plan, execution_plan_manual_tasks, execution_cost = execute_plans(valid_partitions, M_dict, A_dict, D_dict, alpha)
print(f'Number of valid execution plans: {len(execution_plan)}')

# print some valid execution plans
print('\nExample Execution Plans:')
for plan in execution_plan[10:15]:
    print(plan)

Number of valid execution plans: 766

Example Execution Plans:
[[0, 1, 2, 3, 5], [4, 6]]
[[0, 1], [2, 3, 4, 5, 6]]
[[0, 5, 6], [1, 2, 3, 4]]
[[0, 1, 4], [2, 3, 5, 6]]
[[0, 1, 2], [3, 4, 5, 6]]


### Calculate minimum cost for each alpha

In [415]:
random.seed(1)

minimum_cost_list = []
number_of_optimal_schemes_list = []
optimal_execution_plan_list = []
optimal_plan_manualTasks_list = []
optimal_plan_manualTasks_count_list = []
for alpha in alpha_list:
    # get list of execution plans and costs for this alpha
    execution_plan, execution_plan_manual_tasks, execution_cost = execute_plans(valid_partitions, M_dict, A_dict, D_dict, alpha)

    # choose minimum
    minimum_cost = min(execution_cost)
    minimum_cost_index = [index for index, value in enumerate(execution_cost) if value == minimum_cost]

    # in rare cases there are more than one optimal plan
    if len(minimum_cost_index) > 2:
        print(alpha)
        for index in minimum_cost_index:
            optimal_execution_scheme = execution_plan[index]
            optimal_execution_manual_tasks = execution_plan_manual_tasks[index]
            print(optimal_execution_scheme)
            print(optimal_execution_manual_tasks)
    else:
        optimal_execution_scheme = execution_plan[minimum_cost_index[0]]
        optimal_execution_manual_tasks = execution_plan_manual_tasks[minimum_cost_index[0]]
    
    # append lists
    minimum_cost_list.append(minimum_cost)
    number_of_optimal_schemes_list.append(len(minimum_cost_index))
    optimal_execution_plan_list.append(optimal_execution_scheme)
    optimal_plan_manualTasks_list.append(optimal_execution_manual_tasks)
    optimal_plan_manualTasks_count_list.append(len(optimal_execution_manual_tasks))

# save outputs
output_df = pd.DataFrame({
    'alpha': alpha_list,
    'optimal_schemes_count': number_of_optimal_schemes_list,
    'minimum_cost': minimum_cost_list,
    'optimal_scheme': optimal_execution_plan_list,
    'optimal_scheme_manual_tasks': optimal_plan_manualTasks_list,
    'manual_tasks_count': optimal_plan_manualTasks_count_list
})
output_df.to_csv(output_path, index=False)