# Load libraries and functions

In [360]:
# Run the two setup scripts inside this namespace, in order:
# 'libraries.py' first, then 'functions.py'.
# NOTE(review): exec() runs whatever these local files contain; keep them trusted.
for _setup_script in ('libraries.py', 'functions.py'):
    with open(_setup_script) as f:
        code = f.read()
    exec(code)

In [361]:
# Look up the current login name; paths are only configured for known users
user = getpass.getuser()
if user == 'peymansh':
    main_folder_path = '/Users/peymansh/Dropbox (MIT)/Research/AI and Occupations/ai-exposure'
    data_path = main_folder_path + '/output'

## Main Code

In [362]:
# Pick occupation and initialize variables
# Both candidate occupations are assigned in turn, so the LAST assignment is the
# one that takes effect; reorder or comment out a line to switch occupation.
occupation = 'travelAgents'
occupation = 'insuranceUnderwriters'

# pick_occupation is not defined in this notebook (presumably it comes from
# functions.py -- confirm). Its four return values are used below as the
# occupation text for prompts, a plot title, the O*NET occupation code and
# the folder where this occupation's files are read and written.
GPT_input_occupation, plot_title_occupation, occupation_code, occupation_folder = pick_occupation(occupation)

In [363]:
# Load the yearly O*NET task table, sort it, and keep only the 2023 rows
onet = (
    pd.read_csv(f'{data_path}/data/onet_occupations_yearly.csv')
    .sort_values(by=['year', 'occ_code', 'occ_title', 'task_id'])
)
onet = onet[onet['year'] == 2023].reset_index(drop=True)

# Distinct task statements of the selected occupation, in order of first appearance
is_selected_occupation = (onet['occ_code'] == f'{occupation_code}') & (onet['year'] == 2023)
my_df = onet[is_selected_occupation]
tasks = my_df['task'].unique().tolist()

<br>

<br>

# 1) Manual DAG df

In [364]:
# Read manual adjacency matrix (first CSV column holds the row labels)
manual_AM = pd.read_csv(f'{occupation_folder}/{occupation}_AM.csv', index_col=0)
#manual_AM = add_sink_node(manual_AM, occupation)

# Scan the matrix row by row and record a (row label, column label) pair
# for every cell equal to 1
edge_pairs = [
    (row_label, col_label)
    for row_label, row in manual_AM.iterrows()
    for col_label, value in row.items()
    if value == 1
]
sources = [source for source, _ in edge_pairs]
targets = [target for _, target in edge_pairs]

# Edge list as a two-column data frame
manual_DAG_df = pd.DataFrame({'source': sources, 'target': targets})

# Remove "Sink" node for now: drop every edge that has it at either end
touches_sink = manual_DAG_df.isin(['"Sink"']).any(axis=1)
manual_DAG_df = manual_DAG_df[~touches_sink]

# Save output
manual_DAG_df.to_csv(f'{occupation_folder}/{occupation}_manual_DAG_df.csv', index=False)

<br>

<br>

# 2) GPT DAG df

### 2.1) One Step Method: Directly ask for pairwise comparison w/o giving the "either" option

In [365]:
# Compare pair of tasks
def task_relationships(occupation, tasks):
    """Ask the model to order every unordered pair of tasks (one-step method).

    Args:
        occupation: Occupation text inserted into the prompt.
        tasks: List of task statements. A '"Sink"' entry, if present, is
            removed from this list in place before the pairs are built.

    Returns:
        The results object returned by running the question over all task
        pairs with `m4` (a model object defined outside this cell).
    """
    # The sink node is not a real task, so it is never compared
    if '"Sink"' in tasks:
        tasks.remove('"Sink"')

    # One scenario per unordered pair, in the order combinations() yields them
    scenarios = []
    for task_A, task_B in combinations(tasks, 2):
        scenarios.append(Scenario({"occupation": occupation, "task_A": task_A, "task_B": task_B}))

    prompt = dedent("""\
        Consider {{ occupation }}. 
        And consider these two tasks: 
        A) {{ task_A }} 
        B) {{ task_B }}
        What is the relationship between these tasks?
        """)
    answer_options = [
        "A would be done first",
        "B would be done first",
        "Not part of the same task sequence",
    ]
    q = QuestionMultipleChoice(
        question_name="ordering",
        question_text=prompt,
        question_options=answer_options,
    )
    return q.by(m4).by(scenarios).run(progress_bar=True)


results = task_relationships(GPT_input_occupation, tasks)
#results.select("task_A", "task_B", "ordering", "comment.ordering_comment").print()
# Keep the pair, the chosen option and the model's comment as a data frame
pairwise_relationships_wo_raw = (
    results
    .select("task_A", "task_B", "ordering", "comment.ordering_comment")
    .to_pandas()
)

Output()

In [366]:
# Orient every answered pair as (earlier task, later task) and drop pairs the
# model said are not part of the same task sequence
pairwise_relationships_wo = pairwise_relationships_wo_raw.copy()
pair_columns = ['scenario.task_A', 'scenario.task_B']

# Where B comes first, exchange the two task columns and relabel the answer
mask = pairwise_relationships_wo['answer.ordering'] == 'B would be done first'
pairwise_relationships_wo.loc[mask, pair_columns] = pairwise_relationships_wo.loc[mask, pair_columns[::-1]].values
pairwise_relationships_wo.loc[mask, 'answer.ordering'] = 'A would be done first'

# After the swap, 'A would be done first' marks every ordered pair
is_ordered = pairwise_relationships_wo['answer.ordering'] == 'A would be done first'
pairwise_relationships_wo = pairwise_relationships_wo.loc[
    is_ordered, ['scenario.task_A', 'scenario.task_B', 'comment.ordering_comment']
]

# Change column names to the edge-list convention used by the other DAG files
column_names = {
    'scenario.task_A': 'source',
    'scenario.task_B': 'target',
    'comment.ordering_comment': 'comment',
}
pairwise_relationships_wo = pairwise_relationships_wo.rename(columns=column_names)

# Save output
pairwise_relationships_wo.to_csv(f'{occupation_folder}/{occupation}_oneStepGPT_DAG_df.csv', index=False)

### 2.2) Two Steps Method: Give option of "either" and then filter symmetric edges
### Step 1:

In [367]:
# Compare pair of tasks
def task_relationships(occupation, tasks):
    """Ask the model to order every unordered pair of tasks, allowing "either order".

    This definition replaces the earlier function of the same name; the only
    difference is the extra "either order" answer option.

    Args:
        occupation: Occupation text inserted into the prompt.
        tasks: List of task statements. A '"Sink"' entry, if present, is
            removed from this list in place before the pairs are built.

    Returns:
        The results object returned by running the question over all task
        pairs with `m4` (a model object defined outside this cell).
    """
    # The sink node is not a real task, so it is never compared
    if '"Sink"' in tasks:
        tasks.remove('"Sink"')

    # One scenario per unordered pair, in the order combinations() yields them
    scenarios = []
    for task_A, task_B in combinations(tasks, 2):
        scenarios.append(Scenario({"occupation": occupation, "task_A": task_A, "task_B": task_B}))

    prompt = dedent("""\
        Consider {{ occupation }}. 
        And consider these two tasks: 
        A) {{ task_A }} 
        B) {{ task_B }}
        What is the relationship between these tasks?
        """)
    answer_options = [
        "A would be done first",
        "B would be done first",
        "Could be done in either order, but still part of the same sequence",
        "Not part of the same task sequence",
    ]
    q = QuestionMultipleChoice(
        question_name="ordering",
        question_text=prompt,
        question_options=answer_options,
    )
    return q.by(m4).by(scenarios).run(progress_bar=True)


results = task_relationships(GPT_input_occupation, tasks)
#results.select("task_A", "task_B", "ordering", "comment.ordering_comment").print()
# Keep the pair, the chosen option and the model's comment as a data frame
pairwise_relationships_w_raw = (
    results
    .select("task_A", "task_B", "ordering", "comment.ordering_comment")
    .to_pandas()
)

Output()

### Step 2:

In [368]:
# Pairs that step 1 answered with the "either order" option
either_order_answer = 'Could be done in either order, but still part of the same sequence'
both_edges = pairwise_relationships_w_raw[pairwise_relationships_w_raw['answer.ordering'] == either_order_answer]
task_A_list = both_edges['scenario.task_A'].tolist()
task_B_list = both_edges['scenario.task_B'].tolist()


# Decide which one of symmetric edges to keep
def pick_oneOf_symmetricEdges(occupation, task_A_list, task_B_list):
    """Re-ask the ordering question for the given pairs with only two options.

    Args:
        occupation: Occupation text inserted into the prompt.
        task_A_list: First task of each pair.
        task_B_list: Second task of each pair, aligned with task_A_list.

    Returns:
        The results object returned by running the question over the pairs
        with `m4` (a model object defined outside this cell).
    """
    # Pairs are formed position by position from the two lists
    scenarios = []
    for task_A, task_B in zip(task_A_list, task_B_list):
        scenarios.append(Scenario({"occupation": occupation, "task_A": task_A, "task_B": task_B}))

    prompt = dedent("""\
        Consider {{ occupation }}. 
        And consider these two tasks: 
        A) {{ task_A }} 
        B) {{ task_B }}
        What is the relationship between these tasks?
        """)
    # Only the two directional answers are offered, forcing a choice
    answer_options = [
        "A would be done first",
        "B would be done first",
    ]
    q = QuestionMultipleChoice(
        question_name="ordering",
        question_text=prompt,
        question_options=answer_options,
    )
    return q.by(m4).by(scenarios).run(progress_bar=True)


results = pick_oneOf_symmetricEdges(GPT_input_occupation, task_A_list, task_B_list)
#results.select("task_A", "task_B", "ordering", "comment.ordering_comment").print()
which_symmetric_edge = (
    results
    .select("task_A", "task_B", "ordering", "comment.ordering_comment")
    .to_pandas()
)

Output()

In [369]:
# Combine the pairs ordered directly in step 1 with the "either order" pairs
# that were resolved in step 2
directional_answers = ['A would be done first', 'B would be done first']
ordered_in_step1 = pairwise_relationships_w_raw[pairwise_relationships_w_raw['answer.ordering'].isin(directional_answers)]
pairwise_relationships_w = pd.concat([ordered_in_step1, which_symmetric_edge], ignore_index=True)

# Where B comes first, exchange the two task columns and relabel the answer
pair_columns = ['scenario.task_A', 'scenario.task_B']
mask = pairwise_relationships_w['answer.ordering'] == 'B would be done first'
pairwise_relationships_w.loc[mask, pair_columns] = pairwise_relationships_w.loc[mask, pair_columns[::-1]].values
pairwise_relationships_w.loc[mask, 'answer.ordering'] = 'A would be done first'

# After the swap, 'A would be done first' marks every ordered pair
is_ordered = pairwise_relationships_w['answer.ordering'] == 'A would be done first'
pairwise_relationships_w = pairwise_relationships_w.loc[
    is_ordered, ['scenario.task_A', 'scenario.task_B', 'comment.ordering_comment']
]

# Change column names to the edge-list convention used by the other DAG files
column_names = {
    'scenario.task_A': 'source',
    'scenario.task_B': 'target',
    'comment.ordering_comment': 'comment',
}
pairwise_relationships_w = pairwise_relationships_w.rename(columns=column_names)

# Save output
pairwise_relationships_w.to_csv(f'{occupation_folder}/{occupation}_twoStepGPT_DAG_df.csv', index=False)

<br>

<br>

# 3) GPT First Last Task Method df

### Use One Step Method: Directly ask for pairwise comparison w/o giving the "either" option
### Next determine first and last task/tasks to be done in the sequence and ask GPT to produce DAG

In [370]:
def first_last_tasks(occupation, tasks):
    """Ask the model which task(s) start and which task(s) end the job.

    Two checkbox questions are run with `m4` (a model object defined outside
    this cell); each allows between 1 and 3 selections out of `tasks`.

    Args:
        occupation: Occupation text inserted into both prompts.
        tasks: List of task statements, used both as prompt context and as
            the answer options. A '"Sink"' entry, if present, is removed
            from this list in place.

    Returns:
        Tuple (first_task, last_task): the selected first task(s) and the
        selected last task(s), each as a list.
    """
    # Remove "Sink" node if it exists
    if '"Sink"' in tasks:
        tasks.remove('"Sink"')

    # Build the scenario from the `occupation` argument rather than from the
    # notebook-level GPT_input_occupation, so the parameter is honoured.
    scenarios = [Scenario({"occupation": occupation, "tasks": tasks})]

    def _as_list(answer):
        # The answer read back from to_pandas() is expected to be a string that
        # looks like a list; parse it, but pass through anything already parsed.
        return ast.literal_eval(answer) if isinstance(answer, str) else answer

    # First task
    q1 = QuestionCheckBox(
        question_name = "firstTask",
        question_text = dedent("""\
            Consider {{ occupation }}.
            The tasks below are part of the job of a {{ occupation }}: {{ tasks }}.
            Among the following, which task or set of tasks would be done before all other tasks in order to compelete the job?
            """),
        question_options = tasks,
        min_selections = 1,
        max_selections = 3
    )
    results1 = q1.by(m4).by(scenarios).run().to_pandas()
    first_task = _as_list(results1['answer.firstTask'][0])

    # Last task
    q2 = QuestionCheckBox(
        question_name = "lastTask",
        question_text = dedent("""\
            Consider {{ occupation }}. 
            The tasks below are part of the job of {{ occupation }}: {{ tasks }}.
            Among the following, which task or set of tasks would be done after all other tasks are completed?
            """),
        question_options = tasks,
        min_selections = 1,
        max_selections = 3
    )
    results2 = q2.by(m4).by(scenarios).run().to_pandas()
    last_task = _as_list(results2['answer.lastTask'][0])

    return first_task, last_task
    

In [371]:
# Query the first/last task(s) for the chosen occupation and show them
first_task, last_task = first_last_tasks(GPT_input_occupation, tasks)
print(f"First task(s): {first_task}")
print(f"Last task(s): {last_task} \n")

First task(s): ['Examine documents to determine degree of risk from factors such as applicant health, financial standing and value, and condition of property.']
Last task(s): ['Decline excessive risks.', 'Authorize reinsurance of policy when risk is high.'] 



In [372]:
# Compare pair of tasks
def task_relationships_firstLast_included(occupation, tasks, first_task, last_task):
    """Order every unordered pair of tasks, telling the model the first/last tasks.

    Args:
        occupation: Occupation text inserted into the prompt.
        tasks: List of task statements. A '"Sink"' entry, if present, is
            removed from this list in place before the pairs are built.
        first_task: List of task statements given to the model as the first
            task(s) of the job.
        last_task: List of task statements given to the model as the last
            task(s) of the job.

    Returns:
        The results object returned by running the question over all task
        pairs with `m4` (a model object defined outside this cell).
    """
    if '"Sink"' in tasks:
        tasks.remove('"Sink"')

    # Each list is shown in the prompt as one string joined with " And "
    first_task_text = " And ".join(first_task)
    last_task_text = " And ".join(last_task)

    # One scenario per unordered pair, in the order combinations() yields them
    scenarios = []
    for task_A, task_B in combinations(tasks, 2):
        scenarios.append(Scenario({"occupation": occupation,
                                   "task_A": task_A, "task_B": task_B,
                                   "first_task": first_task_text, "last_task": last_task_text}))

    prompt = dedent("""\
        Consider {{ occupation }}.
        The first task (or set of tasks) to be completed for the job is: {{ first_task }}
        The last task (or set of tasks) to be completed for the job is: {{ last_task }}. 
        Now consider these two tasks: 
        A) {{ task_A }} 
        B) {{ task_B }}
        What is the relationship between these tasks?
        """)
    answer_options = [
        "A would be done first",
        "B would be done first",
        "Not part of the same task sequence",
    ]
    q = QuestionMultipleChoice(
        question_name="ordering",
        question_text=prompt,
        question_options=answer_options,
    )
    return q.by(m4).by(scenarios).run(progress_bar=True)


results = task_relationships_firstLast_included(GPT_input_occupation, tasks, first_task, last_task)
#results.select("task_A", "task_B", "ordering", "comment.ordering_comment").print()
GPT_firstLast_df_raw = (
    results
    .select("task_A", "task_B", "ordering", "comment.ordering_comment")
    .to_pandas()
)

# Orient every answered pair as (earlier task, later task) and drop pairs the
# model said are not part of the same task sequence
GPT_firstLast_df = GPT_firstLast_df_raw.copy()
pair_columns = ['scenario.task_A', 'scenario.task_B']
mask = GPT_firstLast_df['answer.ordering'] == 'B would be done first'
GPT_firstLast_df.loc[mask, pair_columns] = GPT_firstLast_df.loc[mask, pair_columns[::-1]].values
GPT_firstLast_df.loc[mask, 'answer.ordering'] = 'A would be done first'
is_ordered = GPT_firstLast_df['answer.ordering'] == 'A would be done first'
GPT_firstLast_df = GPT_firstLast_df.loc[
    is_ordered, ['scenario.task_A', 'scenario.task_B', 'comment.ordering_comment']
]

# Change column names to the edge-list convention used by the other DAG files
column_names = {
    'scenario.task_A': 'source',
    'scenario.task_B': 'target',
    'comment.ordering_comment': 'comment',
}
GPT_firstLast_df = GPT_firstLast_df.rename(columns=column_names)

# Save output
GPT_firstLast_df.to_csv(f'{occupation_folder}/{occupation}_firstLastTaskGPT_DAG_df.csv', index=False)

Output()

<br>

<br>

# 4) GPT Triangles/Conditioned Method df

#### Approach: Use First Last method in creating original GPT DAG. Next use the "Triangles" or "Conditioning" method for narrowing down set of edges.

<br> 

### Step 1:

#### Find all "triangles", defined as cases with:
##### A --> B --> C
##### A --> C

In [373]:
# Read the edge list saved by the first/last-task method
GPT_AM_df = pd.read_csv(f'{occupation_folder}/{occupation}_firstLastTaskGPT_DAG_df.csv')

# Turn the edge list into an adjacency matrix over `tasks`:
# start from all zeros and set cell (source, target) to 1 for every edge
GPT_AM = pd.DataFrame(0, index=tasks, columns=tasks)
for source_task, target_task in zip(GPT_AM_df['source'], GPT_AM_df['target']):
    GPT_AM.at[source_task, target_task] = 1

In [374]:
def find_triangles(matrix):
    """Find all triangles x -> y -> z with the shortcut edge x -> z.

    A triangle is an ordered triple of three distinct node indices (x, y, z)
    such that the matrix holds a 1 at (x, y), at (x, z) and at (y, z).

    Args:
        matrix: Adjacency matrix as a numpy array, or any object with a
            `to_numpy()` method (e.g. a pandas DataFrame). Entry (i, j) == 1
            means there is an edge i -> j.

    Returns:
        List of [x, y, z] index triples (plain ints), ordered by x, then y,
        then z in ascending order.
    """
    # Ensure matrix is a numpy array
    if not isinstance(matrix, np.ndarray):
        matrix = matrix.to_numpy()

    # Node indices run over the rows of the matrix
    n = matrix.shape[0]

    # Outgoing-edge targets of every node, computed once instead of once per
    # candidate triple. np.where returns the indices in ascending order.
    successors = [np.where(matrix[node] == 1)[0].tolist() for node in range(n)]
    successor_sets = [set(targets) for targets in successors]

    triangles = []
    for x in range(n):
        # Candidates for y and z: targets of x that are valid node indices
        # and are not x itself (self-loops never form a triangle)
        candidates = [node for node in successors[x] if node != x and node < n]
        for y in candidates:
            for z in candidates:
                # x -> y and x -> z hold by construction; need y -> z as well
                if z != y and z in successor_sets[y]:
                    triangles.append([x, y, z])

    return triangles

# Enumerate the triangles of the GPT adjacency matrix and report a short summary
GPT_AM_triangles_list = find_triangles(GPT_AM)
triangle_examples = GPT_AM_triangles_list[:5]
triangle_count = len(GPT_AM_triangles_list)
print(f'Examples of triangles: {triangle_examples}')
print(f'Count of triangles: {triangle_count}')

Examples of triangles: [[1, 3, 0], [1, 3, 5], [1, 4, 0], [1, 4, 3], [1, 4, 5]]
Count of triangles: 29


### Step 2: 
#### Ask GPT whether conditional on having B --> C we need A --> C

In [375]:
def triangle_check(occupation, tasks, triangles_list):
    """Ask the model, for each triangle A -> B -> C with A -> C, what C requires.

    Args:
        occupation: Occupation text inserted into the prompt.
        tasks: List of task statements; triangle entries index into it.
        triangles_list: Sequence of [A, B, C] index triples.

    Returns:
        The results object returned by running the question over all
        triangles with `m4` (a model object defined outside this cell).
    """
    # Columns 0, 1, 2 of the triangle array hold the A, B and C indices
    triangles = np.array(triangles_list)
    scenarios = []
    for idx_A, idx_B, idx_C in zip(triangles[:, 0], triangles[:, 1], triangles[:, 2]):
        scenarios.append(Scenario({"occupation": occupation, "task_A": tasks[idx_A], "task_B": tasks[idx_B], "task_C": tasks[idx_C]}))

    prompt = dedent("""\
        Consider {{ occupation }}. 
        And consider these three tasks: 
        A) {{ task_A }} 
        B) {{ task_B }}
        C) {{ task_C }} 
        What are the prerequisites of doing task C?
        """)
    answer_options = [
        "C can be done after A without having to do B",
        "C can only be done after B",
        "These are not part of the same task sequence",
    ]
    q = QuestionMultipleChoice(
        question_name="ordering",
        question_text=prompt,
        question_options=answer_options,
    )
    return q.by(m4).by(scenarios).run(progress_bar=True)


results = triangle_check(GPT_input_occupation, tasks, GPT_AM_triangles_list)
#results.select("task_A", "task_B", "task_C", "ordering", "comment.ordering_comment").print()
GPT_trianglesCheck_output = (
    results
    .select("task_A", "task_B", "task_C", "ordering", "comment.ordering_comment")
    .to_pandas()
)
# Sort so that triangles sharing the same A --> C edge sit next to each other
triangle_sort_keys = ['scenario.task_A', 'scenario.task_C', 'scenario.task_B']
GPT_trianglesCheck_output = GPT_trianglesCheck_output.sort_values(by=triangle_sort_keys).reset_index(drop=True)

Output()

### In cases where A --> C is shared among multiple triangles, only delete when all triangles say delete

In [376]:
# Rebuild the triangle-check table from `results` and sort it so that rows
# sharing the same (task_A, task_C) pair are adjacent.
# (Same select/sort as the previous cell, but without resetting the index.)
GPT_trianglesCheck_output = results.select("task_A", "task_B", "task_C", "ordering", "comment.ordering_comment").to_pandas()
GPT_trianglesCheck_output = GPT_trianglesCheck_output.sort_values(by=['scenario.task_A', 'scenario.task_C', 'scenario.task_B'])

# Step 1: Find the count of triangles for each A --> C pair
# (number of rows in each (task_A, task_C) group, repeated on every row of the group)
GPT_trianglesCheck_output['AC_pair_triangles_count'] = GPT_trianglesCheck_output.groupby(['scenario.task_A', 'scenario.task_C'])['scenario.task_A'].transform('count')


# Step 2: Find if all triangles say delete
# For each A --> C pair, compute the percentage of its triangles answered
# 'C can only be done after B'; the edge is removed only when that is 100.
aux_df = GPT_trianglesCheck_output.groupby(['scenario.task_A', 'scenario.task_C'])['answer.ordering'].apply(lambda x: (x == 'C can only be done after B').mean()*100).reset_index()
aux_df.columns = ['scenario.task_A', 'scenario.task_C', 'fraction_triangles_say_delete']
edges_to_remove = aux_df[aux_df['fraction_triangles_say_delete']==100]


# Step 3: Delete the rows where all triangles say delete
# Left-merge with indicator=True: rows marked 'left_only' have no match in
# edges_to_remove, i.e. their A --> C edge is kept. The helper columns are
# dropped afterwards.
modified_GPT_trianglesCheck = pd.merge(GPT_trianglesCheck_output, edges_to_remove, how='left', 
                              on=['scenario.task_A', 'scenario.task_C'], 
                              indicator=True)
modified_GPT_trianglesCheck = modified_GPT_trianglesCheck[modified_GPT_trianglesCheck['_merge'] == 'left_only'].drop(columns=['_merge', 'AC_pair_triangles_count', 'fraction_triangles_say_delete'])
modified_GPT_trianglesCheck = modified_GPT_trianglesCheck.reset_index(drop=True)

#### Create a variable saying how many times each node appears as which node in a triangle
##### Purpose: find quadrangles

In [377]:
# Count, for every task, how often it appears as node A, node B and node C
# among the remaining triangles. Start from a zero table with one row per
# role and one column per task.
role_columns = ['scenario.task_A', 'scenario.task_B', 'scenario.task_C']
aux_df = pd.DataFrame(0, index=role_columns, columns=tasks)

# Fill one role (row) at a time from the value counts of that column
for col in role_columns:
    value_counts = modified_GPT_trianglesCheck[col].value_counts()
    aux_df.loc[col, value_counts.index] = value_counts.values

# Transpose so that tasks are rows and roles are columns
aux_df = aux_df.T

# Keep tasks which are sometimes node A of a triangle and sometimes node B of a triangle
#aux_df = aux_df[(aux_df > 0).all(axis=1)]
print('Nodes stats as nodes A, B, C of a triangle:')
aux_df

# get list of pivotal tasks
#pivotal_tasks = aux_df.index.tolist()

Nodes stats as nodes A, B, C of a triangle:


Unnamed: 0,scenario.task_A,scenario.task_B,scenario.task_C
Decline excessive risks.,0,0,8
"Write to field representatives, medical personnel, or others to obtain further information, quote rates, or explain company underwriting policies.",0,7,1
Evaluate possibility of losses due to catastrophe or excessive insurance.,5,5,0
"Decrease value of policy when risk is substandard and specify applicable endorsements or apply rating to ensure safe, profitable distribution of risks, using reference materials.",0,5,4
Review company records to determine amount of insurance in force on single risk or group of closely related risks.,2,3,1
Authorize reinsurance of policy when risk is high.,0,0,6
"Examine documents to determine degree of risk from factors such as applicant health, financial standing and value, and condition of property.",13,0,0


##### In cases where 
A --> B --> C and D --> A --> C 
##### the situation is different from when 
A --> B --> C and A --> D --> C
##### In such cases, edges A --> C and D --> C must be considered simultaneously as triangles are not totally "independent". 
#### So we look for "quadrangles"

In [378]:
# Multiple-choice question for a "quadrangle": triangles A --> B --> C and
# D --> A --> C that share node C. The template expects the scenario fields
# occupation, tasks, task_A, task_B, task_C and task_D.
# The trailing comment on each option records which of the edges A --> C and
# D --> C that answer keeps or drops. The option texts are compared verbatim
# when the keep_AC / keep_DC flags are computed, so they must not be reworded
# here without updating that cell too.
q_AC_DC = QuestionMultipleChoice(
    question_name = "AC_DC",
    question_text = dedent("""\
        Consider {{ occupation }} as an occupation.
        And consider these tasks: {{ tasks }}.           
        As part of the steps leading up to completion of this job '{{ task_B }}' is done after '{{ task_A }}' but before '{{ task_C }}'.
        Furthermore, '{{ task_A }}' is done after '{{ task_D }}' but before '{{ task_C }}'.
        Given this structure, determine if A and D below are direct prerequisites of doing C?
        A) {{ task_A }}
        B) {{ task_B }}
        C) {{ task_C }}
        D) {{ task_D }}
        """),
    question_options = [
    "C can be done after B only after both A and D have been done earlier", # drop AC, drop DC
    "C can be done after A only after having done D first, but without having to do B", # keep AC, drop DC
    "C can be done immediately after D without having to do A or B, but it cannot be done after A without having done D or B first", # drop AC, keep DC
    "C can be done immediately after D without having to do A or B, and it can also be done after A without having done D or B", # keep AC, keep DC
    "These are not part of the same task sequence"
    ]        
)
##### Note:
##### These question options must be checked: it is unclear whether they work properly or whether our travel agents example simply does not exhaust all possibilities.

In [379]:
# Find "quadrangles": ordered task 4-tuples (A, B, C, D) for which both the
# triangle (A, B, C) and the triangle (D, A, C) are present among the
# remaining triangles.

# Collect the remaining triangles once as a set of (A, B, C) triples so each
# candidate is checked by set lookup instead of two full DataFrame scans.
triangle_triples = set(zip(
    modified_GPT_trianglesCheck['scenario.task_A'],
    modified_GPT_trianglesCheck['scenario.task_B'],
    modified_GPT_trianglesCheck['scenario.task_C'],
))

# Candidates are visited in the order itertools.permutations yields them,
# which fixes the order of the resulting list.
quadrangles_tasks = [
    (A, B, C, D)
    for A, B, C, D in itertools.permutations(tasks, 4)
    if (A, B, C) in triangle_triples and (D, A, C) in triangle_triples
]

# Ask the quadrangle question once per 4-tuple found
scenarios = [Scenario({"occupation": GPT_input_occupation, "tasks": tasks,
                "task_A": A, "task_B": B, "task_C": C, "task_D": D})
                for A, B, C, D in quadrangles_tasks]
results_AC_DC = q_AC_DC.by(m4).by(scenarios).run()
#results_AC_DC.select(['answer.AC_DC', 'scenario.task_A', 'scenario.task_B', 'scenario.task_C', 'scenario.task_D', 'comment.AC_DC_comment']).print()
quadrangles_df = results_AC_DC.select(['answer.AC_DC', 'scenario.task_A', 'scenario.task_B', 'scenario.task_C', 'scenario.task_D', 'comment.AC_DC_comment']).to_pandas()

In [380]:
# Translate each answer into two flags: keep edge A --> C, keep edge D --> C.
# An edge is kept only for the answer texts listed for it; every other answer
# (including "not part of the same task sequence") gives False.
answers_keeping_AC = [
    'C can be done after A only after having done D first, but without having to do B',
    'C can be done immediately after D without having to do A or B, and it can also be done after A without having done D or B',
]
answers_keeping_DC = [
    'C can be done immediately after D without having to do A or B, but it cannot be done after A without having done D or B first',
    'C can be done immediately after D without having to do A or B, and it can also be done after A without having done D or B',
]
quadrangles_df['keep_AC'] = quadrangles_df['answer.AC_DC'].map(lambda answer: answer in answers_keeping_AC)
quadrangles_df['keep_DC'] = quadrangles_df['answer.AC_DC'].map(lambda answer: answer in answers_keeping_DC)
quadrangles_df

Unnamed: 0,answer.AC_DC,comment.AC_DC_comment,scenario.task_A,scenario.task_B,scenario.task_C,scenario.task_D,keep_AC,keep_DC
0,C can be done after A only after having done D...,C (Decline excessive risks) can be done after ...,Evaluate possibility of losses due to catastro...,"Write to field representatives, medical person...",Decline excessive risks.,Review company records to determine amount of ...,True,False
1,C can be done after A only after having done D...,"According to the given structure, 'Decline exc...",Evaluate possibility of losses due to catastro...,"Write to field representatives, medical person...",Decline excessive risks.,Examine documents to determine degree of risk ...,True,False
2,C can be done after A only after having done D...,C (Decrease value of policy when risk is subst...,Evaluate possibility of losses due to catastro...,"Write to field representatives, medical person...",Decrease value of policy when risk is substand...,Examine documents to determine degree of risk ...,True,False
3,C can be done after A only after having done D...,C (Authorize reinsurance of policy when risk i...,Evaluate possibility of losses due to catastro...,"Write to field representatives, medical person...",Authorize reinsurance of policy when risk is h...,Examine documents to determine degree of risk ...,True,False
4,C can be done after A only after having done D...,C (Decline excessive risks) can be done after ...,Evaluate possibility of losses due to catastro...,Decrease value of policy when risk is substand...,Decline excessive risks.,Review company records to determine amount of ...,True,False
5,C can be done after A only after having done D...,"Based on the given structure, 'Decline excessi...",Evaluate possibility of losses due to catastro...,Decrease value of policy when risk is substand...,Decline excessive risks.,Examine documents to determine degree of risk ...,True,False
6,C can be done after B only after both A and D ...,C (Authorize reinsurance of policy when risk i...,Evaluate possibility of losses due to catastro...,Decrease value of policy when risk is substand...,Authorize reinsurance of policy when risk is h...,Examine documents to determine degree of risk ...,False,False
7,C can be done after B only after both A and D ...,C (Decline excessive risks) can be done after ...,Review company records to determine amount of ...,Evaluate possibility of losses due to catastro...,Decline excessive risks.,Examine documents to determine degree of risk ...,False,False
8,C can be done after B only after both A and D ...,C (Decline excessive risks) can only be done a...,Review company records to determine amount of ...,Decrease value of policy when risk is substand...,Decline excessive risks.,Examine documents to determine degree of risk ...,False,False


#### Drop extra AC and DC edges

In [381]:
def remove_duplicates(input_list):
    """Return the items of input_list without repeats, in first-seen order.

    Items must be hashable. The input is not modified.
    """
    # dict keys are unique and keep insertion order
    return list(dict.fromkeys(input_list))

# Step 1: every A --> C and D --> C edge that occurs in some quadrangle,
# without repeats (A --> C edges first, then D --> C edges)
pairs_AC = list(zip(quadrangles_df["scenario.task_A"], quadrangles_df["scenario.task_C"]))
pairs_DC = list(zip(quadrangles_df["scenario.task_D"], quadrangles_df["scenario.task_C"]))
all_pairs = pairs_AC + pairs_DC
ACDC_edges_list = remove_duplicates(all_pairs)


# Step 2: edges that at least one quadrangle says to keep
aux_df = quadrangles_df[quadrangles_df['keep_AC'].eq(True)]
pairs_AC_toKeep = list(zip(aux_df["scenario.task_A"], aux_df["scenario.task_C"]))
aux_df = quadrangles_df[quadrangles_df['keep_DC'].eq(True)]
pairs_DC_toKeep = list(zip(aux_df["scenario.task_D"], aux_df["scenario.task_C"]))
all_pairs_toKeep = pairs_AC_toKeep + pairs_DC_toKeep
ACDC_edges_toKeep_list = remove_duplicates(all_pairs_toKeep)


# Step 3: edges to drop are those that no quadrangle says to keep
edges_kept = set(ACDC_edges_toKeep_list)
ACDC_edges_toDrop_list = [edge for edge in ACDC_edges_list if edge not in edges_kept]


# Add the edges dropped here to the edges already marked for removal earlier
ACDC_edges_to_remove = pd.DataFrame(ACDC_edges_toDrop_list, columns=["scenario.task_A", "scenario.task_C"])
edges_to_remove = pd.concat([edges_to_remove, ACDC_edges_to_remove], ignore_index=True)

In [382]:
# Remove redundant edges
# Only the two endpoint columns of edges_to_remove are used as merge keys.
# edges_to_remove also carries the helper column 'fraction_triangles_say_delete';
# merging the whole frame would carry that column into the saved CSV.
# Repeated edges are collapsed so each edge is matched once.
edge_keys = edges_to_remove[['scenario.task_A', 'scenario.task_C']].drop_duplicates()

# Left-merge with indicator=True: rows marked 'left_only' are edges of the
# GPT DAG that are NOT listed for removal, so those are the ones kept.
modified_GPT_AM_df = pd.merge(GPT_AM_df, edge_keys, how='left',
                              left_on=['source', 'target'], right_on=['scenario.task_A', 'scenario.task_C'],
                              indicator=True)
modified_GPT_AM_df = modified_GPT_AM_df[modified_GPT_AM_df['_merge'] == 'left_only'].drop(columns=['_merge', 'scenario.task_A', 'scenario.task_C'])
modified_GPT_AM_df = modified_GPT_AM_df.reset_index(drop=True)

# Save output
modified_GPT_AM_df.to_csv(f'{occupation_folder}/{occupation}_conditionedGPT_DAG_df.csv', index=False)

<br>

<br>

# 5) GPT Task Partitioning Method df

### Start w/ breaking down the DAG into multiple minimally-connected subgraphs

In [383]:
# Partition tasks into groups
def partition_tasks(occupation, tasks):
    """Ask the model to split the tasks into minimally connected groups.

    Args:
        occupation: Occupation text inserted into the prompt.
        tasks: List of task statements. A '"Sink"' entry, if present, is
            removed from this list in place.

    Returns:
        The results object returned by running the free-text question with
        `m4` (a model object defined outside this cell). The prompt asks for
        the answer in the form "Group x: ['task1', 'task2', 'task3']".
    """
    if '"Sink"' in tasks:
        tasks.remove('"Sink"')

    scenarios = [Scenario({"occupation": occupation, "tasks": tasks})]

    q = QuestionFreeText(
        question_name = "partition",
        # The backslash in "\\n" is escaped so the prompt shows the two characters
        # \n. An unescaped \n would be a real line break in the middle of the
        # sentence, and the resulting short line would also stop dedent() from
        # removing the indentation of the other lines.
        question_text = dedent("""\
            Consider {{ occupation }}. 
            And consider these tasks: {{ tasks }}.
            Can these tasks be partitioned into separate, minimally connected groups of tasks?
            If so, give the number of groups and list tasks in each group. 
            Avoid using \\n in the answer, and list groups in the following format: Group x: ['task1', 'task2', 'task3'].
            """)
    )
    results = q.by(m4).by(scenarios).run(progress_bar = True)
    return results

results = partition_tasks(GPT_input_occupation, tasks)
#results.print()
# The single free-text answer, as a string
partition_tasks_output_str = results.select("answer.partition").to_pandas().iloc[0,0]

Output()

#### Group tasks into smaller partitions

In [384]:
# Find all "Group x" occurrences in LLM output
groups = re.findall(r'Group \d+', partition_tasks_output_str)

# Split the text at each "Group x:" header. Because the pattern is captured,
# the result alternates: text before the first header, header, text, header, ...
parts = re.split(r'(Group \d+:)', partition_tasks_output_str)

# Map group number -> text that follows its header
partitions_dict = {}
for header, body in zip(parts[1::2], parts[2::2]):
    group_name = header.strip(': ')
    group_number = int(re.search(r'\d+', group_name).group())
    # Trim surrounding whitespace and any trailing '.' or ','.
    # The value stays a string; it is not parsed into a list.
    group_text = body.strip().rstrip('.,')
    partitions_dict[group_number] = group_text

# Output the dictionary
partitions_dict

{1: "['Decline excessive risks.', 'Evaluate possibility of losses due to catastrophe or excessive insurance.', 'Decrease value of policy when risk is substandard and specify applicable endorsements or apply rating to ensure safe, profitable distribution of risks, using reference materials.', 'Authorize reinsurance of policy when risk is high.']",
 2: "['Write to field representatives, medical personnel, or others to obtain further information, quote rates, or explain company underwriting policies.', 'Review company records to determine amount of insurance in force on single risk or group of closely related risks.', 'Examine documents to determine degree of risk from factors such as applicant health, financial standing and value, and condition of property.']"}

#### Determine relation of partitions

In [385]:
# Compare each pair of task partitions
def partition_relationships(occupation, partitions_dict):
    """Ask the model how every unordered pair of partitions relates in time.

    Args:
        occupation: Occupation text substituted into the prompt.
        partitions_dict: Mapping whose values are the partition descriptions;
            each 2-combination of the values becomes one scenario.

    Returns:
        The object returned by running the multiple-choice question over all
        scenarios with model ``m4`` (defined elsewhere in the notebook).
    """
    partition_pairs = itertools.combinations(partitions_dict.values(), 2)
    scenarios = []
    for first, second in partition_pairs:
        scenarios.append(
            Scenario({"occupation": occupation, "partition_A": first, "partition_B": second})
        )

    prompt = dedent("""\
            Consider {{ occupation }}. 
            And consider these two partitions of tasks: 
            A) {{ partition_A }} 
            B) {{ partition_B }}
            What is the relationship between these groups of tasks?
            """)
    answer_choices = [
        "Tasks of partition A would be done first",
        "Tasks of partition B would be done first",
        "Could be done in either order, but still part of the same sequence",
        "Not part of the same task sequence",
    ]
    question = QuestionMultipleChoice(
        question_name="ordering",
        question_text=prompt,
        question_options=answer_choices,
    )
    return question.by(m4).by(scenarios).run(progress_bar=True)

# Query the model for the ordering of every pair of partitions and keep the
# scenario inputs, the chosen option and the model's comment as a data frame.
results = partition_relationships(GPT_input_occupation, partitions_dict)
#results.print()
partitions_ordering_df = (
    results
    .select("partition_A", "partition_B", "ordering", "ordering_comment")
    .to_pandas()
)

Output()

In [386]:
# Swap columns so that all partitions in first column are done earlier
mask = partitions_ordering_df['answer.ordering'] == 'Tasks of partition B would be done first'
# .values strips the column labels; without it pandas would align on column
# names and the assignment would leave the two columns unchanged.
partitions_ordering_df.loc[mask, ['scenario.partition_A', 'scenario.partition_B']] = partitions_ordering_df.loc[mask, ['scenario.partition_B', 'scenario.partition_A']].values
partitions_ordering_df.loc[mask, 'answer.ordering'] = 'Tasks of partition A would be done first'

# Keep only pairs with a definite order. Take an explicit copy so that adding
# columns to the filtered frame later does not operate on a slice of the
# original (which raises SettingWithCopyWarning).
partitions_ordering_df = partitions_ordering_df[partitions_ordering_df['answer.ordering'] == 'Tasks of partition A would be done first'].copy()

In [387]:
# Invert partitions_dict (partition text -> group number) so the partition
# strings stored in the ordering table can be translated back to group numbers.
aux_dict = dict(zip(partitions_dict.values(), partitions_dict.keys()))

# Attach the group number of the earlier (A) and later (B) partition
partitions_ordering_df['partition_A_groupNum'] = (
    partitions_ordering_df['scenario.partition_A'].map(aux_dict)
)
partitions_ordering_df['partition_B_groupNum'] = (
    partitions_ordering_df['scenario.partition_B'].map(aux_dict)
)
partitions_ordering_df

Unnamed: 0,answer.ordering,comment.ordering_comment,scenario.partition_A,scenario.partition_B,partition_A_groupNum,partition_B_groupNum
0,Tasks of partition A would be done first,Tasks of partition B involve gathering and rev...,"['Write to field representatives, medical pers...","['Decline excessive risks.', 'Evaluate possibi...",2,1


In [388]:
# Compare pair of tasks within each partition
def task_relationships_within_partition(occupation, tasks):
    """Ask the model to order every unordered pair of tasks of one partition.

    Args:
        occupation: Occupation text substituted into the prompt.
        tasks: List of task strings. Any '"Sink"' placeholder entry is
            ignored; the list passed in is not modified.

    Returns:
        The object returned by running the multiple-choice question over all
        task pairs with model ``m4`` (defined elsewhere in the notebook).
    """
    # Build a filtered copy rather than calling tasks.remove(), so the
    # caller's list is left untouched.
    tasks = [task for task in tasks if task != '"Sink"']

    scenarios = [Scenario({"occupation": occupation, "task_A": task_A, "task_B": task_B}) 
        for task_A, task_B in combinations(tasks, 2)]

    q = QuestionMultipleChoice(
        question_name = "ordering",
        question_text = dedent("""\
            Consider {{ occupation }}. 
            And consider these two tasks: 
            A) {{ task_A }} 
            B) {{ task_B }}
            What is the relationship between these tasks?
            """),
        question_options = [
            "A would be done first", 
            "B would be done first",
            "Not part of the same task sequence"]
    )
    results = q.by(m4).by(scenarios).run(progress_bar = True)
    return results


# Function to handle apostrophes in the list string
def clean_list_string(s):
    """Backslash-escape the apostrophe in every unescaped "'s costs".

    The model returns task lists as single-quoted Python list literals, so a
    bare possessive apostrophe inside a task would end the string early when
    the text is later parsed. Only the specific phrase "'s costs" is handled,
    and occurrences already preceded by a backslash are left alone.
    """
    unescaped_possessive = re.compile(r"(?<!\\)'s costs")
    # In a replacement template "\\" stands for one literal backslash
    return unescaped_possessive.sub(r"\\'s costs", s)


# Collect one result frame per partition and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies all earlier
# rows on every iteration, and starting from an empty DataFrame can affect
# the resulting dtypes.
within_partition_frames = []
for key, value in partitions_dict.items():
    # Get list of tasks in the partition (value is the string form of a list)
    my_partition_tasks = clean_list_string(value)
    my_partition_tasks = ast.literal_eval(my_partition_tasks)
    # A single task has no pair to compare
    if len(my_partition_tasks) < 2:
        continue

    # Run the function
    results = task_relationships_within_partition(GPT_input_occupation, my_partition_tasks)
    aux_df = results.select("task_A", "task_B", "ordering", "comment.ordering_comment").to_pandas()
    aux_df['partition'] = key

    within_partition_frames.append(aux_df)

# Fall back to an empty frame when no partition had at least two tasks
task_relationships_within_partition_df = (
    pd.concat(within_partition_frames, ignore_index=True)
    if within_partition_frames
    else pd.DataFrame()
)

Output()

Output()

In [389]:
# Compare pairs of tasks taken from two different partitions
def task_relationships_between_partitions(occupation, tasks_partition1, tasks_partition2):
    """Ask the model whether each partition-1 task is a prerequisite of each partition-2 task.

    The prompt tells the model that partition 1 is done before partition 2,
    so callers must pass the earlier partition as ``tasks_partition1``.

    Args:
        occupation: Occupation text substituted into the prompt.
        tasks_partition1: List of task strings of the partition done first.
        tasks_partition2: List of task strings of the partition done second.
            In both lists any '"Sink"' placeholder entry is ignored; the
            lists passed in are not modified.

    Returns:
        The object returned by running the multiple-choice question over the
        Cartesian product of the two task lists with model ``m4`` (defined
        elsewhere in the notebook).
    """
    # Build filtered copies rather than calling .remove(), so the caller's
    # lists are left untouched.
    tasks_partition1 = [task for task in tasks_partition1 if task != '"Sink"']
    tasks_partition2 = [task for task in tasks_partition2 if task != '"Sink"']

    scenarios = [Scenario({"occupation": occupation, 
                           "tasks_partition1": tasks_partition1, "tasks_partition2": tasks_partition2,
                           "task_A": task_A, "task_B": task_B}) 
        for task_A, task_B in itertools.product(tasks_partition1, tasks_partition2)]

    q = QuestionMultipleChoice(
        question_name = "ordering",
        question_text = dedent("""\
            Consider {{ occupation }}. 
            And consider these two partitions of tasks; partition 1: {{ tasks_partition1 }} and partition 2: {{ tasks_partition2 }}.
            We know that tasks in partition 1 would be done before tasks in partition 2.
            Now consider these two tasks:
            A) {{ task_A }} 
            B) {{ task_B }}
            Knowing that task A is from partition 1 and task B is from partition 2, what is the relationship between these tasks?
            """),
        question_options = [
            "A must be done in order to do B", 
            "A is not required for doing B",
            ]
    )
    results = q.by(m4).by(scenarios).run(progress_bar = True)
    return results


# (earlier group, later group) pairs established by the partition ordering
# step; computed once instead of filtering the data frame on every iteration.
ordered_partition_pairs = set(zip(partitions_ordering_df['partition_A_groupNum'],
                                  partitions_ordering_df['partition_B_groupNum']))

between_partition_frames = []
for key1, key2 in itertools.combinations(partitions_dict, 2):
    # determine which partition is done first
    if (key1, key2) in ordered_partition_pairs:
        first_partition = key1
        second_partition = key2
    elif (key2, key1) in ordered_partition_pairs:
        first_partition = key2
        second_partition = key1
    else:
        # No definite order between these two partitions: no edges to ask about
        continue

    # Get list of tasks in each partition, in execution order. The prompt
    # states that partition 1 precedes partition 2, so the earlier partition
    # must be passed first (not simply the one with the lower dict position).
    tasks_partition1 = ast.literal_eval(clean_list_string(partitions_dict[first_partition]))
    tasks_partition2 = ast.literal_eval(clean_list_string(partitions_dict[second_partition]))
    
    # Run the function
    results = task_relationships_between_partitions(GPT_input_occupation, tasks_partition1, tasks_partition2)
    aux_df = results.select("task_A", "task_B", "ordering", "comment.ordering_comment").to_pandas()
    
    between_partition_frames.append(aux_df)

# Concatenate once; fall back to an empty frame when no pair was ordered
task_relationships_between_partitions_df = (
    pd.concat(between_partition_frames, ignore_index=True)
    if between_partition_frames
    else pd.DataFrame()
)

Output()

In [390]:
# Columns that describe one edge
edge_columns = ['scenario.task_A', 'scenario.task_B', 'comment.ordering_comment']

# Between partitions: only pairs where A is a prerequisite of B are edges
between_edges = task_relationships_between_partitions_df[task_relationships_between_partitions_df['answer.ordering'] == 'A must be done in order to do B']
between_edges = between_edges[edge_columns]

# Within partitions: the answer can be "A would be done first",
# "B would be done first" or "Not part of the same task sequence".
# Flip the pairs where B comes first so that task_A is always the earlier
# task, then drop pairs that are not part of the same sequence.
# NOTE: the model's comment is kept verbatim, so for flipped rows it still
# refers to the tasks by their original A/B labels.
within_ordering = task_relationships_within_partition_df.copy()
b_first_mask = within_ordering['answer.ordering'] == 'B would be done first'
# .values strips the column labels so the two columns are actually swapped
within_ordering.loc[b_first_mask, ['scenario.task_A', 'scenario.task_B']] = within_ordering.loc[b_first_mask, ['scenario.task_B', 'scenario.task_A']].values
within_ordering.loc[b_first_mask, 'answer.ordering'] = 'A would be done first'
within_edges = within_ordering[within_ordering['answer.ordering'] == 'A would be done first']
within_edges = within_edges[edge_columns]

# Combine edges from within and between partitions
partitions_DAG_df = pd.concat([within_edges, between_edges], ignore_index=True)

# Change column names
partitions_DAG_df = partitions_DAG_df.rename(columns={'scenario.task_A': 'source', 
                                                    'scenario.task_B': 'target', 
                                                    'comment.ordering_comment': 'comment'})

# Save output
partitions_DAG_df.to_csv(f'{occupation_folder}/{occupation}_partitionedGPT_DAG_df.csv', index=False)