# Load libraries and functions

In [27]:
# Execute the shared setup scripts in this notebook's global namespace, so
# every name they define or import becomes available to the cells below.
# Both paths are relative to the current working directory.
# NOTE(review): later cells use names that are never imported here (pd, np,
# itertools, getpass, dedent, combinations, Scenario, QuestionMultipleChoice,
# QuestionCheckBox, m4, pick_occupation, add_sink_node) -- presumably these
# two files provide them; confirm.
with open('libraries.py') as f:
    code = f.read()
exec(code)

# functions.py is executed second, after libraries.py has populated the namespace.
with open('functions.py') as f:
    code = f.read()
exec(code)

In [28]:
# determine user
# Project paths are keyed on the OS login name returned by getpass.getuser().
# NOTE(review): only 'peymansh' is handled. For any other user this cell
# assigns neither main_folder_path nor data_path, so the read_csv call that
# uses data_path below would fail with a NameError unless one of the exec'd
# setup files defines it -- confirm, and add a branch per collaborator.
user = getpass.getuser()
if user == 'peymansh':
    main_folder_path = '/Users/peymansh/Dropbox (MIT)/Research/AI and Occupations/ai-exposure'
    data_path = f'{main_folder_path}/output'

## Main Code

In [29]:
# Pick occupation and initialize variables
# `occupation` is the short key used in the input/output file names below;
# switch occupations by swapping which assignment is commented out.
occupation = 'travelAgents'
#occupation = 'insuranceUnderwriters'

# pick_occupation is defined outside this notebook (presumably functions.py).
# Its four return values are used below as: the occupation text inserted into
# the GPT prompts, a plot title (not used in this section), the code matched
# against onet.occ_code, and the folder holding the per-occupation CSV files.
GPT_input_occupation, plot_title_occupation, occupation_code, occupation_folder = pick_occupation(occupation)

In [30]:
# Load the yearly onet task table, order it, and keep only the 2023 rows
onet_csv_path = f'{data_path}/data/onet_occupations_yearly.csv'
onet = (
    pd.read_csv(onet_csv_path)
    .sort_values(by=['year', 'occ_code', 'occ_title', 'task_id'])
    .loc[lambda frame: frame['year'] == 2023]
    .reset_index(drop=True)
)

# Distinct task statements of the selected occupation, in order of first
# appearance in the sorted table
is_selected_occupation = onet['occ_code'] == f'{occupation_code}'
is_2023 = onet['year'] == 2023
my_df = onet[is_selected_occupation & is_2023]
tasks = my_df['task'].unique().tolist()

<br>

<br>

# Manual DAG df

In [31]:
# Load the hand-built adjacency matrix (row label = source task, column
# label = target task) and pass it through add_sink_node
manual_AM = add_sink_node(
    pd.read_csv(f'{occupation_folder}/{occupation}_AM.csv', index_col=0),
    occupation,
)

# Every cell equal to 1 is a directed edge from its row label to its column label
edge_pairs = [
    (row_label, col_label)
    for row_label, row in manual_AM.iterrows()
    for col_label, value in row.items()
    if value == 1
]
sources = [edge_source for edge_source, _ in edge_pairs]
targets = [edge_target for _, edge_target in edge_pairs]

# Edge list with one (source, target) row per edge
manual_DAG_df = pd.DataFrame({'source': sources, 'target': targets})

# Save output
manual_DAG_df.to_csv(f'{occupation_folder}/{occupation}_manual_DAG_df.csv', index=False)

<br>

<br>

# GPT DAG df

## One Step Method: Directly ask for pairwise comparison w/o giving the "either" option

In [32]:
# Compare pair of tasks
def task_relationships(occupation, tasks):
    """Ask the model to order every unordered pair of tasks (no "either" option).

    Parameters
    ----------
    occupation
        Value substituted for ``{{ occupation }}`` in the prompt.
    tasks
        List of task statements. A '"Sink"' entry, if present, is ignored.
        The list itself is left untouched.

    Returns
    -------
    The results object produced by running the question over one scenario
    per pair from ``combinations(tasks, 2)``.
    """
    # Filter into a new list instead of calling tasks.remove(): the original
    # removed '"Sink"' from the caller's list in place, silently changing the
    # notebook-level `tasks` shared by every later cell.
    tasks = [task for task in tasks if task != '"Sink"']

    scenarios = [Scenario({"occupation": occupation, "task_A": task_A, "task_B": task_B})
        for task_A, task_B in combinations(tasks, 2)]

    q = QuestionMultipleChoice(
        question_name = "ordering",
        question_text = dedent("""\
            Consider {{ occupation }}. 
            And consider these two tasks: 
            A) {{ task_A }} 
            B) {{ task_B }}
            What is the relationship between these tasks?
            """),
        question_options = [
            "A would be done first", 
            "B would be done first",
            "Not part of the same task sequence"]
    )
    results = q.by(m4).by(scenarios).run(progress_bar = True)
    return results

# Run the one-step comparison over all task pairs, then flatten the selected
# fields into a DataFrame. The next cell reads them back as the columns
# 'scenario.task_A', 'scenario.task_B', 'answer.ordering' and
# 'comment.ordering_comment'.
results = task_relationships(GPT_input_occupation, tasks)
#results.select("task_A", "task_B", "ordering", "comment.ordering_comment").print()
pairwise_relationships_wo_raw = results.select("task_A", "task_B", "ordering", "comment.ordering_comment").to_pandas()

Output()

In [33]:
# Orient every answered pair as source -> target and drop the pairs judged
# not to belong to the same task sequence
ordering = pairwise_relationships_wo_raw['answer.ordering']
b_first = ordering == 'B would be done first'
a_first = ordering == 'A would be done first'

# When B comes first, B is the source and A the target; otherwise A -> B
pairwise_relationships_wo = pd.DataFrame({
    'source': pairwise_relationships_wo_raw['scenario.task_A'].where(~b_first, pairwise_relationships_wo_raw['scenario.task_B']),
    'target': pairwise_relationships_wo_raw['scenario.task_B'].where(~b_first, pairwise_relationships_wo_raw['scenario.task_A']),
    'comment': pairwise_relationships_wo_raw['comment.ordering_comment'],
})[a_first | b_first]

# Save output
pairwise_relationships_wo.to_csv(f'{occupation_folder}/{occupation}_oneStepGPT_DAG_df.csv', index=False)

## Two Steps Method: Give option of "either" and then filter symmetric edges
### Step 1:

In [34]:
# Compare pair of tasks
def task_relationships(occupation, tasks):
    """Ask the model to order every unordered pair of tasks, allowing "either order".

    This redefines the one-step ``task_relationships`` from the earlier cell;
    the only difference is the extra "Could be done in either order" option.

    Parameters
    ----------
    occupation
        Value substituted for ``{{ occupation }}`` in the prompt.
    tasks
        List of task statements. A '"Sink"' entry, if present, is ignored.
        The list itself is left untouched.

    Returns
    -------
    The results object produced by running the question over one scenario
    per pair from ``combinations(tasks, 2)``.
    """
    # Filter into a new list instead of calling tasks.remove(): the original
    # removed '"Sink"' from the caller's list in place, silently changing the
    # notebook-level `tasks` shared by every later cell.
    tasks = [task for task in tasks if task != '"Sink"']

    scenarios = [Scenario({"occupation": occupation, "task_A": task_A, "task_B": task_B})
        for task_A, task_B in combinations(tasks, 2)]

    q = QuestionMultipleChoice(
        question_name = "ordering",
        question_text = dedent("""\
            Consider {{ occupation }}. 
            And consider these two tasks: 
            A) {{ task_A }} 
            B) {{ task_B }}
            What is the relationship between these tasks?
            """),
        question_options = [
            "A would be done first", 
            "B would be done first",
            "Could be done in either order, but still part of the same sequence",
            "Not part of the same task sequence"]
    )
    results = q.by(m4).by(scenarios).run(progress_bar = True)
    return results

# Step 1 run: compare all task pairs with the "either order" option available.
# The raw answers are kept unfiltered because Step 2 needs the "either order"
# rows and the later merge cell needs the rows that already have a direction.
results = task_relationships(GPT_input_occupation, tasks)
#results.select("task_A", "task_B", "ordering", "comment.ordering_comment").print()
pairwise_relationships_w_raw = results.select("task_A", "task_B", "ordering", "comment.ordering_comment").to_pandas()

Output()

### Step 2:

In [35]:
# Pairs the model said could be done in either order; these are the
# symmetric edges that still need a direction
either_order_answer = 'Could be done in either order, but still part of the same sequence'
is_either_order = pairwise_relationships_w_raw['answer.ordering'] == either_order_answer
both_edges = pairwise_relationships_w_raw[is_either_order]
task_A_list, task_B_list = (both_edges[column].tolist() for column in ('scenario.task_A', 'scenario.task_B'))


# Decide which one of symmetric edges to keep
def pick_oneOf_symmetricEdges(occupation, task_A_list, task_B_list):
    """Force a direction for task pairs previously answered "either order".

    ``task_A_list`` and ``task_B_list`` are walked in lockstep, so element i of
    each forms one pair. The question is the same pairwise prompt as before,
    but with only the two directional answer options.

    Returns the results object produced by running the question over all pairs.
    """
    scenarios = []
    for task_A, task_B in zip(task_A_list, task_B_list):
        scenarios.append(
            Scenario({"occupation": occupation, "task_A": task_A, "task_B": task_B})
        )

    forced_choice_options = ["A would be done first", "B would be done first"]
    q = QuestionMultipleChoice(
        question_name = "ordering",
        question_text = dedent("""\
            Consider {{ occupation }}. 
            And consider these two tasks: 
            A) {{ task_A }} 
            B) {{ task_B }}
            What is the relationship between these tasks?
            """),
        question_options = forced_choice_options,
    )
    return q.by(m4).by(scenarios).run(progress_bar = True)

# Step 2 run: re-ask only the "either order" pairs with a forced A/B choice.
# The selected fields match those of the Step 1 output, which lets the next
# cell concatenate the two frames directly.
results = pick_oneOf_symmetricEdges(GPT_input_occupation, task_A_list, task_B_list)
#results.select("task_A", "task_B", "ordering", "comment.ordering_comment").print()
which_symmetric_edge = results.select("task_A", "task_B", "ordering", "comment.ordering_comment").to_pandas()

Output()

In [36]:
# Merge datasets
# Pairs that already received a direction in step 1, followed by the
# "either order" pairs that step 2 forced into a direction
directional_answers = ['A would be done first', 'B would be done first']
decided_in_step1 = pairwise_relationships_w_raw[pairwise_relationships_w_raw['answer.ordering'].isin(directional_answers)]
combined_answers = pd.concat([decided_in_step1, which_symmetric_edge], ignore_index=True)

# Orient every pair as source -> target: when B comes first, B is the source
ordering = combined_answers['answer.ordering']
b_first = ordering == 'B would be done first'
a_first = ordering == 'A would be done first'
pairwise_relationships_w = pd.DataFrame({
    'source': combined_answers['scenario.task_A'].where(~b_first, combined_answers['scenario.task_B']),
    'target': combined_answers['scenario.task_B'].where(~b_first, combined_answers['scenario.task_A']),
    'comment': combined_answers['comment.ordering_comment'],
})[a_first | b_first]

# Save output
pairwise_relationships_w.to_csv(f'{occupation_folder}/{occupation}_twoStepGPT_DAG_df.csv', index=False)

<br>

<br>

# GPT First Last Task Method df

### Use One Step Method: Directly ask for pairwise comparison w/o giving the "either" option
### Next determine first and last task/tasks to be done in the sequence and ask GPT to produce DAG

In [37]:
import ast # for converting outputs to a list

def first_last_tasks(occupation, tasks):
    """Ask the model which task(s) come first and which come last in the job.

    Two checkbox questions are run over the same single scenario; each allows
    between 1 and 3 selections out of ``tasks``.

    Parameters
    ----------
    occupation
        Value substituted for ``{{ occupation }}`` in both prompts.
    tasks
        List of task statements, used both as prompt context and as the answer
        options. A '"Sink"' entry, if present, is ignored. The list itself is
        left untouched.

    Returns
    -------
    (first_task, last_task)
        The two answers, each parsed from its string form with
        ``ast.literal_eval``.
    """
    # Remove "Sink" node if it exists. This builds a new list; the original
    # used tasks.remove(), which edited the caller's list in place.
    tasks = [task for task in tasks if task != '"Sink"']

    # Use the `occupation` argument. The original read the notebook global
    # GPT_input_occupation here, so the parameter was silently ignored.
    scenarios = [Scenario({"occupation": occupation, "tasks": tasks})]

    # First task
    # NOTE(review): "compelete" in the prompt below is a typo; it is left as-is
    # so the text sent to the model does not change.
    q1 = QuestionCheckBox(
        question_name = "firstTask",
        question_text = dedent("""\
            Consider {{ occupation }}.
            The tasks below are part of the job of a {{ occupation }}: {{ tasks }}.
            Among the following, which task or set of tasks would be done before all other tasks in order to compelete the job?
            """),
        question_options = tasks,
        min_selections = 1,
        max_selections = 3
    )
    results1 = q1.by(m4).by(scenarios).run().to_pandas()
    first_task = results1['answer.firstTask'][0]
    first_task = ast.literal_eval(first_task) # convert from string resembling list format to actual list


    # Last task
    q2 = QuestionCheckBox(
        question_name = "lastTask",
        question_text = dedent("""\
            Consider {{ occupation }}. 
            The tasks below are part of the job of {{ occupation }}: {{ tasks }}.
            Among the following, which task or set of tasks would be done after all other tasks are completed?
            """),
        question_options = tasks,
        min_selections = 1,
        max_selections = 3
    )
    results2 = q2.by(m4).by(scenarios).run().to_pandas()
    last_task = results2['answer.lastTask'][0]
    last_task = ast.literal_eval(last_task) # convert from string resembling list format to actual list

    return first_task, last_task
    

In [38]:
# Ask for the first and last task(s). Both results are lists of task strings
# and are reused as prompt context in the pairwise comparison below.
first_task, last_task = first_last_tasks(GPT_input_occupation, tasks)
print("First task(s):", first_task)
print("Last task(s):", last_task, "\n")

First task(s): ['Examine documents to determine degree of risk from factors such as applicant health, financial standing and value, and condition of property.', 'Write to field representatives, medical personnel, or others to obtain further information, quote rates, or explain company underwriting policies.', 'Review company records to determine amount of insurance in force on single risk or group of closely related risks.']
Last task(s): ['Decline excessive risks.', 'Authorize reinsurance of policy when risk is high.'] 



In [39]:
# Compare pair of tasks
def task_relationships_firstLast_included(occupation, tasks, first_task, last_task):
    """Order every pair of tasks, telling the model the first and last task(s).

    Parameters
    ----------
    occupation
        Value substituted for ``{{ occupation }}`` in the prompt.
    tasks
        List of task statements. A '"Sink"' entry, if present, is ignored.
        The list itself is left untouched.
    first_task, last_task
        Lists of task strings; each is joined with " And " into a single
        string before being placed in the prompt.

    Returns
    -------
    The results object produced by running the question over one scenario
    per pair from ``combinations(tasks, 2)``.
    """
    # Filter into a new list instead of calling tasks.remove(): the original
    # removed '"Sink"' from the caller's list in place, silently changing the
    # notebook-level `tasks` shared by every later cell.
    tasks = [task for task in tasks if task != '"Sink"']

    # Modify the first task and last task to appear as a single string
    first_task = " And ".join(first_task)
    last_task = " And ".join(last_task)

    scenarios = [Scenario({"occupation": occupation, 
                           "task_A": task_A, "task_B": task_B,
                           "first_task": first_task, "last_task": last_task}) 
                           for task_A, task_B in combinations(tasks, 2)]

    q = QuestionMultipleChoice(
        question_name = "ordering",
        question_text = dedent("""\
            Consider {{ occupation }}.
            The first task (or set of tasks) to be completed for the job is: {{ first_task }}
            The last task (or set of tasks) to be completed for the job is: {{ last_task }}. 
            Now consider these two tasks: 
            A) {{ task_A }} 
            B) {{ task_B }}
            What is the relationship between these tasks?
            """),
        question_options = [
            "A would be done first", 
            "B would be done first",
            "Not part of the same task sequence"]
    )
    results = q.by(m4).by(scenarios).run(progress_bar = True)
    return results

# Run the pairwise comparison with the first/last task(s) supplied as context
results = task_relationships_firstLast_included(GPT_input_occupation, tasks, first_task, last_task)
#results.select("task_A", "task_B", "ordering", "comment.ordering_comment").print()
GPT_firstLast_df_raw = results.select("task_A", "task_B", "ordering", "comment.ordering_comment").to_pandas()

# Orient every answered pair as source -> target and drop the pairs judged
# not to belong to the same task sequence
ordering = GPT_firstLast_df_raw['answer.ordering']
b_first = ordering == 'B would be done first'
a_first = ordering == 'A would be done first'

# When B comes first, B is the source and A the target; otherwise A -> B
GPT_firstLast_df = pd.DataFrame({
    'source': GPT_firstLast_df_raw['scenario.task_A'].where(~b_first, GPT_firstLast_df_raw['scenario.task_B']),
    'target': GPT_firstLast_df_raw['scenario.task_B'].where(~b_first, GPT_firstLast_df_raw['scenario.task_A']),
    'comment': GPT_firstLast_df_raw['comment.ordering_comment'],
})[a_first | b_first]

# Save output
GPT_firstLast_df.to_csv(f'{occupation_folder}/{occupation}_firstLastTaskGPT_DAG_df.csv', index=False)

Output()

<br>

<br>

# GPT Triangles/Conditioning Method df

#### Approach: Use First Last method in creating original GPT DAG. Next use the "Triangles" or "Conditioning" method for narrowing down set of edges.

<br> 

### Step 1:

#### Find all "triangles", defined as cases with:
##### A --> B --> C
##### A --> C

In [40]:
# Read the edge list written by the first/last-task GPT method
GPT_AM_df = pd.read_csv(f'{occupation_folder}/{occupation}_firstLastTaskGPT_DAG_df.csv')

# Expand the edge list into a task-by-task 0/1 adjacency matrix
# (row = source task, column = target task)
GPT_AM = pd.DataFrame(0, index=tasks, columns=tasks)
for edge_source, edge_target in zip(GPT_AM_df['source'], GPT_AM_df['target']):
    GPT_AM.at[edge_source, edge_target] = 1

In [41]:
def find_triangles(matrix):
    """Return every "triangle" in a directed 0/1 adjacency matrix.

    A triangle is an ordered triple of distinct node indices ``[x, y, z]``
    such that the edges x -> y, y -> z and the shortcut x -> z are all present
    (``matrix[i, j] == 1`` means an edge i -> j).

    Parameters
    ----------
    matrix
        A NumPy array, or any object with a ``to_numpy()`` method (such as a
        pandas DataFrame). Only positional indices are used; labels are ignored.

    Returns
    -------
    list of [x, y, z] lists of positional indices, in the order produced by
    ``itertools.permutations(range(n), 3)``. Empty if there are no triangles.
    """
    # Ensure matrix is a numpy array
    if not isinstance(matrix, np.ndarray):
        matrix = matrix.to_numpy()

    # get length of matrix
    n = matrix.shape[0]

    # Compute each node's outgoing-edge targets once, up front. The original
    # re-ran np.where on the same rows for every one of the n*(n-1)*(n-2)
    # permutations, and tested membership against lists.
    successors = [set(np.flatnonzero(matrix[node] == 1).tolist()) for node in range(n)]

    # Find triangles
    triangles = []
    for x, y, z in itertools.permutations(range(n), 3):
        out_x = successors[x]
        # x must point to both y and z, and y must point to z
        if y in out_x and z in out_x and z in successors[y]:
            triangles.append([x, y, z])

    return triangles

# Find triangles
# Each entry is [A, B, C] as positions in GPT_AM's index, which was built from
# `tasks`; triangle_check later maps them back to task text with tasks[i].
GPT_AM_triangles_list = find_triangles(GPT_AM)
print(f'Examples of triangles: {GPT_AM_triangles_list[:5]}')
print(f'Count of triangles: {len(GPT_AM_triangles_list)}')

Examples of triangles: [[1, 2, 0], [1, 2, 3], [1, 2, 5], [1, 3, 0], [1, 3, 5]]
Count of triangles: 30


### Step 2: 
#### Ask GPT whether conditional on having B --> C we need A --> C

In [42]:
def triangle_check(occupation, tasks, triangles_list):
    """Ask, for every triangle A -> B -> C with shortcut A -> C, what C requires.

    Parameters
    ----------
    occupation
        Value substituted for ``{{ occupation }}`` in the prompt.
    tasks
        Sequence of task statements, indexed by the integers in ``triangles_list``.
    triangles_list
        List of ``[A, B, C]`` index triples; it is converted to a 2-D array and
        read column by column.

    Returns
    -------
    The results object produced by running the question over one scenario
    per triangle.
    """
    # Columns 0, 1 and 2 of the array hold the A, B and C indices respectively
    index_triples = np.array(triangles_list)
    scenarios = []
    for idx_A, idx_B, idx_C in zip(index_triples[:, 0], index_triples[:, 1], index_triples[:, 2]):
        scenarios.append(Scenario({
            "occupation": occupation,
            "task_A": tasks[idx_A],
            "task_B": tasks[idx_B],
            "task_C": tasks[idx_C],
        }))

    prerequisite_options = [
        "C can be done after A without having to do B",
        "C can only be done after B",
        "These are not part of the same task sequence",
    ]
    q = QuestionMultipleChoice(
        question_name = "ordering",
        question_text = dedent("""\
            Consider {{ occupation }}. 
            And consider these three tasks: 
            A) {{ task_A }} 
            B) {{ task_B }}
            C) {{ task_C }} 
            What are the prerequisites of doing task C?
            """),
        question_options = prerequisite_options,
    )
    return q.by(m4).by(scenarios).run(progress_bar = True)

# Run the triangle check and tabulate the answers. Sorting by (A, C, B) puts
# all triangles that share the same A --> C shortcut edge next to each other.
results = triangle_check(GPT_input_occupation, tasks, GPT_AM_triangles_list)
#results.select("task_A", "task_B", "task_C", "ordering", "comment.ordering_comment").print()
GPT_trianglesCheck_output = results.select("task_A", "task_B", "task_C", "ordering", "comment.ordering_comment").to_pandas()
GPT_trianglesCheck_output = GPT_trianglesCheck_output.sort_values(by=['scenario.task_A', 'scenario.task_C', 'scenario.task_B']).reset_index(drop=True)

Output()

### In cases where A --> C is shared among multiple triangles, only delete when all triangles say delete (lowers computation load for later cleanings)

In [43]:
# Rebuild the triangle-check table from `results` so this cell can be re-run
# on its own. Unlike the previous cell, the index is not reset after sorting.
GPT_trianglesCheck_output = results.select("task_A", "task_B", "task_C", "ordering", "comment.ordering_comment").to_pandas()
GPT_trianglesCheck_output = GPT_trianglesCheck_output.sort_values(by=['scenario.task_A', 'scenario.task_C', 'scenario.task_B'])

# Step 1: Find the count of triangles for each A --> C pair
# (informational only: the column is dropped again at the end of this cell)
GPT_trianglesCheck_output['AC_pair_triangles_count'] = GPT_trianglesCheck_output.groupby(['scenario.task_A', 'scenario.task_C'])['scenario.task_A'].transform('count')


# Step 2: Find if all triangles say delete
# An A --> C pair is flagged only when every triangle containing it was
# answered 'C can only be done after B'; any other answer keeps the edge.
aux_df = GPT_trianglesCheck_output.groupby(['scenario.task_A', 'scenario.task_C'])['answer.ordering'].apply(lambda x: (x == 'C can only be done after B').all()).reset_index()

# Rename the resulting columns for better readability
aux_df.columns = ['scenario.task_A', 'scenario.task_C', 'delete_AC_pair']
# Keep only the flagged pairs; these are the A --> C edges removed from the
# DAG in a later cell.
aux_df = aux_df[aux_df['delete_AC_pair']==True]
edges_to_remove = aux_df[['scenario.task_A', 'scenario.task_C']].copy()


# Step 3: Delete the rows where all triangles say delete
# Anti-join: after a left merge with indicator=True, '_merge' == 'left_only'
# marks triangle rows whose A --> C pair is not among the flagged pairs.
modified_GPT_trianglesCheck_output = pd.merge(GPT_trianglesCheck_output, aux_df, how='left', 
                              on=['scenario.task_A', 'scenario.task_C'], 
                              indicator=True)
modified_GPT_trianglesCheck_output = modified_GPT_trianglesCheck_output[modified_GPT_trianglesCheck_output['_merge'] == 'left_only'].drop(columns=['_merge', 'AC_pair_triangles_count', 'delete_AC_pair'])

### Construct a variable counting how many times each node appears in each triangle position (A, B, C).
### Purpose: find quadrangles

In [44]:
# Initialize an empty DataFrame with unique values as columns and original columns as rows
# (one row per triangle position A/B/C, one column per task, all counts start at 0)
aux_df = pd.DataFrame(0, index=['scenario.task_A', 'scenario.task_B', 'scenario.task_C'], columns=tasks)

# Fill the new DataFrame with counts
# For each position, count how often every task occupies it among the
# triangles that survived the previous cell.
for col in modified_GPT_trianglesCheck_output[['scenario.task_A', 'scenario.task_B', 'scenario.task_C']].columns:
    value_counts = modified_GPT_trianglesCheck_output[col].value_counts()
    aux_df.loc[col, value_counts.index] = value_counts.values
# Transpose so that tasks become rows and positions become columns
aux_df = aux_df.T

# Keep only tasks that appear at least once in EVERY position (A, B and C):
# .all(axis=1) requires all three counts to be positive, not just A and B.
aux_df = aux_df[(aux_df > 0).all(axis=1)]

# get list of pivotal tasks
# NOTE(review): pivotal_tasks is not read by any later cell in this section.
pivotal_tasks = aux_df.index.tolist()
# Bare expression: displays the table as the cell output
aux_df

Unnamed: 0,scenario.task_A,scenario.task_B,scenario.task_C
Evaluate possibility of losses due to catastrophe or excessive insurance.,2,6,3


In [45]:
# Question for a "quadrangle": tasks D, A, B, C where B lies between A and C
# and A lies between D and C. It asks which of the shortcut edges A --> C and
# D --> C are direct prerequisites. The trailing comment on each option states
# how that answer is mapped to keeping or dropping the two edges; the mapping
# itself is applied where the keep_AC / keep_DC columns are computed.
# The scenarios are attached when the question is run in a later cell.
q_AC_DC = QuestionMultipleChoice(
    question_name = "AC_DC",
    question_text = dedent("""\
        Consider {{ occupation }} as an occupation.
        And consider these tasks: {{ tasks }}.           
        As part of the steps leading up to completion of this job '{{ task_B }}' is done after '{{ task_A }}' but before '{{ task_C }}'.
        Furthermore, '{{ task_A }}' is done after '{{ task_D }}' but before '{{ task_C }}'.
        Given this structure, determine if A and D below are direct prerequisites of doing C?
        A) {{ task_A }}
        B) {{ task_B }}
        C) {{ task_C }}
        D) {{ task_D }}
        """),
    question_options = [
    "C can be done after B only after both A and D have been done earlier", # drop AC, drop DC
    "C can be done after A only after having done D first, but without having to do B", # keep AC, drop DC
    "C can be done immediately after D without having to do A or B, but it cannot be done after A without having done D or B first", # drop AC, keep DC
    "C can be done immediately after D without having to do A or B, and it can also be done after A without having done D or B", # keep AC, keep DC
    "These are not part of the same task sequence" # neither edge is kept
    ]        
)
#results_AC_DC = q_AC_DC.by(m4).by(scenarios).run()
#results_AC_DC.print()

##### Note:
##### These question options must be checked. It is unclear whether they work properly, or whether our travel agents example simply does not exhaust all possibilities.

In [46]:
# Search for "quadrangles": two surviving triangles that chain through a
# shared node, (A, B, C) and (D, A, C). Together they describe
# D --> A --> B --> C with the shortcut edges A --> C and D --> C.
#
# The surviving triangles are collected into a set first, so each candidate
# is two O(1) membership tests. The original filtered the whole DataFrame
# twice for every 4-permutation of tasks.
surviving_triangles = set(zip(
    modified_GPT_trianglesCheck_output['scenario.task_A'],
    modified_GPT_trianglesCheck_output['scenario.task_B'],
    modified_GPT_trianglesCheck_output['scenario.task_C'],
))

# Iterating over permutations of `tasks` (rather than over the triangles)
# keeps the quadrangles in the same order as before.
quadrangles_tasks = [
    (A, B, C, D)
    for A, B, C, D in itertools.permutations(tasks, 4)
    if (A, B, C) in surviving_triangles and (D, A, C) in surviving_triangles
]

# One scenario per quadrangle; the full task list is passed along as context
scenarios = [Scenario({"occupation": GPT_input_occupation, "tasks": tasks,
                "task_A": A, "task_B": B, "task_C": C, "task_D": D})
                for A, B, C, D in quadrangles_tasks]
results_AC_DC = q_AC_DC.by(m4).by(scenarios).run()
#results_AC_DC.select(['answer.AC_DC', 'scenario.task_A', 'scenario.task_B', 'scenario.task_C', 'scenario.task_D', 'comment.AC_DC_comment']).print()
quadrangles_df = results_AC_DC.select(['answer.AC_DC', 'scenario.task_A', 'scenario.task_B', 'scenario.task_C', 'scenario.task_D', 'comment.AC_DC_comment']).to_pandas()

# quadrangles_df = quadrangles_df.reset_index(drop=True)
# quadrangles_df


In [47]:
# decide whether to keep or drop AC and DC
# The q_AC_DC answer options that preserve at least one shortcut edge
answer_keep_AC_only = 'C can be done after A only after having done D first, but without having to do B'
answer_keep_DC_only = 'C can be done immediately after D without having to do A or B, but it cannot be done after A without having done D or B first'
answer_keep_both = 'C can be done immediately after D without having to do A or B, and it can also be done after A without having done D or B'

# Any other answer leaves both flags False
quadrangles_df['keep_AC'] = quadrangles_df['answer.AC_DC'].apply(
    lambda answer: answer in (answer_keep_AC_only, answer_keep_both)
)
quadrangles_df['keep_DC'] = quadrangles_df['answer.AC_DC'].apply(
    lambda answer: answer in (answer_keep_DC_only, answer_keep_both)
)

In [48]:
def remove_duplicates(input_list):
    """Return the items of *input_list* without repeats, keeping first-seen order.

    Items must be hashable. Items that compare equal count as duplicates and
    only the first occurrence is kept. The input is not modified.
    """
    # dict keys are unique and remember insertion order, which is exactly the
    # bookkeeping the hand-written seen-set and output list were doing.
    return list(dict.fromkeys(input_list))

# Get list of unique edges involved in quadrangles
# Each quadrangle contributes two shortcut edges, (A, C) and (D, C); the same
# edge can come from several quadrangles, hence the de-duplication.
pairs_AC = list(zip(quadrangles_df["scenario.task_A"], quadrangles_df["scenario.task_C"]))
pairs_DC = list(zip(quadrangles_df["scenario.task_D"], quadrangles_df["scenario.task_C"]))
all_pairs = pairs_AC + pairs_DC
ACDC_edges_list = remove_duplicates(all_pairs)


# Get list of edges to keep
# An edge is kept as soon as at least one quadrangle's answer says to keep it.
aux_df = quadrangles_df[quadrangles_df['keep_AC']==True]
pairs_AC_toKeep = list(zip(aux_df["scenario.task_A"], aux_df["scenario.task_C"]))
aux_df = quadrangles_df[quadrangles_df['keep_DC']==True]
pairs_DC_toKeep = list(zip(aux_df["scenario.task_D"], aux_df["scenario.task_C"]))
all_pairs_toKeep = pairs_AC_toKeep + pairs_DC_toKeep
ACDC_edges_toKeep_list = remove_duplicates(all_pairs_toKeep)


# Get list of edges to drop
# i.e. the quadrangle edges that no quadrangle voted to keep
ACDC_edges_toDrop_list = [item for item in ACDC_edges_list if item not in ACDC_edges_toKeep_list]


# Create a new DataFrame with the edges to drop
# The columns are named task_A / task_C (even for edges that were D --> C) so
# the frame can be concatenated with edges_to_remove from the triangle check.
ACDC_edges_to_remove = pd.DataFrame(ACDC_edges_toDrop_list, columns=["scenario.task_A", "scenario.task_C"])

In [49]:
# Remove redundant edges
# Pool the edges rejected by the triangle check with those rejected by the
# quadrangle check
edges_to_remove = pd.concat([edges_to_remove, ACDC_edges_to_remove], ignore_index=True)

# Anti-join: keep only the GPT edges whose (source, target) pair is not in the pool
rejected_pairs = set(zip(edges_to_remove['scenario.task_A'], edges_to_remove['scenario.task_C']))
is_kept = pd.Series(
    [pair not in rejected_pairs for pair in zip(GPT_AM_df['source'], GPT_AM_df['target'])],
    index=GPT_AM_df.index,
    dtype=bool,
)
modified_GPT_AM_df = GPT_AM_df.loc[is_kept].reset_index(drop=True)

### Issue w/ deciding on edges
Suppose we have the following structure:
A --> B, B --> C
A --> D, D --> C

In the triangle A --> B --> C, GPT says to drop A --> C, but in the triangle A --> D --> C it says to keep A --> C. We keep the edge for now, but the implications are unclear and need more thought. Shared triangle sources and targets (tasks A and C) are common for insurance underwriters.

In [50]:
# # Get edges to be removed in triangle cases
# edges_to_remove = GPT_trianglesCheck_output[GPT_trianglesCheck_output['answer.ordering'] == "C can only be done after B"]
# edges_to_remove = edges_to_remove[['scenario.task_A', 'scenario.task_C']]

# # Remove redundant edges
# modified_GPT_AM_df = GPT_AM_df.copy()
# modified_GPT_AM_df = pd.merge(modified_GPT_AM_df, edges_to_remove, how='left', 
#                               left_on=['source', 'target'], right_on=['scenario.task_A', 'scenario.task_C'], 
#                               indicator=True)
# modified_GPT_AM_df = modified_GPT_AM_df[modified_GPT_AM_df['_merge'] == 'left_only'].drop(columns=['_merge', 'scenario.task_A', 'scenario.task_C'])
# modified_GPT_AM_df = modified_GPT_AM_df.reset_index(drop=True)
# modified_GPT_AM_df

In [51]:
# # Get edges which survived the triangle check
# survived_edges = GPT_trianglesCheck_output[GPT_trianglesCheck_output['answer.ordering'] == "C can be done after A without having to do B"]
# survived_edges['pair_count'] = survived_edges.groupby(['scenario.task_A', 'scenario.task_C'])['scenario.task_A'].transform('count')
# survived_edges = survived_edges[['scenario.task_A', 'scenario.task_C', 'comment.ordering_comment']]

# # Merge back comments of survived edges
# modified_GPT_AM_df = pd.merge(modified_GPT_AM_df, survived_edges, how='left', 
#                               left_on=['source', 'target'], right_on=['scenario.task_A', 'scenario.task_C'], 
#                               indicator=True)
# modified_GPT_AM_df = modified_GPT_AM_df.rename(columns={'comment.ordering_comment': 'comment_triangles'})
# modified_GPT_AM_df['comment_triangles'] = modified_GPT_AM_df['comment_triangles'].fillna('')
# modified_GPT_AM_df

# # Add triangle comments to the original comments
# modified_GPT_AM_df['comment'] =  modified_GPT_AM_df['comment'] + ' \n GPT comment in Triangle test iteration:\n ' + modified_GPT_AM_df['comment_triangles']
# modified_GPT_AM_df = modified_GPT_AM_df.drop(columns=['_merge', 'scenario.task_A', 'scenario.task_C', 'comment_triangles'])
# modified_GPT_AM_df

In [52]:
# Save output
# Final edge list: the first/last-task GPT edges minus every edge rejected by
# the triangle and quadrangle checks.
modified_GPT_AM_df.to_csv(f'{occupation_folder}/{occupation}_trianglesGPT_DAG_df.csv', index=False)