In [1]:
#Python
import os
import getpass
import glob
import numpy as np
import pandas as pd
import random
import networkx as nx
import requests
import re

## formatting number to appear comma separated and with two digits after decimal: e.g, 1000 shown as 1,000.00
pd.set_option('float_format', "{:,.2f}".format)

import matplotlib.pyplot as plt
#%matplotlib inline
#from matplotlib.legend import Legend

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 200)

In [2]:
import itertools 
import networkx as nx
from pyvis.network import Network

In [3]:
import copy
import random

from edsl.questions import QuestionCheckBox, QuestionFreeText
from edsl import Scenario, Model
from edsl.questions import QuestionMultipleChoice
from itertools import combinations
from edsl.questions.derived.QuestionLinearScale import QuestionLinearScale
from textwrap import dedent

# The return value of this call is neither stored nor displayed
# (it is not the last expression of the cell).
Model.available()
# Model handles; `m35` is not referenced by any other cell visible in this notebook.
m35 = Model('gpt-3.5-turbo')
# `m4` is the model used by draw_dag_line() and triangle_check().
m4 = Model('gpt-4o')

In [4]:
# Determine the user and derive the project paths used by every later cell.
user = getpass.getuser()
if user == 'peymansh':
    main_folder_path = '/Users/peymansh/Dropbox (MIT)/Research/AI and Occupations/ai-exposure'
else:
    # Any other user: take the project root from the AI_EXPOSURE_PATH environment
    # variable, defaulting to the current working directory. Without this branch
    # the path variables stay undefined and later cells fail with NameError.
    main_folder_path = os.environ.get('AI_EXPOSURE_PATH', os.getcwd())

# Derived locations (same layout for every user)
data_path = f'{main_folder_path}/output'
output_path = f'{main_folder_path}/output/daily_tasks_occupations_analysis'

# Functions

In [5]:
def create_adjacency_matrix(adjacency_matrix_df):
    """Return the frame's values as a NumPy array with NaN entries replaced by 0.

    Side effect: prints a dict mapping each column position to the first
    whitespace-separated word of that column's name (used as a short node label).

    Parameters
    ----------
    adjacency_matrix_df : pandas.DataFrame
        Adjacency matrix whose column names are task descriptions.

    Returns
    -------
    numpy.ndarray
        The DataFrame's values with NaN replaced by 0.0.
    """
    # Short labels: position -> first word of the column name
    first_words = [column.split()[0] for column in adjacency_matrix_df.columns.tolist()]
    node_labels = dict(enumerate(first_words))
    print(node_labels)

    # Missing cells are treated as "no edge"
    return np.nan_to_num(adjacency_matrix_df.values, nan=0.0)


# Hacky way of fixing "Sink" node
def add_sink_node(dag_matrix, occupation):
    """Return ``dag_matrix`` with a '"Sink"' row and column appended when needed.

    The sink's incoming edges are hard-coded per occupation. If the matrix
    already has a '"Sink"' column, or the occupation has no hard-coded sink
    definition, the input is returned unchanged.

    Unlike a naive in-place edit, the input DataFrame is never modified: the
    sink is added to a copy, so re-running a cell on the same frame is safe.

    Parameters
    ----------
    dag_matrix : pandas.DataFrame
        Square adjacency matrix with task labels as both index and columns.
    occupation : str
        Occupation key, e.g. 'travelAgents' or 'insuranceUnderwriters'.

    Returns
    -------
    pandas.DataFrame
        The matrix including the '"Sink"' node (or the input itself if nothing
        had to be added).

    Raises
    ------
    ValueError
        If the hard-coded sink column does not fit the size of ``dag_matrix``.
    """
    sink_label = '"Sink"'

    # Incoming edges of the sink: one entry per existing task, plus a trailing
    # entry for the sink's own row.
    sink_columns = {
        'travelAgents': [0, 0, 0, 0, 0, 0, 1, 1, 0],
        'insuranceUnderwriters': [1, 0, 0, 1, 0, 1, 0, 0],
    }

    if sink_label in dag_matrix.columns.tolist() or occupation not in sink_columns:
        return dag_matrix

    n = dag_matrix.shape[0]
    sink_column = sink_columns[occupation]

    # Check the size up front so a mismatch cannot leave a half-extended frame behind.
    if len(sink_column) != n + 1:
        raise ValueError(
            f'Sink definition for {occupation!r} expects {len(sink_column) - 1} tasks, '
            f'but the matrix has {n}.'
        )

    extended = dag_matrix.copy()
    extended.loc[sink_label] = [0] * n      # the sink has no outgoing edges
    extended[sink_label] = sink_column      # tasks that feed into the sink
    return extended

In [6]:
occupation = 'travelAgents'
GPT_input_occupation = 'Travel agent'
plot_title_occupation = 'Travel Agents'

# Load the manual adjacency matrix for this occupation (sink node added if missing)
file_path = f'{data_path}/daily_tasks_occupations_analysis/{occupation}'
adjacency_matrix = add_sink_node(
    pd.read_csv(f'{file_path}/{occupation}_AM.csv', index_col=0),
    occupation,
)

# Dictionary for later lookups: row label (task name) -> row position
tasks_dict = {row_name: i for i, row_name in enumerate(adjacency_matrix.index)}


In [7]:
occupation = 'travelAgents'
GPT_input_occupation = 'Travel agent'
plot_title_occupation = 'Travel Agents'

# Load the manual adjacency matrix (sink node added if missing)
file_path = f'{data_path}/daily_tasks_occupations_analysis/{occupation}'
manual_dag_df = add_sink_node(
    pd.read_csv(f'{file_path}/{occupation}_AM.csv', index_col=0),
    occupation,
)

# Column labels double as the task list used by later cells
tasks = manual_dag_df.columns.tolist()

# Plain NumPy array, with missing cells treated as "no edge" (0)
adjacency_matrix = np.nan_to_num(manual_dag_df.values, nan=0.0)

In [8]:
# Vertex labels (copied so that later changes to `tasks` do not affect them)
vertices = tasks.copy()

# Edge list: one [source, target] pair for every matrix cell equal to 1
edges = [
    [vertices[i], vertices[j]]
    for i in range(len(vertices))
    for j in range(len(vertices))
    if adjacency_matrix[i, j] == 1
]

# Edge table; every edge starts out blue / 'in Manual and GPT'
df = pd.DataFrame(edges, columns=['source', 'target'])
df['color'] = 'blue'
df['edge_label'] = 'in Manual and GPT'

# GPT

def draw_dag_line(occupation, focal_task, all_tasks):
    """Ask the model which of the other tasks `focal_task` is an input to.

    Parameters
    ----------
    occupation : str
        Occupation description substituted into the prompt.
    focal_task : str
        The task whose outgoing edges are being elicited; must be in `all_tasks`.
    all_tasks : list of str
        Every task of the occupation. The list itself is left untouched.

    Returns
    -------
    The result object produced by running the checkbox question with model `m4`.
    """
    # Answer options are all tasks except the focal one (work on a copy)
    candidate_tasks = copy.deepcopy(all_tasks)
    candidate_tasks.remove(focal_task)

    question = QuestionCheckBox(
        question_name = "dag",
        question_text = dedent("""\
            Consider this {{ occupation }}. 
            And consider this task: {{ task }}. 
            Of the following tasks, which task is this task an input to?
            Check all that apply. Explain your reasoning in comments.
            """),
        question_options = candidate_tasks
    )
    prompt_values = Scenario({'occupation':occupation, 'task': focal_task})
    return question.by(m4).by(prompt_values).run()



def dict_to_matrix(dictionary):
    """Convert a {source: [targets, ...]} mapping into an adjacency matrix.

    Labels are ordered with the dictionary's keys first (insertion order),
    followed by any target that is not itself a key, in order of first
    appearance.

    Returns
    -------
    (numpy.ndarray, list)
        A square int matrix with ``matrix[i, j] == 1`` when label ``i`` lists
        label ``j`` as a target, and the list of labels giving the row/column order.
    """
    # Keys come first, in insertion order
    all_labels = list(dictionary.keys())
    label_index = {label: position for position, label in enumerate(all_labels)}

    # Then every target that has not been seen yet, in order of first appearance
    for targets in dictionary.values():
        for target in targets:
            if target not in label_index:
                label_index[target] = len(all_labels)
                all_labels.append(target)

    # Fill in one cell per (source, target) pair
    matrix = np.zeros((len(all_labels), len(all_labels)), dtype=int)
    for source, targets in dictionary.items():
        row = label_index[source]
        for target in targets:
            matrix[row, label_index[target]] = 1

    return matrix, all_labels



def create_dag(occupation, tasks):
    """Query the model once per task and assemble the answers into a DAG.

    For each task, draw_dag_line() is asked which other tasks it feeds into;
    the answer and the model's comment are stored and also printed.

    Returns
    -------
    tuple
        ``(dag, dag_matrix, dag_matrix_labels, GPT_comments)``: the
        task -> answer dictionary, its matrix form and label order as produced
        by dict_to_matrix(), and the per-task comment selections.
    """
    dag, GPT_comments = {}, {}

    for focal_task in tasks:
        result = draw_dag_line(occupation, focal_task, tasks)

        # Keep the answer and the explanation for this task
        dag[focal_task] = result.select("dag").first()
        GPT_comments[focal_task] = result.select("comment.dag_comment")

        # Echo both to the cell output
        result.select("dag").print()
        result.select("comment.dag_comment").print()

    # Matrix form of the DAG dictionary
    dag_matrix, dag_matrix_labels = dict_to_matrix(dag)
    return dag, dag_matrix, dag_matrix_labels, GPT_comments



def export_DAG_matrix_to_csv(matrix, labels, filename):
    """Write `matrix` to `filename` as CSV, using `labels` for both rows and columns."""
    pd.DataFrame(matrix, index=labels, columns=labels).to_csv(filename)

# Prompt GPT once per task and build the DAG. The sink is not a real task,
# so it is dropped from the task list before prompting.
if '"Sink"' in tasks:
    tasks.remove('"Sink"')

# create_dag() returns four values: the raw answer dictionary, its matrix form,
# the matrix label order, and the per-task comments. Unpacking into only three
# names raises "ValueError: too many values to unpack".
GPT_dag, GPT_dag_matrix_raw, GPT_dag_matrix_labels, GPT_comments = create_dag(GPT_input_occupation, tasks)

# NOTE(review): a later cell reads '{occupation}_GPT_DAG.csv', but nothing visible
# here writes it (export_DAG_matrix_to_csv is never called) — confirm where that
# file comes from.

# Display the comments
GPT_comments

In [9]:
# Read the GPT-generated DAG matrix (sink node added if missing)
file_path = f'{data_path}/daily_tasks_occupations_analysis/{occupation}'
GPT_dag_df = add_sink_node(
    pd.read_csv(f'{file_path}/{occupation}_GPT_DAG.csv', index_col=0),
    occupation,
)

# NumPy array for graph work, with missing cells treated as "no edge" (0)
GPT_dag_matrix = np.nan_to_num(GPT_dag_df.values, nan=0.0)

## The Conditioning Approach (now for the easiest possible case: Triangles)

<br>

### Find all "triangles", defined as cases with:
#### A --> B --> C
#### A --> C

In [10]:
def find_triangles(matrix):
    """Find every ordered node triple [x, y, z] with edges x->y, x->z and y->z.

    An edge exists where the matrix cell equals 1. Triples are returned in the
    order produced by ``itertools.permutations`` over the row indices.

    Parameters
    ----------
    matrix : numpy.ndarray
        Adjacency matrix; ``matrix[i]`` holds the outgoing edges of node ``i``.

    Returns
    -------
    list of [int, int, int]
        One entry per triangle found.
    """
    node_count = matrix.shape[0]

    # Destination nodes of the outgoing edges of each node, computed once per row
    successors = [
        set(np.where(matrix[node] == 1)[0].tolist())
        for node in range(node_count)
    ]

    # x must point at both y and z, and y must point at z
    return [
        [x, y, z]
        for x, y, z in itertools.permutations(range(node_count), 3)
        if y in successors[x] and z in successors[x] and z in successors[y]
    ]

# Find triangles in the GPT DAG. Each entry is a list [A, B, C] of matrix
# positions with edges A->B, A->C and B->C.
GPT_dag_triangles = find_triangles(GPT_dag_matrix) # open question: remove the "Sink" node first?
# Preview the first five triangles and report the total count
print(f'Examples of triangles: {GPT_dag_triangles[:5]}')
print(f'Count of triangle cases: {len(GPT_dag_triangles)}')

Examples of triangles: [[1, 2, 3], [1, 4, 3], [2, 3, 0], [3, 0, 6], [4, 0, 6]]
Count of triangle cases: 17


### Ask GPT whether, conditional on having B --> C, the direct edge A --> C is still needed

In [11]:
def triangle_check(occupation, tasks, triangles_list):
    """Ask the model, for each triangle, what task C really depends on.

    Parameters
    ----------
    occupation : str
        Occupation description substituted into the prompt.
    tasks : list of str
        Task texts; triangle entries are positions into this list.
    triangles_list : list of [int, int, int]
        Triangles as [A, B, C] index triples.

    Returns
    -------
    The results object from running the multiple-choice question with model
    `m4` over one scenario per triangle.
    """
    # One scenario per triangle, with indices translated into task text
    triangle_indices = np.array(triangles_list)
    scenarios = []
    for idx_A, idx_B, idx_C in zip(triangle_indices[:, 0], triangle_indices[:, 1], triangle_indices[:, 2]):
        scenarios.append(
            Scenario({"occupation": occupation, "task_A": tasks[idx_A], "task_B": tasks[idx_B], "task_C": tasks[idx_C]})
        )

    question = QuestionMultipleChoice(
        question_name = "ordering",
        question_text = dedent("""\
            Consider this {{ occupation }}. 
            And consider these three tasks: 
            A) {{ task_A }} 
            B) {{ task_B }}
            C) {{ task_C }} 
            What are the prerequisites of doing task C?
            """),
        question_options = [
            "C can be done after A without having to do B",
            "C can only be done after B",
            "These are not part of the same task sequence"]
    )
    return question.by(m4).by(scenarios).run(progress_bar = True)

# Ask the model about every triangle; `tasks` supplies the task text for each index.
# NOTE(review): '"Sink"' was removed from `tasks` earlier, while GPT_dag_matrix may
# contain a sink row/column — confirm no triangle indexes past the end of `tasks`.
results = triangle_check(GPT_input_occupation, tasks, GPT_dag_triangles)
# Show the three tasks of each triangle alongside the model's answer
results.select("task_A", "task_B", "task_C", "ordering").print()

Output()

scenario.task_A,scenario.task_B,scenario.task_C,answer.ordering
"Converse with customer to determine destination, mode of transportation, travel dates, financial considerations, and accommodations required.","Compute cost of travel and accommodations, using calculator, computer, carrier tariff books, and hotel rate books, or quote package tour's costs.","Book transportation and hotel reservations, using computer or telephone.",C can only be done after B
"Converse with customer to determine destination, mode of transportation, travel dates, financial considerations, and accommodations required.","Plan, describe, arrange, and sell itinerary tour packages and promotional travel incentives offered by various travel carriers.","Book transportation and hotel reservations, using computer or telephone.",C can be done after A without having to do B
"Compute cost of travel and accommodations, using calculator, computer, carrier tariff books, and hotel rate books, or quote package tour's costs.","Book transportation and hotel reservations, using computer or telephone.",Collect payment for transportation and accommodations from customer.,C can only be done after B
"Book transportation and hotel reservations, using computer or telephone.",Collect payment for transportation and accommodations from customer.,"Print or request transportation carrier tickets, using computer printer system or system link to travel carrier.",C can be done after A without having to do B
"Plan, describe, arrange, and sell itinerary tour packages and promotional travel incentives offered by various travel carriers.",Collect payment for transportation and accommodations from customer.,"Print or request transportation carrier tickets, using computer printer system or system link to travel carrier.",C can only be done after B
"Plan, describe, arrange, and sell itinerary tour packages and promotional travel incentives offered by various travel carriers.","Book transportation and hotel reservations, using computer or telephone.",Collect payment for transportation and accommodations from customer.,C can only be done after B
"Plan, describe, arrange, and sell itinerary tour packages and promotional travel incentives offered by various travel carriers.","Book transportation and hotel reservations, using computer or telephone.","Print or request transportation carrier tickets, using computer printer system or system link to travel carrier.",C can only be done after B
"Provide customer with brochures and publications containing travel information, such as local customs, points of interest, or foreign country regulations.","Converse with customer to determine destination, mode of transportation, travel dates, financial considerations, and accommodations required.","Plan, describe, arrange, and sell itinerary tour packages and promotional travel incentives offered by various travel carriers.",C can only be done after B
"Record and maintain information on clients, vendors, and travel packages.",Collect payment for transportation and accommodations from customer.,"Print or request transportation carrier tickets, using computer printer system or system link to travel carrier.",C can only be done after B
"Record and maintain information on clients, vendors, and travel packages.","Compute cost of travel and accommodations, using calculator, computer, carrier tariff books, and hotel rate books, or quote package tour's costs.",Collect payment for transportation and accommodations from customer.,C can only be done after B


In [12]:
# Results as a pandas DataFrame, ordered by the three task texts
GPT_trianglesCheck_df = (
    results.select("task_A", "task_B", "task_C", "ordering")
    .to_pandas()
    .sort_values(by=['scenario.task_A', 'scenario.task_B', 'scenario.task_C'])
)

# Triangles where the model says C needs B: their direct A --> C edge is to be removed
only_after_B = GPT_trianglesCheck_df['answer.ordering'] == "C can only be done after B"
edges_to_remove_df = GPT_trianglesCheck_df[only_after_B]

In [13]:
# Drop the redundant A --> C edges from a copy of the GPT matrix
modified_GPT_dag_matrix = GPT_dag_matrix.copy()
redundant_pairs = edges_to_remove_df[['scenario.task_A', 'scenario.task_C']].values
for task_A, task_C in redundant_pairs:
    # tasks_dict translates task text into the matrix position
    modified_GPT_dag_matrix[tasks_dict[task_A], tasks_dict[task_C]] = 0
modified_GPT_dag_matrix

array([[0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [14]:
# If the Manual matrix and the GPT matrix differ in size, drop the last row and
# column of the Manual one (where the sink node sits when it is present)
sizes_differ = adjacency_matrix.shape[0] != GPT_dag_matrix.shape[0]
if sizes_differ:
    adjacency_matrix = adjacency_matrix[:-1, :-1]

In [15]:
# Manual minus GPT: +1 where only Manual has the edge, -1 where only GPT has it
diff_matrix = adjacency_matrix - GPT_dag_matrix

# Edges in the Manual DAG but not in GPT's: keep the positive part, zero the rest
inManual_notInGPT = np.where(diff_matrix < 0, 0, diff_matrix)

# Edges in GPT's DAG but not in the Manual one: 1 where the difference is negative
inGPT_notInManual = (diff_matrix < 0).astype(diff_matrix.dtype)

In [16]:
# Vertex labels
vertices = tasks.copy()

# Edge list: one [source, target] pair for every Manual-only cell equal to 1
edges = [
    [vertices[i], vertices[j]]
    for i in range(len(vertices))
    for j in range(len(vertices))
    if inManual_notInGPT[i, j] == 1
]

# Edge table for Manual-only edges, drawn in red
inManual_notInGPT_df = pd.DataFrame(edges, columns=['source', 'target'])
inManual_notInGPT_df['color'] = 'red'
inManual_notInGPT_df['edge_label'] = 'only in Manual'

In [17]:
# Vertex labels
vertices = tasks.copy()

# Edge list: one [source, target] pair for every GPT-only cell equal to 1
edges = [
    [vertices[i], vertices[j]]
    for i in range(len(vertices))
    for j in range(len(vertices))
    if inGPT_notInManual[i, j] == 1
]

# Edge table for GPT-only edges, drawn in green
inGPT_notInManual_df = pd.DataFrame(edges, columns=['source', 'target'])
inGPT_notInManual_df['color'] = 'green'
inGPT_notInManual_df['edge_label'] = 'only in GPT (include GPT comment here)'

## Plot interactive graph

In [18]:
# Stack the three edge tables; when the same (source, target) pair appears more
# than once, the last occurrence (the Manual-only / GPT-only row) wins
df = (
    pd.concat([df, inManual_notInGPT_df, inGPT_notInManual_df], axis=0)
    .drop_duplicates(subset=['source', 'target'], keep='last')
)

In [19]:
# Create a directed graph
# NOTE(review): `G` is populated below but not referenced again in the visible cells.
G = nx.DiGraph()

# Add edges to the graph (the edge label is stored as both `label` and `title`)
for index, row in df.iterrows():
    G.add_edge(row['source'], row['target'], label=row['edge_label'], title=row['edge_label'])

# Define fixed positions for the nodes: full task text -> (x, y) coordinates.
# These keys are specific to the travel-agent task list, plus the '"Sink"' node.
fixed_positions = {
    'Collect payment for transportation and accommodations from customer.': 
    (0, -100),
    'Converse with customer to determine destination, mode of transportation, travel dates, financial considerations, and accommodations required.': 
    (-400, 200),
     "Compute cost of travel and accommodations, using calculator, computer, carrier tariff books, and hotel rate books, or quote package tour's costs.": 
    (-200, -100),
    'Book transportation and hotel reservations, using computer or telephone.': 
    (100, 0),
    'Plan, describe, arrange, and sell itinerary tour packages and promotional travel incentives offered by various travel carriers.':
    (-300, 0),
    'Provide customer with brochures and publications containing travel information, such as local customs, points of interest, or foreign country regulations.':
    (-400, -200),
    'Print or request transportation carrier tickets, using computer printer system or system link to travel carrier.':
    (200, -200),
    'Record and maintain information on clients, vendors, and travel packages.':
    (200, 200),
    '"Sink"':
    (300, 0)
}

# Create a Pyvis network
net = Network(notebook=True, directed=True, cdn_resources="remote",
              height = "800px",
                width = "125%",
                select_menu = True,
                filter_menu = True,)

# Add nodes with fixed positions and labels.
# The visible label is the first word of the task; the full text goes into `title`.
# NOTE(review): nodes come only from `fixed_positions`; presumably every source/target
# in `df` must appear there — confirm before reusing this cell for another occupation.
for node, (x, y) in fixed_positions.items():
    net.add_node(node, label=node.split(" ")[0], title=node, x=x, y=y, fixed=True, borderWidthSelected=10)

# Add edges with labels; the color column distinguishes shared, Manual-only and GPT-only edges
for index, row in df.iterrows():
    net.add_edge(row['source'], row['target'], title=row['edge_label'], color=row['color'])

# Save interactive graph as an HTML file under the occupation's folder (`file_path`)
net.save_graph(f'{file_path}/{occupation}_DAG.html')