In [134]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from openai import OpenAI
import numpy as np
import re

In [None]:
#OPENAI_API_KEY =

# Replicate procedure outline in Appendix B
https://assets.anthropic.com/m/2e23255f1e84ca97/original/Economic_Tasks_AI_Paper.pdf



## Step 1: Embed task names.
Embeds tasks names using the all-mpnet-base-v2 [Reimers and Gurevych, 2022] sentence transformer to obtain 768-dimensional vector representations of each task

In [2]:
onet_data = '../../data/external/onet_data/'

In [17]:
# read in ONET task data
df = pd.read_csv(onet_data + 'start_sample_dwa_task_list.csv')

# create a small dataset to just test hte workflow
df = df.loc[0:100]

In [18]:
# get task names
task_names = df['Task']

In [19]:
# load model and create embeddings
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings = model.encode(task_names)
print(embeddings)


[[-5.1114074e-04 -1.0107846e-02  1.5378774e-02 ... -2.3617710e-04
   2.2474973e-02 -2.5179697e-02]
 [-2.6967600e-03  1.6098229e-02 -9.4099334e-03 ... -3.7298994e-03
   2.5657978e-02 -1.7094379e-02]
 [ 2.0684568e-02  7.8259163e-02 -1.4048434e-02 ... -1.4728837e-02
   3.5283066e-02 -3.2388285e-02]
 ...
 [ 6.9529973e-02  1.1266973e-01 -5.1428413e-04 ...  2.7145812e-02
  -1.5411461e-03  6.5058313e-02]
 [-3.0856816e-02  6.5693356e-02 -1.9286772e-02 ... -3.8616525e-04
  -5.3530548e-02 -2.3597579e-02]
 [ 5.1545338e-03  2.6121160e-02  3.1633841e-05 ... -1.0486379e-02
  -1.7431175e-02 -7.3188781e-03]]


## Step 2: Generate neighborhoods.
Group these embeddings into k neighborhoods using k-means clustering, where k is chosen so that the average number of tasks per neighborhood is 40.
We group tasks into neighborhoods because the names and descriptions for all base clusters may not fit within Claude’s context window.

In [20]:
# set parameters

avg_num_tasks = 2
k = int(df.shape[0]/avg_num_tasks)

kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(embeddings)


In [21]:
clusters

array([28, 14, 14,  4,  4, 14,  4,  4,  4, 28, 23, 23, 22,  8, 23,  8,  8,
       23,  3,  3, 24,  3,  3, 37, 33,  9,  9,  0,  9, 18, 18, 38,  2,  2,
       38, 38,  2, 39, 16, 16, 12, 12,  1,  1, 27, 27, 12, 12, 27, 31, 29,
       11, 27, 20, 20, 30, 20,  5,  5,  5, 35,  6, 26, 32, 15, 15, 13, 34,
       21, 10, 36, 10, 25, 15, 21,  2,  7,  2, 11, 19, 17], dtype=int32)

## Intermediate step: Generate name and description.

Not mentioned in the latest paper but in the Clio paper on p. 39

In [109]:
# define number of tasks outside cluster to be considered
# define m (not given in the paper?!)
#m = int(avg_num_tasks*0.2)
m =2

In [111]:
system_prompt = """You are tasked with summarizing a group of related statements into a short, precise,
and accurate description and name. Your goal is to create a concise summary
that captures the essence of these statements and distinguishes them from other
similar groups of statements."""

In [112]:
user_prompt = f"""ArithmeticErrorSummarize all the statements into a clear, precise, two-sentence description in the
past tense. Your summary should be specific to this group and distinguish it
from the contrastive answers of the other groups.
After creating the summary, generate a short name for the group of statements. This
name should be at most ten words long (perhaps less) and be specific but also
reflective of most of the statements (rather than reflecting only one or two).
The name should distinguish this group from the contrastive examples. For
instance, "Write fantasy sexual roleplay with octopi and monsters", "Generate
blog spam for gambling websites", or "Assist with high school math homework"
would be better and more actionable than general terms like "Write erotic
content" or "Help with homework". Be as descriptive as possible and assume
neither good nor bad faith. Do not hesitate to identify and describe socially
harmful or sensitive topics specifically; specificity is necessary for
monitoring.
Present your output in the following format:
<summary> [Insert your two-sentence summary here] </summary>
<name> [Insert your generated short name here] </name>

Below are the related statements:

{tasks_cluster}

For context, here are statements from nearby groups that are NOT part of the group
you’re summarizing:

{tasks_closest}
Do not elaborate beyond what you say in the tags. Remember to analyze both the
statements and the contrastive statements carefully to ensure your summary and
name accurately represent the specific group while distinguishing it from
others."""

In [114]:
assistant_prompt = """Sure, I will provide a clear, precise, and accurate summary and name for
this cluster. I will be descriptive and assume neither good nor bad faith. Here
is the summary, which I will follow with the name:"""

In [None]:

client = OpenAI(
    #api_key=os.environ.get("OPENAI_API_KEY"),  # This is the default and can be omitted
    api_key = OPENAI_API_KEY
)


In [148]:
# loop throuch clusters
cluster_descriptions = pd.DataFrame()
for cluster in clusters:
    # get tasks in this cluster
    tasks_cluster = list(df['Task'][clusters==cluster])

    # get cluster centroid
    cluster_centroid = kmeans.cluster_centers_[cluster]

    # get m closest tasks outside the clusters
    embeddings_not_in_cluster = embeddings[clusters!=cluster]
    distances = np.linalg.norm(embeddings_not_in_cluster - cluster_centroid, axis=1)
    closest = distances.argsort()[:m]
    tasks_closest = list(df['Task'][closest])

    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "developer", "content": system_prompt},
            {
                "role": "user",
                "content": user_prompt
            },
            {"role": "assistant", "content":assistant_prompt},

        ],
        model="gpt-3.5-turbo",
        temperature = 1
    )
    answer = chat_completion.choices[0].message.content

    cluster_descriptions = pd.concat([cluster_descriptions, pd.Series([cluster,answer])],axis=1)
cluster_descriptions = cluster_descriptions.T
cluster_descriptions = cluster_descriptions.rename(columns={0:'cluster', 1:'answer'})

In [149]:
 # Regular expression patterns to match summary and name
summary_pattern = r'<summary>\s*(.*?)\s*</summary>'
name_pattern = r'<name>\s*(.*?)\s*</name>'

# Function to extract summary and name
def extract_summary_and_name(text):
    summary_match = re.search(summary_pattern, text)
    name_match = re.search(name_pattern, text)
    
    summary_text = summary_match.group(1) if summary_match else None
    name_text = name_match.group(1) if name_match else None
    
    return pd.Series([summary_text, name_text])


In [152]:
cluster_descriptions[['summary', 'name']] = cluster_descriptions['answer'].apply(extract_summary_and_name)

In [154]:
cluster_descriptions.to_csv('../../data/interim/anthropic_replication/base_summary_name.csv')

## Propose new tasks for each neighborhood.
For each neighborhood, use Claude to propose
candidate higher-level task descriptions by examining both the tasks within the neighborhood and the nearest m tasks outside it. Including the nearest tasks beyond the neighborhood ensures that tasks (or groups of tasks) on the boundary between neighborhoods are neither
overcounted nor undercounted. 

In [155]:
# hierarchy parameters
# number of levels
L = 3
# number of observations no the base level
n_base = df.shape[0]

# number of observations at the top hierachy level
n_top = 12

# desired level ratio asdefined in the paper
level_ratio = (n_top/n_base)**(1/(L-1)) 


Use ChatGPT to propose new task for each neighborhood.

In [156]:
# clusters_df = pd.concat([df['Task'], pd.Series(clusters).rename('cluster_number')],axis = 1)
# clusters_df['cluster'] = 'Cluster '+ clusters_df['cluster_number'].astype(str)
# clusters_df = clusters_df.sort_values('cluster_number')

In [157]:
system_prompt = """You are tasked with creating higher-level cluster names based on a
given list of clusters and their descriptions. Your goal is to come up
with broader categories that could encompass one or more of the provided
clusters."""

In [158]:
desired_names = level_ratio * n_base
cluster_list = list(cluster_descriptions['name'].astype(str) + ': ' +cluster_descriptions['summary'].astype(str))

In [159]:
user_prompt = f"""First, review the list of clusters and their descriptions: 
{cluster_list} 


Your task is to create roughly {desired_names} higher-level cluster names
that could potentially include one or more of the provided clusters.
These higher-level clusters should represent broader categories or
themes that emerge from the given clusters, while remaining as specific
as possible. If there are many clusters with a specific theme, ensure
that the higher-level cluster name remains the maximum level of
specificity. You are helping to organize user behavior data in order to
improve safety, monitoring, and observability. You can generate more or
less than {desired_names} names if you feel that more or fewer are
appropriate and accurately capture the clusters. You should output at
least {int(0.5 * desired_names)} and at most {int(1.5 * desired_names)}
names, with {desired_names} as a target.
Guidelines for creating higher-level cluster names:
1. Analyze the themes, topics, or characteristics common to multiple
clusters.
2. Create names that are specific enough to be meaningful, but not so
specific that they can’t meaningfully represent many different clusters.
Avoid overly general or vague terms, and do not hesitate to describe
socially harmful or sensitive topics (in fact, clusters that clearly
describe harmful behavior are slightly preferred); specificity is
necessary for observability and enforcement.
3. Ensure that the higher-level cluster names are distinct from one another.
4. Use clear, concise, and descriptive language for the cluster names.
Assume neither good nor bad faith for the content in the clusters.
Now, provide your list of roughly {desired_names} higher-level cluster names.
Present your answer in the following format:
<answer>
1. [First higher-level cluster name]
2. [Second higher-level cluster name]
3. [Third higher-level cluster name]
...
{desired_names}. [Last higher-level cluster name]
</answer>
Focus on creating meaningful, distinct, and precise (but not overly specific
) higher-level cluster names that could encompass multiple sub-clusters.

"""

In [160]:
assistant_prompt = """I understand. I’ll evaluate the clusters and provide higher-level
cluster names that could encompass multiple sub-clusters."""

In [None]:

chat_completion = client.chat.completions.create(
     messages=[
        {"role": "developer", "content": system_prompt},
        {
            "role": "user",
            "content": user_prompt
        },
        {"role": "assistant", "content":assistant_prompt},

    ],
    model="gpt-3.5-turbo",
)

In [164]:
cluster_names = chat_completion.choices[0].message.content

In [170]:
cleaned_text = re.sub(r'<answer>|</answer>', '', cluster_names)

# Step 2: Extract the cluster names using a regex pattern
pattern = r'\d+\.\s*(.*)'  # Match the number and extract the cluster name

# Find all matches
cluster_list = re.findall(pattern, cleaned_text)


In [173]:
cluster_list 

['Agricultural Disease Research',
 'Bee Health and Disease Studies',
 'Plant and Pollination Research',
 'Apiary Science Investigations',
 'Integrated Agricultural Research',
 'Bee Disease and Yield Studies',
 'Pollinator Health Assessments',
 'Agricultural Entomology Studies',
 'Bee Ecology and Plant Health Research',
 'Agricultural Research Initiatives',
 'Bee and Plant Health Investigations',
 'Field Biology Assessments',
 'Bee Disease Experimentation',
 'Agronomic Research on Bees and Plants',
 'Entomological Survey and Analysis',
 'Sustainable Agriculture Investigations',
 'Pollen and Nectar Yield Studies',
 'Bee Health Ecology Research',
 'Agrarian Health and Disease Surveys',
 'Bee Pollination Dynamics',
 'Plant Pathology and Bee Health Research',
 'Integrated Pest Management Studies',
 'Agricultural Ecosystem Health Research',
 'Pollinator Population Dynamics',
 'Crop Yield Improvement Research',
 'Bee Health and Disease Ecology',
 'Environmental Impact Assessments',
 'Sustaina

In [172]:
len(cluster_list)

31

## Deduplicate across neighborhoods.

In [176]:
user_prompt = f"""You are tasked with deduplicating a list of cluster names into a
smaller set of distinct cluster names. Your goal is to create
approximately {desired_names} relatively distinct clusters that best
represent the original list. You are helping to organize user behavior
data in order to improve safety, monitoring, and observability. Here are
the inputs: 
{cluster_list}

Number of distinct clusters to create: approximately {desired_names}
Follow these steps to complete the task:
1. Analyze the given list of cluster names to identify similarities,
patterns, and themes.
2. Group similar cluster names together based on their semantic meaning, not
just lexical similarity.
3. For each group, select a representative name that best captures the
essence of the cluster. This can be one of the original names or a new
name that summarizes the group effectively. Do not just pick the most
vague or generic name.
4. Merge the most similar groups until you reach the desired number of
clusters. Maintain as much specificity as possible while merging.
6. Ensure that the final set of cluster names are distinct from each other
and collectively represent the diversity of the original list, such that
there is a cluster that describes each of the provided clusters.
7. If you create new names for any clusters, make sure they are clear,
concise, and reflective of the contents they represent.
42
8. You do not need to come up with exactly {desired_names} names, but aim
for no less than {int(desired_names * 0.5)} and no more than {int(
desired_names * 1.5)}. Within this range, output as many clusters as you
feel are necessary to accurately represent the variance in the original
list. Avoid outputting duplicate or near-duplicate clusters.
9. Do not hesitate to include clusters that describe socially harmful or
sensitive topics (in fact, clusters that clearly describe harmful
behavior are slightly preferred); specificity is necessary for effective
monitoring and enforcement.
10. Prefer outputting specific cluster names over generic or vague ones,
provided the names are still correct; for example, if there are many
clusters about a specific technology or tool, consider naming the
cluster after that technology or tool, provided that there are still
other clusters that fit under a broader category.
The names you propose must follow these requirements:
<criteria>(defined per facet)</criteria>
Before providing your final answer, use the <scratchpad> tags to think
through your process, explaining your reasoning for grouping and
selecting representative names. Spend no more than a few paragraphs in
your scratchpad.
Present your final answer in the following format:

<answer>
1. [First cluster name]
2. [Second cluster name]
3. [Third cluster name]
...
N. [Nth cluster name]
</answer>
Remember, your goal is to create approximately {desired_names} relatively
distinct cluster names that best represent the original list. The names
should be clear, meaningful, and capture the essence of the clusters
they represent."""

In [177]:
assistant_prompt =f"""I understand. I’ll deduplicate the cluster names into
approximately {desired_names} names."""

In [178]:
chat_completion = client.chat.completions.create(
     messages=[
        {
            "role": "user",
            "content": user_prompt
        },
        {"role": "assistant", "content":assistant_prompt},

    ],
    model="gpt-3.5-turbo",
)

In [179]:
chat_completion.choices[0].message.content

"<scratchpad>\nAfter analyzing the list of cluster names provided, I can see that there are several distinct themes present. These themes revolve around agricultural research, bee health, pollination studies, and environmental impact assessments. \n\nI will group together similar cluster names that fall under these themes and select representative names that capture the essence of each group. I will then merge the most similar groups while maintaining specificity to ensure the final set of cluster names is distinct and collectively represents the diversity of the original list. I will prioritize specific and clear names over generic ones to accurately describe the content of each cluster.\n\nI will start by identifying the key themes and then consolidate them into a smaller set of representative cluster names that best capture the original list's variance and diversity.\n</scratchpad>\n\n<answer>\n1. Agricultural Disease Research\n2. Integrated Agricultural Research\n3. Sustainable Agr