In [1]:
import time
import anthropic
import pandas as pd

from tqdm import tqdm
from sklearn.metrics import f1_score
from utils import html_parsing_ncbi, html_parsing_n2c2, get_classification_report, get_digit, get_macro_average_f1

import os

# Read the Anthropic API key from the environment instead of embedding it in the notebook,
# so the secret never ends up in version control or in shared copies of this file.
# Raises KeyError if the ANTHROPIC_API_KEY environment variable is not set.
client = anthropic.Anthropic(
    api_key=os.environ["ANTHROPIC_API_KEY"]
)

# 1. NER (Named Entity Recognition)

## 1.1 NCBI-Disease Dataset

### 1.1.1 Inference

In [51]:
# Test sentences to run inference on (read from test_200.csv).
ncbi_df = pd.read_csv('data/NER/NCBI-disease/test_200.csv')
# Pool of few-shot examples; get_ner_ncbi_disease reads its 'text' and 'label_text' columns.
ncbi_example_df = pd.read_csv('data/NER/NCBI-disease/examples.csv')

In [52]:

# System prompt for the NCBI-disease NER task. It is bound to a task-specific name because
# later cells rebind the shared `system_message` global for other tasks; reading that global
# at call time would silently send the wrong instructions when cells are re-run out of order.
NCBI_NER_SYSTEM_MESSAGE = """You are a helpful assistant to perform the following task.
"TASK: the task is to extract disease entities in a sentence."
"INPUT: the input is a sentence."
"OUTPUT: the output is an HTML that highlights all the disease entities in the sentence. The highlighting should only use HTML tags <span style=\"background-color: #FFFF00\"> and </span> and no other tags."
"""
# Backward-compatible alias: this cell originally exposed the prompt as `system_message`.
system_message = NCBI_NER_SYSTEM_MESSAGE

def get_ner_ncbi_disease(sentence: str, shot: int = 0) -> "tuple[str, float]":
    """
    Get the NER results of NCBI-disease dataset from few-shot prompting.
    Args:
        sentence: the input sentence
        shot: the number of few-shot examples, taken in order from the first rows of `ncbi_example_df`
    Returns:
        a `(text, seconds)` tuple: the text of the first content block of the model response,
        and the elapsed time of the API call in seconds
    Raises:
        IndexError: if `shot` is larger than the number of rows in `ncbi_example_df`
    """

    prompt_parts = []
    # Only announce examples when at least one is included (the zero-shot prompt used to
    # start with a dangling "Here are some examples:"). The few-shot text is unchanged.
    if shot > 0:
        prompt_parts.append("Here are some examples:")
    for i in range(shot):
        example = ncbi_example_df.iloc[i]
        prompt_parts.append(f"<example>\nINPUT: {example['text']}\nOUTPUT: {example['label_text']}\n</example>\n")
    prompt_parts.append(f"INPUT: {sentence}\nOUTPUT: ")
    user_messages_with_examples = "".join(prompt_parts)

    # perf_counter is monotonic, so the measured latency cannot be skewed by system clock adjustments.
    time_start = time.perf_counter()
    response = client.messages.create(
        model="claude-3-opus-20240229",
        max_tokens=4096,
        system=NCBI_NER_SYSTEM_MESSAGE,
        messages=[
            {"role": "user", "content": user_messages_with_examples}
        ],
    )
    time_end = time.perf_counter()
    return response.content[0].text, time_end - time_start

In [65]:
# (output column, latency column, number of few-shot examples) for each prompting setting.
ncbi_shot_settings = (
    ('html_claude3_opus_one_shot', 'claude3_opus_one_shot_time', 1),
    ('html_claude3_opus_five_shot', 'claude3_opus_five_shot_time', 5),
    ('html_claude3_opus_ten_shot', 'claude3_opus_ten_shot_time', 10),
    ('html_claude3_opus_twenty_shot', 'claude3_opus_twenty_shot_time', 20),
)

# Query the model once per setting for every test row and store the answer and latency on that row.
for row_idx in tqdm(range(len(ncbi_df))):
    for html_col, time_col, n_shot in ncbi_shot_settings:
        ncbi_df.loc[row_idx, html_col], ncbi_df.loc[row_idx, time_col] = get_ner_ncbi_disease(ncbi_df.loc[row_idx, 'text'], n_shot)

100%|██████████| 71/71 [31:27<00:00, 26.58s/it]


**Note**: The index `i==89` is dropped due to the safety setting of Gemini, resulting in blocked response.

In [66]:
# Remove, in place, every row that has a missing value in any column (dropna's default behaviour).
ncbi_df.dropna(inplace=True)

### 1.1.2 Evaluation

In [51]:
# Optional: you can just load the llm output from the csv file instead of running the above code
# ncbi_df = pd.read_csv("data/NER/NCBI-disease/test_200_claude3_opus_results.csv")

In [67]:
# The first call supplies both the ground-truth labels and the one-shot predictions.
ncbi_df['gt_labels'], ncbi_df['claude3_opus_one_shot_labels'] = html_parsing_ncbi(ncbi_df, 'html_claude3_opus_one_shot')

# For the remaining settings only the predicted labels are kept; the first return value is discarded.
for html_col, label_col in (
    ('html_claude3_opus_five_shot', 'claude3_opus_five_shot_labels'),
    ('html_claude3_opus_ten_shot', 'claude3_opus_ten_shot_labels'),
    ('html_claude3_opus_twenty_shot', 'claude3_opus_twenty_shot_labels'),
):
    _, ncbi_df[label_col] = html_parsing_ncbi(ncbi_df, html_col)

In [68]:
# Report the 'strict' F1 of the 'default' class for each few-shot setting.
for shot_name, pred_col in (
    ('One', 'claude3_opus_one_shot_labels'),
    ('Five', 'claude3_opus_five_shot_labels'),
    ('Ten', 'claude3_opus_ten_shot_labels'),
    ('Twenty', 'claude3_opus_twenty_shot_labels'),
):
    strict_report = get_classification_report(ncbi_df, 'gt_labels', pred_col, 'strict')
    print(f"F1-Score {shot_name} Shot (Strict): {strict_report['default']['f1-score']}")

F1-Score One Shot (Strict): 0.7883817427385892
F1-Score Five Shot (Strict): 0.7598784194528875
F1-Score Ten Shot (Strict): 0.7481146304675717
F1-Score Twenty Shot (Strict): 0.7219796215429404


In [69]:
# Report the 'lenient' F1 of the 'default' class for each few-shot setting.
for shot_name, pred_col in (
    ('One', 'claude3_opus_one_shot_labels'),
    ('Five', 'claude3_opus_five_shot_labels'),
    ('Ten', 'claude3_opus_ten_shot_labels'),
    ('Twenty', 'claude3_opus_twenty_shot_labels'),
):
    lenient_report = get_classification_report(ncbi_df, 'gt_labels', pred_col, 'lenient')
    print(f"F1-Score {shot_name} Shot (Lenient): {lenient_report['default']['f1-score']}")

F1-Score One Shot (Lenient): 0.8741355463347165
F1-Score Five Shot (Lenient): 0.8662613981762918
F1-Score Ten Shot (Lenient): 0.8567119155354449
F1-Score Twenty Shot (Lenient): 0.8791848617176128


In [70]:
# Mean per-request latency recorded during inference, one line per few-shot setting.
for shot_name, time_col in (
    ('one', 'claude3_opus_one_shot_time'),
    ('five', 'claude3_opus_five_shot_time'),
    ('ten', 'claude3_opus_ten_shot_time'),
    ('twenty', 'claude3_opus_twenty_shot_time'),
):
    mean_seconds = ncbi_df[time_col].mean()
    print(f"Average Claude3 Opus {shot_name}-shot prediction time: {mean_seconds:.2f} seconds")

Average Claude3 Opus one-shot prediction time: 5.72 seconds
Average Claude3 Opus five-shot prediction time: 6.66 seconds
Average Claude3 Opus ten-shot prediction time: 6.74 seconds
Average Claude3 Opus twenty-shot prediction time: 7.25 seconds


In [71]:
# Save the inference results (model outputs, timings and parsed labels) without the index column.
# This is the file the optional "load the llm output" cell reads back.
ncbi_df.to_csv('data/NER/NCBI-disease/test_200_claude3_opus_results.csv', index=False)

## 1.2 2018 n2c2 Dataset

### 1.2.1 Inference

In [14]:
# Test sentences to run inference on (read from test_200.csv).
n2c2_df = pd.read_csv('data/NER/2018_n2c2/test_200.csv')
# Pool of few-shot examples; get_ner_2018_n2c2 reads its 'text' and 'label_text' columns.
n2c2_example_df = pd.read_csv('data/NER/2018_n2c2/examples.csv')

In [17]:
# System prompt for the 2018 n2c2 NER task. It is bound to a task-specific name because
# other cells rebind the shared `system_message` global for other tasks; reading that global
# at call time would silently send the wrong instructions when cells are re-run out of order.
# NOTE(review): the prompt says "disease entities" although the listed entity types are
# medication-related (Form, Route, ...). The text is left unchanged so the reported scores
# stay reproducible -- confirm whether the wording should be corrected.
N2C2_NER_SYSTEM_MESSAGE = """You are a helpful assistant to perform the following task.
"TASK: the task is to extract disease entities in a sentence. The entity type includes Form, Route, Frequency, Dosage, Strength, Duration, Reason, Ade, Drug."
"INPUT: the input is a sentence."
"OUTPUT: the output is an HTML that highlights all the disease entities in the sentence in different colors: Form(#FF0000), Route(#FFA500), Frequency(#FFFF00), Dosage(#00FF00), Strength(#0000FF), Duration(#800080), Reason(#FFC0CB), Ade(#964B00), Drug(#808080) in hex code. The highlighting should only use HTML tags <span style=\"background-color: #XXXXXX\"> and </span> and no other tags."
"""
# Backward-compatible alias: this cell originally exposed the prompt as `system_message`.
system_message = N2C2_NER_SYSTEM_MESSAGE

def get_ner_2018_n2c2(sentence: str, shot: int = 0) -> "tuple[str, float]":
    """
    Get the NER results of 2018 n2c2 dataset from few-shot prompting.
    Args:
        sentence: the input sentence
        shot: the number of few-shot examples, taken in order from the first rows of `n2c2_example_df`
    Returns:
        a `(text, seconds)` tuple: the text of the first content block of the model response,
        and the elapsed time of the API call in seconds
    Raises:
        IndexError: if `shot` is larger than the number of rows in `n2c2_example_df`
    """

    prompt_parts = []
    # Only announce examples when at least one is included (the zero-shot prompt used to
    # start with a dangling "Here are some examples:"). The few-shot text is unchanged.
    if shot > 0:
        prompt_parts.append("Here are some examples:")
    for i in range(shot):
        example = n2c2_example_df.iloc[i]
        prompt_parts.append(f"<example>\nINPUT: {example['text']}\nOUTPUT: {example['label_text']}\n</example>\n")
    prompt_parts.append(f"INPUT: {sentence}\nOUTPUT: ")
    user_messages_with_examples = "".join(prompt_parts)

    # perf_counter is monotonic, so the measured latency cannot be skewed by system clock adjustments.
    time_start = time.perf_counter()
    response = client.messages.create(
        model="claude-3-opus-20240229",
        max_tokens=4096,
        system=N2C2_NER_SYSTEM_MESSAGE,
        messages=[
            {"role": "user", "content": user_messages_with_examples}
        ],
    )
    time_end = time.perf_counter()

    return response.content[0].text, time_end - time_start

In [21]:
# (output column, latency column, number of few-shot examples) for each prompting setting.
n2c2_ner_shot_settings = (
    ('html_claude3_opus_one_shot', 'claude3_opus_one_shot_time', 1),
    ('html_claude3_opus_five_shot', 'claude3_opus_five_shot_time', 5),
    ('html_claude3_opus_ten_shot', 'claude3_opus_ten_shot_time', 10),
    ('html_claude3_opus_twenty_shot', 'claude3_opus_twenty_shot_time', 20),
)

# Query the model once per setting for every test row and store the answer and latency on that row.
for row_idx in tqdm(range(len(n2c2_df))):
    for html_col, time_col, n_shot in n2c2_ner_shot_settings:
        n2c2_df.loc[row_idx, html_col], n2c2_df.loc[row_idx, time_col] = get_ner_2018_n2c2(n2c2_df.loc[row_idx, 'text'], n_shot)

100%|██████████| 45/45 [25:35<00:00, 34.12s/it]


### 1.2.2 Evaluation

In [None]:
# Optional: you can just load the llm output from the csv file instead of running the above code
# n2c2_df = pd.read_csv("data/NER/2018_n2c2/test_200_claude3_opus_results.csv")

In [22]:
# The first call supplies both the ground-truth labels and the one-shot predictions.
n2c2_df['gt_labels'], n2c2_df['claude3_opus_one_shot_labels'] = html_parsing_n2c2(n2c2_df, 'html_claude3_opus_one_shot')

# For the remaining settings only the predicted labels are kept; the first return value is discarded.
for html_col, label_col in (
    ('html_claude3_opus_five_shot', 'claude3_opus_five_shot_labels'),
    ('html_claude3_opus_ten_shot', 'claude3_opus_ten_shot_labels'),
    ('html_claude3_opus_twenty_shot', 'claude3_opus_twenty_shot_labels'),
):
    _, n2c2_df[label_col] = html_parsing_n2c2(n2c2_df, html_col)

In [23]:
# Macro-averaged F1 computed from the 'strict' classification report, per few-shot setting.
for shot_name, pred_col in (
    ('One', 'claude3_opus_one_shot_labels'),
    ('Five', 'claude3_opus_five_shot_labels'),
    ('Ten', 'claude3_opus_ten_shot_labels'),
    ('Twenty', 'claude3_opus_twenty_shot_labels'),
):
    strict_report = get_classification_report(n2c2_df, 'gt_labels', pred_col, 'strict')
    print(f"F1 Score {shot_name} Shot (Strict): {get_macro_average_f1(strict_report)}")

F1 Score One Shot (Strict): 0.5258183871888957
F1 Score Five Shot (Strict): 0.5991548852716891
F1 Score Ten Shot (Strict): 0.6357379902742611
F1 Score Twenty Shot (Strict): 0.680198870700484


In [24]:
# Macro-averaged F1 computed from the 'lenient' classification report, per few-shot setting.
for shot_name, pred_col in (
    ('One', 'claude3_opus_one_shot_labels'),
    ('Five', 'claude3_opus_five_shot_labels'),
    ('Ten', 'claude3_opus_ten_shot_labels'),
    ('Twenty', 'claude3_opus_twenty_shot_labels'),
):
    lenient_report = get_classification_report(n2c2_df, 'gt_labels', pred_col, 'lenient')
    print(f"F1 Score {shot_name} Shot (Lenient): {get_macro_average_f1(lenient_report)}")

F1 Score One Shot (Lenient): 0.6452641366359297
F1 Score Five Shot (Lenient): 0.7519987616355439
F1 Score Ten Shot (Lenient): 0.7744719229166145
F1 Score Twenty Shot (Lenient): 0.7870065881569057


In [25]:
# Mean per-request latency recorded during inference, one line per few-shot setting.
for shot_name, time_col in (
    ('one', 'claude3_opus_one_shot_time'),
    ('five', 'claude3_opus_five_shot_time'),
    ('ten', 'claude3_opus_ten_shot_time'),
    ('twenty', 'claude3_opus_twenty_shot_time'),
):
    mean_seconds = n2c2_df[time_col].mean()
    print(f"Average Claude3 Opus {shot_name}-shot prediction time: {mean_seconds:.2f} seconds")

Average Claude3 Opus one-shot prediction time: 7.48 seconds
Average Claude3 Opus five-shot prediction time: 7.45 seconds
Average Claude3 Opus ten-shot prediction time: 7.68 seconds
Average Claude3 Opus twenty-shot prediction time: 10.00 seconds


In [26]:
# Save the inference results (model outputs, timings and parsed labels) without the index column.
# This is the file the optional "load the llm output" cell reads back.
n2c2_df.to_csv('data/NER/2018_n2c2/test_200_claude3_opus_results.csv', index=False)

# 2. RE (Relation Extraction)

## 2.1 2018 n2c2 Dataset

### 2.1.1 Inference

In [28]:
# Load the relation-extraction test set and few-shot example pool.
# NOTE: this rebinds `n2c2_df` / `n2c2_example_df`, which the NER section used for its own
# frames; cells from section 1.2 must not be re-run after this point without reloading.
n2c2_df = pd.read_csv('data/RE/2018_n2c2/test_200.csv')
n2c2_example_df = pd.read_csv('data/RE/2018_n2c2/examples.csv')

In [31]:
# System prompt for the 2018 n2c2 relation-extraction task. It is bound to a task-specific
# name because other cells rebind the shared `system_message` global for other tasks; reading
# that global at call time would silently send the wrong instructions when cells are re-run
# out of order.
N2C2_RE_SYSTEM_MESSAGE = """You are a helpful assistant to perform the following task.
"TASK: the task is to classify relations for a sentence."
"INPUT: the input is a sentence where the entities are labeled within [E${X}] and [E${X}/] in a sentence, where X is an integer representing an unique entity."
"OUTPUT: your task is to select one out of the nine types of relations ('STRENGTH-DRUG', 'ROUTE-DRUG', 'FREQUENCY-DRUG', 'FORM-DRUG', 'DOSAGE-DRUG', 'REASON-DRUG', 'DURATION-DRUG', 'ADE-DRUG', and 'No relation')."
"""
# Backward-compatible alias: this cell originally exposed the prompt as `system_message`.
system_message = N2C2_RE_SYSTEM_MESSAGE

def get_re_2018_n2c2(sentence: str, shot: int = 0) -> "tuple[str, float]":
    """
    Get the RE results of 2018 n2c2 dataset from few-shot prompting.
    Args:
        sentence: the input sentence
        shot: the number of few-shot examples, taken in order from the first rows of `n2c2_example_df`
    Returns:
        a `(text, seconds)` tuple: the text of the first content block of the model response,
        and the elapsed time of the API call in seconds
    Raises:
        IndexError: if `shot` is larger than the number of rows in `n2c2_example_df`
    """

    prompt_parts = []
    # Only announce examples when at least one is included (the zero-shot prompt used to
    # start with a dangling "Here are some examples:"). The few-shot text is unchanged.
    if shot > 0:
        prompt_parts.append("Here are some examples:")
    for i in range(shot):
        example = n2c2_example_df.iloc[i]
        prompt_parts.append(f"<example>\nINPUT: {example['text']}\nOUTPUT: {example['labels']}\n</example>\n")
    prompt_parts.append(f"INPUT: {sentence}\nOUTPUT: ")
    user_messages_with_examples = "".join(prompt_parts)

    # perf_counter is monotonic, so the measured latency cannot be skewed by system clock adjustments.
    time_start = time.perf_counter()
    response = client.messages.create(
        model="claude-3-opus-20240229",
        max_tokens=4096,
        system=N2C2_RE_SYSTEM_MESSAGE,
        messages=[
            {"role": "user", "content": user_messages_with_examples}
        ],
    )
    time_end = time.perf_counter()

    return response.content[0].text, time_end - time_start

In [34]:
# (output column, latency column, number of few-shot examples) for each prompting setting.
n2c2_re_shot_settings = (
    ('claude3_opus_one_shot', 'claude3_opus_one_shot_time', 1),
    ('claude3_opus_five_shot', 'claude3_opus_five_shot_time', 5),
    ('claude3_opus_ten_shot', 'claude3_opus_ten_shot_time', 10),
    ('claude3_opus_twenty_shot', 'claude3_opus_twenty_shot_time', 20),
)

# Query the model once per setting for every test row and store the answer and latency on that row.
for row_idx in tqdm(range(len(n2c2_df))):
    for answer_col, time_col, n_shot in n2c2_re_shot_settings:
        n2c2_df.loc[row_idx, answer_col], n2c2_df.loc[row_idx, time_col] = get_re_2018_n2c2(n2c2_df.loc[row_idx, 'text'], n_shot)

100%|██████████| 7/7 [02:06<00:00, 18.03s/it]


### 2.1.2 Evaluation

In [35]:
# get rid of ' ' if any
n2c2_df['claude3_opus_one_shot'] = n2c2_df['claude3_opus_one_shot'].apply(lambda x: x[1:-1] if "'" in x else x)
n2c2_df['claude3_opus_five_shot'] = n2c2_df['claude3_opus_five_shot'].apply(lambda x: x[1:-1] if "'" in x else x)
n2c2_df['claude3_opus_ten_shot'] = n2c2_df['claude3_opus_ten_shot'].apply(lambda x: x[1:-1] if "'" in x else x)
n2c2_df['claude3_opus_twenty_shot'] = n2c2_df['claude3_opus_twenty_shot'].apply(lambda x: x[1:-1] if "'" in x else x)

In [36]:
# get digit label while considering failed LLM outputs as 'No relation'
n2c2_df['labels'] = n2c2_df['labels'].apply(get_digit)
n2c2_df['claude3_opus_one_shot_labels'] = n2c2_df['claude3_opus_one_shot'].apply(get_digit)
n2c2_df['claude3_opus_five_shot_labels'] = n2c2_df['claude3_opus_five_shot'].apply(get_digit)
n2c2_df['claude3_opus_ten_shot_labels'] = n2c2_df['claude3_opus_ten_shot'].apply(get_digit)
n2c2_df['claude3_opus_twenty_shot_labels'] = n2c2_df['claude3_opus_twenty_shot'].apply(get_digit)

In [95]:
# Optional: you can just load the llm output from the csv file instead of running the above code
# n2c2_df = pd.read_csv("data/RE/2018_n2c2/test_200_claude3_opus_results.csv")

In [37]:
# Macro-averaged F1 of each few-shot setting against the ground-truth relation labels.
y_true = n2c2_df['labels'].tolist()
for shot_name, pred_col in (
    ('One', 'claude3_opus_one_shot_labels'),
    ('Five', 'claude3_opus_five_shot_labels'),
    ('Ten', 'claude3_opus_ten_shot_labels'),
    ('Twenty', 'claude3_opus_twenty_shot_labels'),
):
    y_pred = n2c2_df[pred_col].tolist()
    print(f"F1 Score {shot_name} Shot: {f1_score(y_true, y_pred, average='macro')}")

F1 Score One Shot: 0.6972465773472274
F1 Score Five Shot: 0.752918698343535
F1 Score Ten Shot: 0.8328024755093479
F1 Score Twenty Shot: 0.6729275366119646


In [50]:
# Mean per-request latency recorded during inference, one line per few-shot setting.
for shot_name, time_col in (
    ('one', 'claude3_opus_one_shot_time'),
    ('five', 'claude3_opus_five_shot_time'),
    ('ten', 'claude3_opus_ten_shot_time'),
    ('twenty', 'claude3_opus_twenty_shot_time'),
):
    mean_seconds = n2c2_df[time_col].mean()
    print(f"Average Claude3 Opus {shot_name}-shot prediction time: {mean_seconds:.2f} seconds")

Average Claude3 Opus one-shot prediction time: 4.31 seconds
Average Claude3 Opus five-shot prediction time: 4.75 seconds
Average Claude3 Opus ten-shot prediction time: 4.83 seconds
Average Claude3 Opus twenty-shot prediction time: 5.69 seconds


In [40]:
# Save the inference results (model outputs, timings and digit labels) without the index column.
# This is the file the optional "load the llm output" cell reads back.
n2c2_df.to_csv('data/RE/2018_n2c2/test_200_claude3_opus_results.csv', index=False)

## 2.2 GAD

### 2.2.1 Inference

In [41]:
# Test sentences to run inference on (read from test_200.csv).
gad_df = pd.read_csv('data/RE/GAD/test_200.csv')
# Pool of few-shot examples; get_re_gad reads its 'text' and 'labels' columns.
gad_example_df = pd.read_csv('data/RE/GAD/examples.csv')

In [42]:
# System prompt for the GAD relation-extraction task. It is bound to a task-specific name
# because other cells rebind the shared `system_message` global for other tasks; reading that
# global at call time would silently send the wrong instructions when cells are re-run out
# of order.
GAD_RE_SYSTEM_MESSAGE = """You are a helpful assistant to perform the following task.
"TASK: the task is to classify relations between a disease and a gene for a sentence."
"INPUT: the input is a sentence where the disease is labeled as @DISEASE$ and the gene is labeled as @GENE$ accordingly in a sentence. "
"OUTPUT: your task is to select one out of the two types of relations (0 and 1) for the gene and disease without any explanation or other characters: 
0, no relations 
1, has relations"
"""
# Backward-compatible alias: this cell originally exposed the prompt as `system_message`.
system_message = GAD_RE_SYSTEM_MESSAGE

def get_re_gad(sentence: str, shot: int = 0) -> "tuple[str, float]":
    """
    Get the RE results of GAD dataset from few-shot prompting.
    Args:
        sentence: the input sentence
        shot: the number of few-shot examples, taken in order from the first rows of `gad_example_df`
    Returns:
        a `(text, seconds)` tuple: the text of the first content block of the model response,
        and the elapsed time of the API call in seconds
    Raises:
        IndexError: if `shot` is larger than the number of rows in `gad_example_df`
    """

    prompt_parts = []
    # Only announce examples when at least one is included (the zero-shot prompt used to
    # start with a dangling "Here are some examples:"). The few-shot text is unchanged.
    if shot > 0:
        prompt_parts.append("Here are some examples:")
    for i in range(shot):
        example = gad_example_df.iloc[i]
        prompt_parts.append(f"<example>\nINPUT: {example['text']}\nOUTPUT: {example['labels']}\n</example>\n")
    prompt_parts.append(f"INPUT: {sentence}\nOUTPUT: ")
    user_messages_with_examples = "".join(prompt_parts)

    # perf_counter is monotonic, so the measured latency cannot be skewed by system clock adjustments.
    time_start = time.perf_counter()
    response = client.messages.create(
        model="claude-3-opus-20240229",
        max_tokens=4096,
        system=GAD_RE_SYSTEM_MESSAGE,
        messages=[
            {"role": "user", "content": user_messages_with_examples}
        ],
    )
    time_end = time.perf_counter()

    return response.content[0].text, time_end - time_start

In [43]:
# (output column, latency column, number of few-shot examples) for each prompting setting.
gad_shot_settings = (
    ('claude3_opus_one_shot', 'claude3_opus_one_shot_time', 1),
    ('claude3_opus_five_shot', 'claude3_opus_five_shot_time', 5),
    ('claude3_opus_ten_shot', 'claude3_opus_ten_shot_time', 10),
    ('claude3_opus_twenty_shot', 'claude3_opus_twenty_shot_time', 20),
)

# Query the model once per setting for every test row and store the answer and latency on that row.
for row_idx in tqdm(range(len(gad_df))):
    for answer_col, time_col, n_shot in gad_shot_settings:
        gad_df.loc[row_idx, answer_col], gad_df.loc[row_idx, time_col] = get_re_gad(gad_df.iloc[row_idx]['text'], n_shot)

100%|██████████| 200/200 [48:43<00:00, 14.62s/it]


### 2.2.2 Evaluation

In [44]:
# convert some strings to int while considering failed LLM outputs as 'No relation (0)'
gad_df['claude3_opus_one_shot'] = gad_df['claude3_opus_one_shot'].apply(lambda x: int(x) if x.isdigit() else 0)
gad_df['claude3_opus_five_shot'] = gad_df['claude3_opus_five_shot'].apply(lambda x: int(x) if x.isdigit() else 0)
gad_df['claude3_opus_ten_shot'] = gad_df['claude3_opus_ten_shot'].apply(lambda x: int(x) if x.isdigit() else 0)
gad_df['claude3_opus_twenty_shot'] = gad_df['claude3_opus_twenty_shot'].apply(lambda x: int(x) if x.isdigit() else 0)

In [126]:
# Optional: you can just load the llm output from the csv file instead of running the above code
# gad_df = pd.read_csv("data/RE/GAD/test_200_claude3_opus_results.csv")

In [45]:
# Macro-averaged F1 of each few-shot setting against the ground-truth GAD labels.
y_true = gad_df['labels'].tolist()
for shot_name, pred_col in (
    ('One', 'claude3_opus_one_shot'),
    ('Five', 'claude3_opus_five_shot'),
    ('Ten', 'claude3_opus_ten_shot'),
    ('Twenty', 'claude3_opus_twenty_shot'),
):
    y_pred = gad_df[pred_col].tolist()
    print(f"F1 Score {shot_name} Shot: {f1_score(y_true, y_pred, average='macro')}")

F1 Score One Shot: 0.4032118055555556
F1 Score Five Shot: 0.4498975299320462
F1 Score Ten Shot: 0.5693108974358975
F1 Score Twenty Shot: 0.4666666666666667


In [49]:
# Mean per-request latency recorded during inference, one line per few-shot setting.
for shot_name, time_col in (
    ('one', 'claude3_opus_one_shot_time'),
    ('five', 'claude3_opus_five_shot_time'),
    ('ten', 'claude3_opus_ten_shot_time'),
    ('twenty', 'claude3_opus_twenty_shot_time'),
):
    mean_seconds = gad_df[time_col].mean()
    print(f"Average Claude3 Opus {shot_name}-shot prediction time: {mean_seconds:.2f} seconds")

Average Claude3 Opus one-shot prediction time: 1.91 seconds
Average Claude3 Opus five-shot prediction time: 3.96 seconds
Average Claude3 Opus ten-shot prediction time: 4.30 seconds
Average Claude3 Opus twenty-shot prediction time: 4.43 seconds


In [48]:
# Save the inference results (integer predictions and timings) without the index column.
# This is the file the optional "load the llm output" cell reads back.
gad_df.to_csv('data/RE/GAD/test_200_claude3_opus_results.csv', index=False)