In [103]:
import time
from typing import Tuple

import pandas as pd
import vertexai
from google.cloud.aiplatform_v1beta1.types.content import SafetySetting
from sklearn.metrics import f1_score
from tqdm import tqdm
from vertexai.preview.generative_models import GenerativeModel, GenerationConfig, Part, Content

from utils import html_parsing_ncbi, html_parsing_n2c2, get_classification_report, get_digit, get_macro_average_f1

# Point the Vertex AI SDK at the GCP project and create the model object that
# every task function below opens its chat sessions from.
vertexai.init(project='xxx-xxxxxxx-xxxxxx')
chat_model = GenerativeModel('gemini-pro')

# Generation settings passed with every request in this notebook.
config = GenerationConfig(
    max_output_tokens=4096,
    candidate_count=1,
    temperature=0.0,
    top_p=0.95,
    top_k=1,
)

# One BLOCK_NONE threshold per harm category, in the original order.
safety_settings = [
    SafetySetting({"category": harm_category, "threshold": "BLOCK_NONE"})
    for harm_category in (
        "HARM_CATEGORY_UNSPECIFIED",
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

# 1. NER (Named Entity Recognition)

## 1.1 NCBI-Disease Dataset

### 1.1.1 Inference

In [2]:
# Sentences to run inference on, and the rows used as few-shot examples.
ncbi_df, ncbi_example_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/NER/NCBI-disease/test_200.csv',
        'data/NER/NCBI-disease/examples.csv',
    )
)

In [35]:
system_message = """You are a helpful assistant to perform the following task.
"TASK: the task is to extract disease entities in a sentence."
"INPUT: the input is a sentence."
"OUTPUT: the output is an HTML that highlights all the disease entities in the sentence. The highlighting should only use HTML tags <span style=\"background-color: #FFFF00\"> and </span> and no other tags."
"""

def get_ner_ncbi_disease(
    sentence: str,
    shot: int = 0,
    system_prompt: str = system_message,
) -> Tuple[str, float]:
    """
    Get the NER results of NCBI-disease dataset from few-shot prompting.

    Args:
        sentence: the input sentence
        shot: the number of few-shot examples, taken in order from the top of
            `ncbi_example_df`
        system_prompt: the task instructions sent as the first user turn. The
            default is captured when this cell runs, so later cells that
            rebind the global `system_message` for other tasks cannot change
            the prompt used here.
    Returns:
        A `(text, seconds)` tuple: the text of the model response and the time
        spent inside the `send_message` call.
    Raises:
        IndexError: if `shot` is larger than the number of rows in
            `ncbi_example_df`.
    """
    if shot > len(ncbi_example_df):
        raise IndexError(
            f"shot={shot} exceeds the {len(ncbi_example_df)} available few-shot examples"
        )

    # The instructions go in as a user turn followed by a canned model
    # acknowledgement; each example then adds one user/model exchange.
    history = [
        Content(role="user", parts=[Part.from_text(system_prompt)]),
        Content(role="model", parts=[Part.from_text("Understood.")]),
    ]
    for i in range(shot):
        example = ncbi_example_df.iloc[i]
        history.extend([
            Content(role="user", parts=[Part.from_text(example['text'])]),
            Content(role="model", parts=[Part.from_text(example['label_text'])]),
        ])

    # Init a new chat session for every sentence
    chat = chat_model.start_chat(history=history)

    # perf_counter() is monotonic, so the measured duration is not affected by
    # system clock adjustments the way a time.time() difference can be.
    time_start = time.perf_counter()
    response = chat.send_message(
        sentence,
        generation_config=config,
        safety_settings=safety_settings,
        stream=False,
    )
    elapsed = time.perf_counter() - time_start

    return response.text, elapsed

In [None]:
# Run every sentence through the 1/5/10/20-shot prompts and store the HTML
# output and the request time for each setting.
for i in tqdm(range(len(ncbi_df))):
    if (i + 1) % 15 == 0: # Gemini API has a quota limit
        time.sleep(65)
    sentence = ncbi_df.loc[i, 'text']
    for html_col, time_col, shot in (
        ('html_gemini_pro_one_shot', 'gemini_pro_one_shot_time', 1),
        ('html_gemini_pro_five_shot', 'gemini_pro_five_shot_time', 5),
        ('html_gemini_pro_ten_shot', 'gemini_pro_ten_shot_time', 10),
        ('html_gemini_pro_twenty_shot', 'gemini_pro_twenty_shot_time', 20),
    ):
        try:
            html, seconds = get_ner_ncbi_disease(sentence, shot)
        except ValueError as err:
            # NOTE(review): the Vertex AI SDK is expected to report a blocked or
            # empty response as a ValueError (or a subclass) - confirm against
            # the installed SDK version. Without this guard one blocked
            # sentence aborts the whole run; with it the cells stay missing
            # and the row is removed by the `dropna` cell that follows.
            tqdm.write(f"Row {i}, {shot}-shot: no usable response ({err}); leaving the cells empty.")
            html, seconds = None, float('nan')
        ncbi_df.loc[i, html_col], ncbi_df.loc[i, time_col] = html, seconds

**Note**: The row at index `i == 89` is dropped because Gemini's safety filter blocked the response, so no output is available for that sentence.

In [None]:
# Keep only the rows in which every column has a value.
ncbi_df = ncbi_df.dropna(axis=0, how='any')

### 1.1.2 Evaluation

In [51]:
# Optional: you can just load the llm output from the csv file instead of running the above code
# ncbi_df = pd.read_csv("data/NER/NCBI-disease/test_200_gemini_pro_results.csv")

In [53]:
# Parse each HTML output column into a label column. The parser also returns
# the gold labels; they are stored once, from the first (one-shot) call.
for position, (html_col, label_col) in enumerate((
    ('html_gemini_pro_one_shot', 'gemini_pro_one_shot_labels'),
    ('html_gemini_pro_five_shot', 'gemini_pro_five_shot_labels'),
    ('html_gemini_pro_ten_shot', 'gemini_pro_ten_shot_labels'),
    ('html_gemini_pro_twenty_shot', 'gemini_pro_twenty_shot_labels'),
)):
    gold_labels, predicted_labels = html_parsing_ncbi(ncbi_df, html_col)
    if position == 0:
        ncbi_df['gt_labels'] = gold_labels
    ncbi_df[label_col] = predicted_labels

In [59]:
# F1 taken from the 'strict' classification report, one line per few-shot setting.
for shot_label, pred_col in (
    ('One', 'gemini_pro_one_shot_labels'),
    ('Five', 'gemini_pro_five_shot_labels'),
    ('Ten', 'gemini_pro_ten_shot_labels'),
    ('Twenty', 'gemini_pro_twenty_shot_labels'),
):
    report = get_classification_report(ncbi_df, 'gt_labels', pred_col, 'strict')
    print(f"F1-Score {shot_label} Shot (Strict): {report['default']['f1-score']}")

F1-Score One Shot (Strict): 0.5240083507306891
F1-Score Five Shot (Strict): 0.5077881619937694
F1-Score Ten Shot (Strict): 0.5686274509803921
F1-Score Twenty Shot (Strict): 0.6544342507645259


In [60]:
# F1 taken from the 'lenient' classification report, one line per few-shot setting.
for shot_label, pred_col in (
    ('One', 'gemini_pro_one_shot_labels'),
    ('Five', 'gemini_pro_five_shot_labels'),
    ('Ten', 'gemini_pro_ten_shot_labels'),
    ('Twenty', 'gemini_pro_twenty_shot_labels'),
):
    report = get_classification_report(ncbi_df, 'gt_labels', pred_col, 'lenient')
    print(f"F1-Score {shot_label} Shot (Lenient): {report['default']['f1-score']}")

F1-Score One Shot (Lenient): 0.6555323590814196
F1-Score Five Shot (Lenient): 0.616822429906542
F1-Score Ten Shot (Lenient): 0.6503267973856209
F1-Score Twenty Shot (Lenient): 0.7798165137614678


In [61]:
# Mean request time per few-shot setting.
for shot_label, time_col in (
    ('one', 'gemini_pro_one_shot_time'),
    ('five', 'gemini_pro_five_shot_time'),
    ('ten', 'gemini_pro_ten_shot_time'),
    ('twenty', 'gemini_pro_twenty_shot_time'),
):
    mean_seconds = ncbi_df[time_col].mean()
    print(f"Average Gemini Pro {shot_label}-shot prediction time: {mean_seconds:.2f} seconds")

Average Gemini Pro one-shot prediction time: 1.40 seconds
Average Gemini Pro five-shot prediction time: 1.11 seconds
Average Gemini Pro ten-shot prediction time: 1.19 seconds
Average Gemini Pro twenty-shot prediction time: 1.27 seconds


In [62]:
# Write the frame (model outputs, parsed labels and timings) to disk without the index.
ncbi_df.to_csv(
    path_or_buf='data/NER/NCBI-disease/test_200_gemini_pro_results.csv',
    index=False,
)

## 1.2 2018 n2c2 Dataset

### 1.2.1 Inference

In [63]:
# Sentences to run inference on, and the rows used as few-shot examples.
n2c2_df, n2c2_example_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/NER/2018_n2c2/test_200.csv',
        'data/NER/2018_n2c2/examples.csv',
    )
)

In [65]:
# NOTE(review): the prompt says "disease entities" although the entity types it
# lists are Form, Route, ..., Drug. The wording is left untouched because
# editing it would change the model outputs - confirm whether it is intended.
system_message = """You are a helpful assistant to perform the following task.
"TASK: the task is to extract disease entities in a sentence. The entity type includes Form, Route, Frequency, Dosage, Strength, Duration, Reason, Ade, Drug."
"INPUT: the input is a sentence."
"OUTPUT: the output is an HTML that highlights all the disease entities in the sentence in different colors: Form(#FF0000), Route(#FFA500), Frequency(#FFFF00), Dosage(#00FF00), Strength(#0000FF), Duration(#800080), Reason(#FFC0CB), Ade(#964B00), Drug(#808080) in hex code. The highlighting should only use HTML tags <span style=\"background-color: #XXXXXX\"> and </span> and no other tags."
"""
def get_ner_2018_n2c2(
    sentence: str,
    shot: int = 0,
    system_prompt: str = system_message,
) -> Tuple[str, float]:
    """
    Get the NER results of 2018 n2c2 dataset from few-shot prompting.

    Args:
        sentence: the input sentence
        shot: the number of few-shot examples, taken in order from the top of
            `n2c2_example_df`
        system_prompt: the task instructions sent as the first user turn. The
            default is captured when this cell runs, so later cells that
            rebind the global `system_message` for other tasks cannot change
            the prompt used here.
    Returns:
        A `(text, seconds)` tuple: the text of the model response and the time
        spent inside the `send_message` call.
    Raises:
        IndexError: if `shot` is larger than the number of rows in
            `n2c2_example_df`.
    """
    # NOTE: `n2c2_example_df` is looked up when the function is called, and the
    # RE section later rebinds that name to a different file; call this
    # function before that cell has run.
    if shot > len(n2c2_example_df):
        raise IndexError(
            f"shot={shot} exceeds the {len(n2c2_example_df)} available few-shot examples"
        )

    # The instructions go in as a user turn followed by a canned model
    # acknowledgement; each example then adds one user/model exchange.
    history = [
        Content(role="user", parts=[Part.from_text(system_prompt)]),
        Content(role="model", parts=[Part.from_text("Understood.")]),
    ]
    for i in range(shot):
        example = n2c2_example_df.iloc[i]
        history.extend([
            Content(role="user", parts=[Part.from_text(example['text'])]),
            Content(role="model", parts=[Part.from_text(example['label_text'])]),
        ])

    # Init a new chat session for every sentence
    chat = chat_model.start_chat(history=history)

    # perf_counter() is monotonic, so the measured duration is not affected by
    # system clock adjustments the way a time.time() difference can be.
    time_start = time.perf_counter()
    response = chat.send_message(
        sentence,
        generation_config=config,
        safety_settings=safety_settings,
        stream=False,
    )
    elapsed = time.perf_counter() - time_start

    return response.text, elapsed

In [None]:
# Run every sentence through the 1/5/10/20-shot prompts and store the HTML
# output and the request time for each setting.
for row_idx in tqdm(range(len(n2c2_df))):
    # Gemini API has a quota limit: pause before every 15th row.
    if (row_idx + 1) % 15 == 0:
        time.sleep(65)
    sentence = n2c2_df.loc[row_idx, 'text']
    for html_col, time_col, shot_count in (
        ('html_gemini_pro_one_shot', 'gemini_pro_one_shot_time', 1),
        ('html_gemini_pro_five_shot', 'gemini_pro_five_shot_time', 5),
        ('html_gemini_pro_ten_shot', 'gemini_pro_ten_shot_time', 10),
        ('html_gemini_pro_twenty_shot', 'gemini_pro_twenty_shot_time', 20),
    ):
        html, seconds = get_ner_2018_n2c2(sentence, shot_count)
        n2c2_df.loc[row_idx, html_col] = html
        n2c2_df.loc[row_idx, time_col] = seconds

### 1.2.2 Evaluation

In [None]:
# Optional: you can just load the llm output from the csv file instead of running the above code
# n2c2_df = pd.read_csv("data/NER/2018_n2c2/test_200_gemini_pro_results.csv")

In [67]:
# Parse each HTML output column into a label column. The parser also returns
# the gold labels; they are stored once, from the one-shot call.
gold_labels, one_shot_labels = html_parsing_n2c2(n2c2_df, 'html_gemini_pro_one_shot')
n2c2_df['gt_labels'] = gold_labels
# 'genimi' is a historical misspelling. The column is kept because the
# evaluation cells read it; the correctly spelled column is added so the
# saved file follows the same naming as the other shot settings.
n2c2_df['genimi_pro_one_shot_labels'] = one_shot_labels
n2c2_df['gemini_pro_one_shot_labels'] = one_shot_labels

for html_col, label_col in (
    ('html_gemini_pro_five_shot', 'gemini_pro_five_shot_labels'),
    ('html_gemini_pro_ten_shot', 'gemini_pro_ten_shot_labels'),
    ('html_gemini_pro_twenty_shot', 'gemini_pro_twenty_shot_labels'),
):
    _, n2c2_df[label_col] = html_parsing_n2c2(n2c2_df, html_col)

In [78]:
# Macro-averaged F1 computed from the 'strict' classification report.
for shot_label, pred_col in (
    ('One', 'genimi_pro_one_shot_labels'),  # 'genimi' (sic): the name the parsing cell creates
    ('Five', 'gemini_pro_five_shot_labels'),
    ('Ten', 'gemini_pro_ten_shot_labels'),
    ('Twenty', 'gemini_pro_twenty_shot_labels'),
):
    report = get_classification_report(n2c2_df, 'gt_labels', pred_col, 'strict')
    print(f"F1 Score {shot_label} Shot (Strict): {get_macro_average_f1(report)}")

F1 Score One Shot (Strict): 0.23317316663015586
F1 Score Five Shot (Strict): 0.4421635785671199
F1 Score Ten Shot (Strict): 0.5049045701653464
F1 Score Twenty Shot (Strict): 0.5661715078264877


In [79]:
# Macro-averaged F1 computed from the 'lenient' classification report.
for shot_label, pred_col in (
    ('One', 'genimi_pro_one_shot_labels'),  # 'genimi' (sic): the name the parsing cell creates
    ('Five', 'gemini_pro_five_shot_labels'),
    ('Ten', 'gemini_pro_ten_shot_labels'),
    ('Twenty', 'gemini_pro_twenty_shot_labels'),
):
    report = get_classification_report(n2c2_df, 'gt_labels', pred_col, 'lenient')
    print(f"F1 Score {shot_label} Shot (Lenient): {get_macro_average_f1(report)}")

F1 Score One Shot (Lenient): 0.3443192195776736
F1 Score Five Shot (Lenient): 0.5711306844055251
F1 Score Ten Shot (Lenient): 0.6482713143251391
F1 Score Twenty Shot (Lenient): 0.694603986327699


In [80]:
# Mean request time per few-shot setting.
for shot_label, time_col in (
    ('one', 'gemini_pro_one_shot_time'),
    ('five', 'gemini_pro_five_shot_time'),
    ('ten', 'gemini_pro_ten_shot_time'),
    ('twenty', 'gemini_pro_twenty_shot_time'),
):
    mean_seconds = n2c2_df[time_col].mean()
    print(f"Average Gemini Pro {shot_label}-shot prediction time: {mean_seconds:.2f} seconds")

Average Gemini Pro one-shot prediction time: 1.73 seconds
Average Gemini Pro five-shot prediction time: 2.30 seconds
Average Gemini Pro ten-shot prediction time: 2.11 seconds
Average Gemini Pro twenty-shot prediction time: 2.85 seconds


In [81]:
# Write the frame (model outputs, parsed labels and timings) to disk without the index.
n2c2_df.to_csv(
    path_or_buf='data/NER/2018_n2c2/test_200_gemini_pro_results.csv',
    index=False,
)

# 2. RE (Relation Extraction)

## 2.1 2018 n2c2 Dataset

### 2.1.1 Inference

In [83]:
# Sentences to classify and the rows used as few-shot examples for the RE task.
# NOTE: this rebinds `n2c2_df` and `n2c2_example_df`, the names the NER section
# used; `get_ner_2018_n2c2` reads `n2c2_example_df` at call time, so it will
# see the RE examples once this cell has run.
n2c2_df, n2c2_example_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/RE/2018_n2c2/test_200.csv',
        'data/RE/2018_n2c2/examples.csv',
    )
)

In [85]:
system_message = """You are a helpful assistant to perform the following task.
"TASK: the task is to classify relations for a sentence."
"INPUT: the input is a sentence where the entities are labeled within [E${X}] and [E${X}/] in a sentence, where X is an integer representing an unique entity."
"OUTPUT: your task is to select one out of the nine types of relations ('STRENGTH-DRUG', 'ROUTE-DRUG', 'FREQUENCY-DRUG', 'FORM-DRUG', 'DOSAGE-DRUG', 'REASON-DRUG', 'DURATION-DRUG', 'ADE-DRUG', and 'No relation')."
"""
def get_re_2018_n2c2(
    sentence: str,
    shot: int = 0,
    system_prompt: str = system_message,
) -> Tuple[str, float]:
    """
    Get the RE results of 2018 n2c2 dataset from few-shot prompting.

    Args:
        sentence: the input sentence
        shot: the number of few-shot examples, taken in order from the top of
            `n2c2_example_df`
        system_prompt: the task instructions sent as the first user turn. The
            default is captured when this cell runs, so later cells that
            rebind the global `system_message` for other tasks cannot change
            the prompt used here.
    Returns:
        A `(text, seconds)` tuple: the text of the model response and the time
        spent inside the `send_message` call.
    Raises:
        IndexError: if `shot` is larger than the number of rows in
            `n2c2_example_df`.
    """
    if shot > len(n2c2_example_df):
        raise IndexError(
            f"shot={shot} exceeds the {len(n2c2_example_df)} available few-shot examples"
        )

    # The instructions go in as a user turn followed by a canned model
    # acknowledgement; each example then adds one user/model exchange.
    history = [
        Content(role="user", parts=[Part.from_text(system_prompt)]),
        Content(role="model", parts=[Part.from_text("Understood.")]),
    ]
    for i in range(shot):
        example = n2c2_example_df.iloc[i]
        history.extend([
            Content(role="user", parts=[Part.from_text(example['text'])]),
            # str() mirrors get_re_gad and keeps a non-string label from being
            # handed to Part.from_text.
            Content(role="model", parts=[Part.from_text(str(example['labels']))]),
        ])

    # Init a new chat session for every sentence
    chat = chat_model.start_chat(history=history)

    # perf_counter() is monotonic, so the measured duration is not affected by
    # system clock adjustments the way a time.time() difference can be.
    time_start = time.perf_counter()
    response = chat.send_message(
        sentence,
        generation_config=config,
        safety_settings=safety_settings,
        stream=False,
    )
    elapsed = time.perf_counter() - time_start

    return response.text, elapsed

In [None]:
# Run every sentence through the 1/5/10/20-shot prompts and store the raw
# answer and the request time for each setting.
for row_idx in tqdm(range(len(n2c2_df))):
    # Gemini API has a quota limit: pause before every 15th row.
    if (row_idx + 1) % 15 == 0:
        time.sleep(65)
    sentence = n2c2_df.loc[row_idx, 'text']
    for answer_col, time_col, shot_count in (
        ('gemini_pro_one_shot', 'gemini_pro_one_shot_time', 1),
        ('gemini_pro_five_shot', 'gemini_pro_five_shot_time', 5),
        ('gemini_pro_ten_shot', 'gemini_pro_ten_shot_time', 10),
        ('gemini_pro_twenty_shot', 'gemini_pro_twenty_shot_time', 20),
    ):
        answer, seconds = get_re_2018_n2c2(sentence, shot_count)
        n2c2_df.loc[row_idx, answer_col] = answer
        n2c2_df.loc[row_idx, time_col] = seconds

### 2.1.2 Evaluation

In [93]:
def _strip_quotes(raw):
    """
    Remove surrounding whitespace and surrounding single quotes from a model answer.

    The previous `x[1:-1] if "'" in x` form chopped the first and last
    character whenever a quote appeared anywhere in the answer, which mangled
    answers carrying extra whitespace or an inner apostrophe, and it raised on
    non-string cells. Non-string values are returned unchanged here.
    """
    if not isinstance(raw, str):
        return raw
    return raw.strip().strip("'")

# get rid of ' ' if any
for answer_col in (
    'gemini_pro_one_shot',
    'gemini_pro_five_shot',
    'gemini_pro_ten_shot',
    'gemini_pro_twenty_shot',
):
    n2c2_df[answer_col] = n2c2_df[answer_col].apply(_strip_quotes)

In [94]:
# get digit label while considering failed LLM outputs as 'No relation'
# The gold column is overwritten in place, so it is converted only while it is
# still non-numeric; re-running this cell would otherwise feed the converted
# values back through get_digit.
# NOTE(review): assumes get_digit returns numeric codes - confirm in utils.
if not pd.api.types.is_numeric_dtype(n2c2_df['labels']):
    n2c2_df['labels'] = n2c2_df['labels'].apply(get_digit)

# Predictions go into separate *_labels columns, leaving the raw answers intact.
for answer_col in (
    'gemini_pro_one_shot',
    'gemini_pro_five_shot',
    'gemini_pro_ten_shot',
    'gemini_pro_twenty_shot',
):
    n2c2_df[f'{answer_col}_labels'] = n2c2_df[answer_col].apply(get_digit)

In [95]:
# Optional: instead of running the inference and post-processing cells above, load the saved LLM outputs (which already include the digit labels) from the CSV file
# n2c2_df = pd.read_csv("data/RE/2018_n2c2/test_200_gemini_pro_results.csv")

In [112]:
# Macro-averaged F1 of each few-shot setting against the gold labels.
y_true = n2c2_df['labels'].tolist()
for shot_label, pred_col in (
    ('One', 'gemini_pro_one_shot_labels'),
    ('Five', 'gemini_pro_five_shot_labels'),
    ('Ten', 'gemini_pro_ten_shot_labels'),
    ('Twenty', 'gemini_pro_twenty_shot_labels'),
):
    y_pred = n2c2_df[pred_col].tolist()
    print(f"F1 Score {shot_label} Shot: {f1_score(y_true, y_pred, average='macro')}")

F1 Score One Shot: 0.21323799601174073
F1 Score Five Shot: 0.2757285615539049
F1 Score Ten Shot: 0.30601313216234705
F1 Score Twenty Shot: 0.4119489003723716


In [113]:
# Mean request time per few-shot setting.
for shot_label, time_col in (
    ('one', 'gemini_pro_one_shot_time'),
    ('five', 'gemini_pro_five_shot_time'),
    ('ten', 'gemini_pro_ten_shot_time'),
    ('twenty', 'gemini_pro_twenty_shot_time'),
):
    mean_seconds = n2c2_df[time_col].mean()
    print(f"Average Gemini Pro {shot_label}-shot prediction time: {mean_seconds:.2f} seconds")

Average Gemini Pro one-shot prediction time: 0.40 seconds
Average Gemini Pro five-shot prediction time: 0.44 seconds
Average Gemini Pro ten-shot prediction time: 0.46 seconds
Average Gemini Pro twenty-shot prediction time: 0.55 seconds


In [114]:
# Write the frame (model answers, digit labels and timings) to disk without the index.
n2c2_df.to_csv(
    path_or_buf='data/RE/2018_n2c2/test_200_gemini_pro_results.csv',
    index=False,
)

## 2.2 GAD

### 2.2.1 Inference

In [134]:
# Sentences to classify and the rows used as few-shot examples.
gad_df, gad_example_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/RE/GAD/test_200.csv',
        'data/RE/GAD/examples.csv',
    )
)

In [135]:
system_message = """You are a helpful assistant to perform the following task.
"TASK: the task is to classify relations between a disease and a gene for a sentence."
"INPUT: the input is a sentence where the disease is labeled as @DISEASE$ and the gene is labeled as @GENE$ accordingly in a sentence. "
"OUTPUT: your task is to select one out of the two types of relations (0 and 1) for the gene and disease without any explanation or other characters: 
0, no relations 
1, has relations"
"""
def get_re_gad(
    sentence: str,
    shot: int = 0,
    system_prompt: str = system_message,
) -> Tuple[str, float]:
    """
    Get the RE results of GAD dataset from few-shot prompting.

    Args:
        sentence: the input sentence
        shot: the number of few-shot examples, taken in order from the top of
            `gad_example_df`
        system_prompt: the task instructions sent as the first user turn. The
            default is captured when this cell runs, so a later rebinding of
            the global `system_message` cannot change the prompt used here.
    Returns:
        A `(text, seconds)` tuple: the text of the model response and the time
        spent inside the `send_message` call.
    Raises:
        IndexError: if `shot` is larger than the number of rows in
            `gad_example_df`.
    """
    if shot > len(gad_example_df):
        raise IndexError(
            f"shot={shot} exceeds the {len(gad_example_df)} available few-shot examples"
        )

    # The instructions go in as a user turn followed by a canned model
    # acknowledgement; each example then adds one user/model exchange.
    history = [
        Content(role="user", parts=[Part.from_text(system_prompt)]),
        Content(role="model", parts=[Part.from_text("Understood.")]),
    ]
    for i in range(shot):
        example = gad_example_df.iloc[i]
        history.extend([
            Content(role="user", parts=[Part.from_text(example['text'])]),
            # The label is converted to text before it is used as the model turn.
            Content(role="model", parts=[Part.from_text(str(example['labels']))]),
        ])

    # Init a new chat session for every sentence
    chat = chat_model.start_chat(history=history)

    # perf_counter() is monotonic, so the measured duration is not affected by
    # system clock adjustments the way a time.time() difference can be.
    time_start = time.perf_counter()
    response = chat.send_message(
        sentence,
        generation_config=config,
        safety_settings=safety_settings,
        stream=False,
    )
    elapsed = time.perf_counter() - time_start

    return response.text, elapsed

In [None]:
# Run every sentence through the 1/5/10/20-shot prompts and store the raw
# answer and the request time for each setting.
for row_idx in tqdm(range(len(gad_df))):
    # Gemini API has a quota limit: pause before every 15th row.
    if (row_idx + 1) % 15 == 0:
        time.sleep(65)
    sentence = gad_df.iloc[row_idx]['text']
    for answer_col, time_col, shot_count in (
        ('gemini_pro_one_shot', 'gemini_pro_one_shot_time', 1),
        ('gemini_pro_five_shot', 'gemini_pro_five_shot_time', 5),
        ('gemini_pro_ten_shot', 'gemini_pro_ten_shot_time', 10),
        ('gemini_pro_twenty_shot', 'gemini_pro_twenty_shot_time', 20),
    ):
        answer, seconds = get_re_gad(sentence, shot_count)
        gad_df.loc[row_idx, answer_col] = answer
        gad_df.loc[row_idx, time_col] = seconds

### 2.2.2 Evaluation

In [125]:
def _to_relation_label(raw):
    """
    Turn one model answer into an integer label, using 0 for anything unusable.

    String answers are stripped of surrounding whitespace first, so an answer
    such as "1" followed by a newline is read as 1 instead of falling back to
    0. Values that are already numeric (for example after this cell has run
    once) are passed through int(); anything int() rejects becomes 0.
    """
    if isinstance(raw, str):
        text = raw.strip()
        return int(text) if text.isdecimal() else 0
    try:
        return int(raw)
    except (TypeError, ValueError):
        return 0

# convert some strings to int while considering failed LLM outputs as 'No relation (0)'
for answer_col in (
    'gemini_pro_one_shot',
    'gemini_pro_five_shot',
    'gemini_pro_ten_shot',
    'gemini_pro_twenty_shot',
):
    gad_df[answer_col] = gad_df[answer_col].apply(_to_relation_label)

In [126]:
# Optional: instead of running the inference and conversion cells above, load the saved LLM outputs (already converted to integer predictions) from the CSV file
# gad_df = pd.read_csv("data/RE/GAD/test_200_gemini_pro_results.csv")

In [127]:
# Macro-averaged F1 of each few-shot setting against the gold labels.
y_true = gad_df['labels'].tolist()
for shot_label, pred_col in (
    ('One', 'gemini_pro_one_shot'),
    ('Five', 'gemini_pro_five_shot'),
    ('Ten', 'gemini_pro_ten_shot'),
    ('Twenty', 'gemini_pro_twenty_shot'),
):
    y_pred = gad_df[pred_col].tolist()
    print(f"F1 Score {shot_label} Shot: {f1_score(y_true, y_pred, average='macro')}")

F1 Score One Shot: 0.4600403659143928
F1 Score Five Shot: 0.4615140956604371
F1 Score Ten Shot: 0.5416887008637405
F1 Score Twenty Shot: 0.513307205614898


In [128]:
# Mean request time per few-shot setting.
for shot_label, time_col in (
    ('one', 'gemini_pro_one_shot_time'),
    ('five', 'gemini_pro_five_shot_time'),
    ('ten', 'gemini_pro_ten_shot_time'),
    ('twenty', 'gemini_pro_twenty_shot_time'),
):
    mean_seconds = gad_df[time_col].mean()
    print(f"Average Gemini Pro {shot_label}-shot prediction time: {mean_seconds:.2f} seconds")

Average Gemini Pro one-shot prediction time: 0.34 seconds
Average Gemini Pro five-shot prediction time: 0.33 seconds
Average Gemini Pro ten-shot prediction time: 0.40 seconds
Average Gemini Pro twenty-shot prediction time: 0.50 seconds


In [129]:
# Write the frame (integer predictions and timings) to disk without the index.
gad_df.to_csv(
    path_or_buf='data/RE/GAD/test_200_gemini_pro_results.csv',
    index=False,
)