### Setup

In [2]:
import os  # explicit: os.environ is read below; do not rely on the star import leaking `os`
import sys

sys.path.append("../")  # Add parent directory to Python path
from models.llm import *

# Models under evaluation. The API key is read from the environment so it never
# appears in the notebook; a missing OPENAI_API_KEY raises KeyError here.
# NOTE(review): positional arguments look like (model name, provider, API key,
# base URL) -- confirm against models.llm.ModelConfig.
llms = [
    ModelConfig("gpt-4o-mini", "openai", os.environ["OPENAI_API_KEY"], "https://api.openai.com/v1"),
]

### Load data

In [4]:
import pandas as pd

# Load the generated scenarios; the path is relative to the working directory.
df = pd.read_csv("../../outputs/value_action_generation.csv")

# Distinct entries of the "value" column, in order of first appearance.
values = pd.unique(df["value"]).tolist()

### TASK 1
Given the scenario, identify the relevant values

In [None]:
# TODO: add compatibility with multiple prompt strategies


def evaluate_scenario_task1(row, model, value):
    """Ask ``model`` whether the scenario in ``row`` is an example of ``value``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the
            "polarity", "scenario" and "ground_truth" fields.
        model: Object exposing ``name`` and ``chat(history, gen_conf=...)``;
            ``chat`` must return a pair whose first item is the reply.
        value: Candidate value the scenario is tested against.

    Returns:
        dict with keys "model", "ground_truth", "value", "polarity" and "ans".
        "ans" is the reply exactly as returned by ``model.chat``.
    """
    polarity = row["polarity"]
    scenario = row["scenario"]
    # score_task1 only recognises yes/no answers, so the prompt must request
    # that format explicitly; without it the model replies in free-form
    # sentences and every row ends up scored as "other".
    prompt = f"""
    Is the following scenario a {polarity} example of {value}?
    Scenario: {scenario}
    Answer with a single word: yes or no.
    """
    hist = [{"role": "user", "content": prompt}]
    # The second element returned by chat() is not used here.
    ans, _ = model.chat(hist, gen_conf={})
    return {
        "model": model.name,
        "ground_truth": row["ground_truth"],
        "value": value,
        "polarity": polarity,
        "ans": ans,
    }

def score_task1(row, ans):
    """Classify one task-1 answer as a confusion-matrix label.

    The row counts as a positive case when its "ground_truth" equals the
    ``(value, polarity)`` pair that was asked about.

    Args:
        row: Mapping-like record with "ground_truth", "value" and "polarity".
        ans: The model's reply. Matching ignores case, surrounding whitespace,
            quotes and trailing "." / "!".

    Returns:
        "tp"  - positive case answered yes
        "tn"  - negative case answered no
        "fn"  - positive case answered no  (a missed positive)
        "fp"  - negative case answered yes (a spurious positive)
        "other" - the reply is neither yes nor no (including non-string replies)
    """
    # NOTE(review): a "ground_truth" cell loaded from CSV is a string and will
    # never compare equal to this tuple unless it is parsed first -- confirm
    # the column's format.
    is_positive_case = row["ground_truth"] == (row["value"], row["polarity"])

    # Tolerate cosmetic variation such as "Yes." or " no\n"; anything that is
    # not a string (None, NaN from a reloaded CSV) falls through to "other".
    verdict = ans.strip(" \t\r\n.!\"'").lower() if isinstance(ans, str) else None

    if verdict == "yes":
        return "tp" if is_positive_case else "fp"
    if verdict == "no":
        return "fn" if is_positive_case else "tn"
    return "other"


# Query every configured model about every candidate value for every scenario:
# one chat call per (scenario row, model, value) combination.
res = [
    evaluate_scenario_task1(scenario_row, llm, candidate_value)
    for _row_label, scenario_row in df.iterrows()
    for llm in llms
    for candidate_value in values
]

# Persist the raw answers before any scoring is applied.
df_task1 = pd.DataFrame(data=res)
df_task1.to_csv(path_or_buf="data/task1_results.csv", index=False)

In [None]:
# Label every stored answer via score_task1, then overwrite the results file so
# the new "score" column is saved alongside the raw answers.
df_task1["score"] = df_task1.apply(
    lambda record: score_task1(record, record["ans"]),
    axis="columns",
)
df_task1.to_csv(path_or_buf="data/task1_results.csv", index=False)

# F1 score
def f1_score(df):
    """Compute the F1 score from the "score" labels of a results frame.

    F1 = tp / (tp + 0.5 * (fp + fn)). Rows labelled anything other than
    "tp", "fp" or "fn" (e.g. "tn", "other") do not enter the formula.

    Args:
        df: DataFrame with a "score" column holding the per-row labels.

    Returns:
        The F1 score as a float, or 0.0 when there are no "tp", "fp" or "fn"
        rows at all (the score is undefined there; returning 0.0 avoids a
        ZeroDivisionError on empty or all-"other" input).
    """
    labels = df["score"]
    tp = int((labels == "tp").sum())
    fp = int((labels == "fp").sum())
    fn = int((labels == "fn").sum())

    denominator = tp + 0.5 * (fp + fn)
    if denominator == 0:
        return 0.0
    return tp / denominator

# One F1 value per configured model, computed on that model's rows only.
# TODO: extend to include different prompt strategies
res = {
    llm.name: f1_score(df_task1[df_task1["model"] == llm.name])
    for llm in llms
}

print(res)

# Store the scores as a two-column table: model name and its F1.
df_f1 = pd.DataFrame(list(res.items()), columns=["model", "f1"])
df_f1.to_csv(path_or_buf="data/task1_f1.csv", index=False)

### TASK 2
Given the scenario and a value, identify the spans of text that support the value

In [None]:
def create_prompt_task2(scenario, value):
    """Build the task-2 prompt asking which text spans support ``value``.

    Args:
        scenario: Scenario text to embed in the prompt.
        value: Value whose supporting spans are requested.

    Returns:
        The prompt string: a leading newline, the question line, the scenario
        line, and a final indented empty line (each line indented by four
        spaces).
    """
    prompt_lines = [
        "",
        f'    Which text spans support the value "{value}"?',
        f"    Scenario: {scenario}",
        "    ",
    ]
    return "\n".join(prompt_lines)

def evaluate_scenario_task2(row, model, value):
    """Send the task-2 prompt for one (scenario, value) pair to ``model``.

    Args:
        row: Mapping-like record providing the "scenario" field.
        model: Object exposing ``name`` and ``chat(history, gen_conf=...)``;
            ``chat`` must return a pair whose first item is the reply.
        value: Value whose supporting text spans are requested.

    Returns:
        dict with keys "model", "value" and "ans" (the reply as returned).
    """
    user_turn = {
        "role": "user",
        "content": create_prompt_task2(row["scenario"], value),
    }
    # The second element returned by chat() is not used here.
    reply, _unused = model.chat([user_turn], gen_conf={})
    return {"model": model.name, "value": value, "ans": reply}

def score_task2(row, ans):
    """Placeholder scorer for task 2; scoring is not implemented yet.

    Both arguments are currently ignored.

    Returns:
        The literal string "not implemented".
    """
    placeholder = "not implemented"
    return placeholder

# Ask every configured model for supporting spans, for every candidate value
# of every scenario: one chat call per (scenario row, model, value) combination.
res = [
    evaluate_scenario_task2(scenario_row, llm, candidate_value)
    for _row_label, scenario_row in df.iterrows()
    for llm in llms
    for candidate_value in values
]

# Persist the raw answers; no scoring is applied to them here.
df_task2 = pd.DataFrame(data=res)
df_task2.to_csv(path_or_buf="data/task2_results.csv", index=False)

### TASK 3
TODO: Task 3 has not been defined yet.