# Instantiate model and datasets

In [5]:
import os
import pandas as pd
from openai import OpenAI

# Set up OpenAI credentials.
# Prefer exporting OPENAI_API_KEY in the environment before starting the
# kernel. The placeholder is only written when no key is set, so running this
# cell never replaces a real key with the placeholder string.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = "fill in API key"

# Define model configurations
base_model_name = "gpt-4o-mini-2024-07-18"


def _load_used_in_analysis(csv_path):
    """Read one dataset CSV and keep only the rows flagged for analysis.

    Args:
        csv_path: Path to a CSV file that has a ``used_in_analysis`` column.

    Returns:
        The rows of the CSV whose ``used_in_analysis`` value equals ``True``,
        with their original index labels.
    """
    frame = pd.read_csv(csv_path)
    # Element-wise comparison against True (rather than plain boolean
    # indexing) so rows with a missing flag are simply excluded.
    return frame[frame["used_in_analysis"].eq(True)]


# Load datasets from CSV files and subset to rows where used_in_analysis is True
# Assuming these are the paths to your CSV files - adjust as needed
sept_news_text_df = _load_used_in_analysis(
    "datasets/latest_news/latest_news_memorization.csv"
)
sept_news_text_rephrased_df = _load_used_in_analysis(
    "datasets/latest_news/latest_news_rephrased.csv"
)
sept_news_text_date_changed_df = _load_used_in_analysis(
    "datasets/latest_news/latest_news_date_changed.csv"
)

# Evaluate baseline model

In [9]:
import os
import pandas as pd
from openai import OpenAI
from tqdm import tqdm

# Initialize the OpenAI client used by evaluate_dataset below.
# No api_key argument is passed here; presumably the client picks up the
# OPENAI_API_KEY environment variable set in the first cell (confirm against
# the openai package documentation).
client = OpenAI()


def evaluate_dataset(df, model_name, temperature=0, max_tokens=100, seed=42):
    """
    Evaluate a dataset using OpenAI API

    Each row's ``prompt`` is sent as a single user message through the
    module-level ``client``; the whitespace-stripped reply is recorded next to
    the row's reference ``answer``.

    Args:
        df: DataFrame with ``prompt`` and ``answer`` columns.
        model_name: Model identifier passed to the API and copied into the
            ``model`` column of the output.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion length limit forwarded to the API.
        seed: Seed forwarded to the API. Defaults to 42, the value that was
            previously hard-coded.

    Returns:
        DataFrame with the columns ``prompt``, ``raw_response``, ``answer``
        and ``model``, one row per input row in input order. The columns are
        present even when ``df`` is empty, so downstream column lookups do not
        fail. Rows whose request or reply handling raised an exception have
        ``raw_response`` set to ``"<error>"``.
    """
    output_columns = ["prompt", "raw_response", "answer", "model"]
    results = []

    for _, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating"):
        messages = [{"role": "user", "content": row["prompt"]}]

        try:
            response = client.chat.completions.create(
                model=model_name,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                seed=seed,
            )
            answer = response.choices[0].message.content.strip()
        except Exception as e:
            # Best-effort: keep evaluating the remaining rows and mark this
            # one with a sentinel so failures stay visible in the output.
            print(f"Error during API call: {e}")
            answer = "<error>"

        results.append(
            {
                "prompt": row["prompt"],
                "raw_response": answer,
                "answer": row["answer"],
                "model": model_name,
            }
        )

    # Passing the columns explicitly keeps the schema stable for empty input;
    # pd.DataFrame([]) alone would produce a frame with no columns at all.
    return pd.DataFrame(results, columns=output_columns)


# Run the baseline model over every dataset variant, keyed by variant name
datasets = dict(
    original=sept_news_text_df,
    rephrased=sept_news_text_rephrased_df,
    date_changed=sept_news_text_date_changed_df,
)

# Maps each variant name to the DataFrame returned by evaluate_dataset
results = {}
for name in datasets:
    df = datasets[name]
    print(f"\nEvaluating {name} dataset...")
    results[name] = evaluate_dataset(
        df, base_model_name, temperature=0, max_tokens=100
    )


Evaluating original dataset...


Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating: 100%|██████████| 50/50 [00:23<00:00,  2.11it/s]



Evaluating rephrased dataset...


Evaluating: 100%|██████████| 50/50 [01:17<00:00,  1.54s/it]



Evaluating date_changed dataset...


Evaluating: 100%|██████████| 50/50 [00:22<00:00,  2.22it/s]


## Calculate accuracy

### Exact method

In [None]:
# Calculate accuracy for each dataset
accuracy_results = []

for dataset_name, df in results.items():
    # Normalize both sides identically before comparing: cast to str, trim
    # surrounding whitespace, lowercase. The model reply was already stripped
    # when it was recorded, so stripping the reference answer too prevents
    # stray whitespace in the dataset from causing a false mismatch.
    predicted = df["raw_response"].astype(str).str.strip().str.lower()
    expected = df["answer"].astype(str).str.strip().str.lower()

    # Calculate exact match accuracy (fraction of rows that match).
    # Rows recorded as "<error>" count as mismatches unless the reference
    # answer is literally "<error>".
    accuracy = (predicted == expected).mean()

    accuracy_results.append(
        {"model": base_model_name, "dataset": dataset_name, "accuracy": accuracy}
    )

# Create accuracy DataFrame
accuracy_df = pd.DataFrame(accuracy_results)
accuracy_df