In [None]:
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import GEval
import deepeval

import os
# Placeholder credentials: fill these in locally and never commit real values.
openai_token = "YOUR OPENAI TOKEN"
confident_token = "YOUR DEEPEVAL TOKEN"

# Export the OpenAI key through the process environment.
os.environ.update(OPENAI_API_KEY=openai_token)

# Log in to deepeval with the Confident API key.
deepeval.login_with_confident_api_key(confident_token)

In [None]:
def flatten(xss):
    """Concatenate the items of every inner iterable of ``xss`` into one flat list.

    Only one level of nesting is removed; the order of the items is preserved.
    """
    flat = []
    for inner in xss:
        flat.extend(inner)
    return flat

def remove_thoughts_section(text):
    """Return the part of ``text`` that follows the last closing thoughts marker.

    Both ``</thoughts>`` and ``</think>`` are treated as markers. When neither
    occurs, ``text`` is returned unchanged.
    """
    segments = []
    for outer in text.split("</thoughts>"):
        segments.append(outer.split("</think>"))
    # The final piece is whatever comes after the last marker of either kind.
    return flatten(segments)[-1]

In [None]:
import statistics
from deepeval import evaluate
from deepeval.dataset import EvaluationDataset
from datasets import Dataset

# LLM-judged metric that compares each test case's actual output to its expected output.
correctness_metric = GEval(
    name="Correctness",
    criteria="Determine if the 'actual output' is correct based on the 'expected output'.",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
)

# Name of the evaluation dataset and the directory holding one sub-directory per model.
usedDataset = "deepseek_COT_raft"
dsBaseDir = f"YOUR_HOME_DIR/datasets/evaluation/{usedDataset}/"

# os.listdir returns entries in arbitrary order. Sorting keeps the indices printed
# below stable between runs and machines, so an index picked from this list keeps
# pointing at the same model.
models = sorted(
    entry
    for entry in os.listdir(dsBaseDir)
    if os.path.isdir(os.path.join(dsBaseDir, entry))
)

# Evaluation results collected per model name.
allResults = {}

# When False, the thoughts section is stripped from the outputs before they are scored.
IncludeThoughts = False

# Show the index of every model directory so one can be selected by index.
for idx, m in enumerate(models):
    print(idx, m)

In [None]:
# Pick the model to evaluate by its index in the `models` list.
usedModel = models[8]
print("Evaluating Correctness for", usedModel)

# Load the stored evaluation records for that model.
datasetDir = dsBaseDir + usedModel
ds = Dataset.load_from_disk(datasetDir)

# Build one deepeval test case per record.
testcases = []
for e in ds:
    actual_output = e["actual_output"]
    expected_output = e["expected_output"]
    if not IncludeThoughts:
        # Score only the text that follows the thoughts section.
        actual_output = remove_thoughts_section(actual_output)
        expected_output = remove_thoughts_section(expected_output)
    test_case = LLMTestCase(
        input=e["input"],
        actual_output=actual_output,
        expected_output=expected_output,
        retrieval_context=e["retrieval_context"],
    )
    testcases.append(test_case)

# NOTE(review): `id` shadows the builtin of the same name for the rest of the
# session; the name is kept so that anything referring to it keeps working.
thoughtsTag = "WithThoughts" if IncludeThoughts else "WithoutThoughts"
id = f"new-{usedModel}-{usedDataset}-{thoughtsTag}"
dataset = EvaluationDataset(test_cases=testcases)

# Run the correctness metric over all test cases and keep the result under the model name.
evalRes = evaluate(
    dataset,
    metrics=[correctness_metric],
    throttle_value=2,
    identifier=id,
    print_results=False,
)
allResults[usedModel] = evalRes


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches

# Hard-coded average correctness score per model, in plotting order.
# The dict has its own name on purpose: `models` is the list of model directories
# that the evaluation cell indexes with an integer, and rebinding it to this dict
# made re-running that cell fail with a KeyError.
modelScores = {'llama3.1-8b-instruct-fp16':0.53, 'meditron-7b-fp16':0.14, 'DeepSeek-R1-Distill-Qwen-32B':0.45, 'llama31-basicDS':0.3, 'meditron-7b-basicDS':0.23, 'qwen-32b-basicDS':0.32, 'llama31-advDS-COT':0.5, 'meditron-7b-COT':0.36, 'qwen-32b-COT-q4_K_M':0.58}
# One colour per bar, in groups of three that match the legend entries below.
colors = ['#1b9e77'] * 3 + ['#d95f02'] * 3 + ['#7570b3'] * 3

print(modelScores)
modelNames = list(modelScores.keys())
avgs = list(modelScores.values())

# Plotting
x = np.arange(len(modelScores))
width = 0.25

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(x, avgs, width, label='Average Correctness Score', color=colors)

# Labels & Titles
ax.set_xlabel("Models")
ax.set_ylabel("Score")
ax.set_title("Average Correctness Score for Different Models")
ax.set_xticks(x)
ax.set_xticklabels(modelNames, rotation=45, ha="right")

# Write each score as text at the top of its bar.
for i in range(len(x)):
    ax.text(i-(0.5*width),avgs[i],avgs[i])

ax.set_ylim(0, 0.65)
# Fake a break in the y-axis: the tick placed at 0.66 is labelled "1".
ax.set_yticks([0, 0.2, 0.4, 0.6, 0.66])
ax.set_yticklabels(["0", "0.2", "0.4", "0.6", "1"])
ax.spines['top'].set_visible(False)
# Mark the break with two dashed lines, one at y=0.6 and one just below the "1" tick.
ax.plot([-0.5, 8.75], [0.65, 0.66], "k--", lw=1)
ax.plot([-0.5, 8.75], [0.6, 0.6], "k--", lw=1)

# Legend entries explaining the three bar colours.
base_model_patch = mpatches.Patch(color='#1b9e77', label='Base Model')
basic_ds_patch = mpatches.Patch(color='#d95f02', label='Basic Dataset')
adv_ds_patch = mpatches.Patch(color='#7570b3', label='Advanced Dataset')
ax.legend(handles=[base_model_patch, basic_ds_patch, adv_ds_patch], loc="upper center")

plt.tight_layout()
plt.savefig("YOUR_HOME_DIR/datasets/evaluation/deepseek_COT_raft/Correctness_Score_AVG.png")
plt.show()
