In [None]:
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import FaithfulnessMetric
import deepeval

import os
# Credentials.
# Prefer a key that is already exported in the environment: the original cell
# unconditionally overwrote OPENAI_API_KEY with the placeholder string, which
# clobbers a real key and encourages pasting secrets into the notebook.
# The placeholder is kept as a fallback so the template still documents what
# has to be supplied.
openai_token = os.environ.get("OPENAI_API_KEY") or "YOUR OPENAI TOKEN"
os.environ["OPENAI_API_KEY"] = openai_token

# NOTE(review): CONFIDENT_API_KEY is assumed to be the environment variable
# that holds the Confident AI / DeepEval key — confirm against the DeepEval docs.
deepeval.login_with_confident_api_key(
    os.environ.get("CONFIDENT_API_KEY") or "YOUR DEEPEVAL TOKEN"
)

In [None]:
def flatten(xss):
    """Concatenate the items of every iterable in ``xss`` into one flat list.

    Exactly one level of nesting is removed and the original order is kept.
    """
    flat = []
    for xs in xss:
        flat.extend(xs)
    return flat

def remove_thoughts_section(text):
    """Return the part of ``text`` that follows the last closing thought tag.

    ``text`` is split on ``</thoughts>`` and every resulting chunk is split
    again on ``</think>``; the final piece is returned. If neither tag occurs,
    ``text`` is returned unchanged.
    """
    pieces_per_chunk = [chunk.split("</think>") for chunk in text.split("</thoughts>")]
    # flatten() keeps the order, so the last piece is whatever comes after the
    # final closing tag of either kind.
    all_pieces = flatten(pieces_per_chunk)
    return all_pieces[-1]

In [None]:
import statistics
from deepeval import evaluate
from deepeval.dataset import EvaluationDataset
from datasets import Dataset

# Metric instance shared by the evaluation cell below.
faithfulness_metric = FaithfulnessMetric(
    include_reason=True,
    #model="gpt-3.5-turbo"
)

# Name of the evaluation dataset and the directory holding one sub-directory
# per evaluated model. dsBaseDir keeps its trailing slash because other cells
# build paths from it.
usedDataset = "deepseek_COT_raft"
dsBaseDir = f"YOUR_HOME_DIR/datasets/evaluation/{usedDataset}/"

# os.listdir() returns entries in arbitrary order; sort them so that picking a
# model by index gives the same model on every machine and every run.
# os.path.join is used so the directory test does not depend on the trailing
# slash of dsBaseDir.
models = sorted(
    entry
    for entry in os.listdir(dsBaseDir)
    if os.path.isdir(os.path.join(dsBaseDir, entry))
)

# Evaluation results keyed by model name; filled by the evaluation cell.
allResults = {}

# When False, the evaluation cell strips the thoughts section from the
# actual and expected outputs before building test cases.
IncludeThoughts = False
print(models)

In [None]:
# Hard-coded list of model directory names. Assigning it replaces whatever
# `models` held before, so the index used to select a model refers to a fixed,
# known order.
models = [
    'qwen-32b-basicDS',
    'meditron-7b-fp16',
    'meditron-7b-basicDS',
    'llama31-basicDS',
    'llama31-advDS-COT',
    'llama3.1-8b-instruct-fp16',
    'qwen-32b-COT-q4_K_M',
]

In [None]:
# Select the model to evaluate by its position in `models`.
usedModel = models[4]
print("Evaluating Faithfullness for", usedModel)
# os.path.join yields the same path as plain concatenation while dsBaseDir ends
# with a slash, and still works if that slash is ever dropped.
datasetDir = os.path.join(dsBaseDir, usedModel)

ds = Dataset.load_from_disk(datasetDir)

# Build one LLMTestCase per dataset row. Unless IncludeThoughts is set, both
# the actual and the expected output are reduced to the text that follows the
# last closing thought tag.
testcases = []
for e in ds:
    actual_output = e["actual_output"]
    expected_output = e["expected_output"]
    if not IncludeThoughts:
        actual_output = remove_thoughts_section(actual_output)
        expected_output = remove_thoughts_section(expected_output)
    testcases.append(
        LLMTestCase(
            input=e["input"],
            actual_output=actual_output,
            expected_output=expected_output,
            retrieval_context=e["retrieval_context"],
        )
    )

# Named runIdentifier instead of `id` so the builtin id() is not shadowed for
# the rest of the kernel session.
runIdentifier = f"new-{usedModel}-{usedDataset}-{'WithThoughts' if IncludeThoughts else 'WithoutThoughts'}"
dataset = EvaluationDataset(test_cases=testcases)

try:
    evalRes = evaluate(dataset, metrics=[faithfulness_metric], throttle_value=30, identifier=runIdentifier, print_results=False)
    allResults[usedModel] = evalRes
except Exception as exc:
    # Still best-effort (the notebook keeps running), but report what failed.
    # A bare `except:` would also swallow KeyboardInterrupt, making a long
    # evaluation impossible to cancel.
    print(f"error while evaluating {usedModel}: {exc!r}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches

#models = []
#for model_name in allResults.keys():
#    m = model_name
#    e = allResults[m]
#    scores = []
#    for r in e.test_results:
#        scores.append(r.metrics_data[0].score)
#    avg = round(statistics.fmean(scores), 2)
#    print(model_name)
#    print("Avg:", avg)
#    models[m] = avg

#print(models)
# Extracting values
# Hard-coded average faithfulness score per model (presumably copied from
# earlier evaluation runs — verify before reusing the figure).
# Stored under its own name: rebinding `models` to a dict here would break the
# evaluation cell on re-run, because it indexes `models` as a list.
modelAvgs = {'llama3.1-8b-instruct-fp16':0.96, 'meditron-7b-fp16':0.99, 'DeepSeek-R1-Distill-Qwen-32B':0.94, 'llama31-basicDS':0.98, 'meditron-7b-basicDS':0.95, 'qwen-32b-basicDS':0.94, 'llama31-advDS-COT':0.94, 'meditron-7b-COT':0.94, 'qwen-32b-COT-q4_K_M':0.95}
# One colour per group of three bars: base models, basic-dataset models,
# advanced-dataset models (same order as the dict above).
colors = ['#1b9e77'] * 3 + ['#d95f02'] * 3 + ['#7570b3'] * 3
assert len(colors) == len(modelAvgs), "need exactly one colour per model"

print(modelAvgs)
modelNames = list(modelAvgs)
avgs = list(modelAvgs.values())

# Plotting
x = np.arange(len(modelNames))
width = 0.25

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(x, avgs, width, label='Average Faithfulness Score', color=colors)

# Labels & Titles
ax.set_xlabel("Models")
ax.set_ylabel("Score")
ax.set_title("Average Faithfulness Score for Different Models")
ax.set_xticks(x)
ax.set_xticklabels(modelNames, rotation=45, ha="right")

# Vertical separators between the three groups and a caption box per group.
additionalY = 0.3
ax.axvline(x = 2.5, color = '0.5')
ax.axvline(x = 5.5, color = '0.5')
ax.text((-0.65 + 2.5)/3, additionalY, 'Base Models', style='italic', bbox={'facecolor': '0.8', 'alpha': 1, 'pad': 5})
ax.text((-1.3 + 2.5)/3 + 3, additionalY, 'Basic Dataset Models', style='italic', bbox={'facecolor': '0.8', 'alpha': 1, 'pad': 5})
ax.text((-1.3 + 2.5)/3 + 6, additionalY, 'Advanced Dataset Models', style='italic', bbox={'facecolor': '0.8', 'alpha': 1, 'pad': 5})

# Write each score at the top of its bar, starting half a bar width left of
# the bar centre.
for i, score in enumerate(avgs):
    ax.text(i-(0.5*width), score, str(score))

base_model_patch = mpatches.Patch(color='#1b9e77', label='Base Model')
basic_ds_patch = mpatches.Patch(color='#d95f02', label='Basic Dataset')
adv_ds_patch = mpatches.Patch(color='#7570b3', label='Advanced Dataset')
ax.legend(handles=[base_model_patch, basic_ds_patch, adv_ds_patch], loc="lower center")

ax.set_ylim(0, 1)
fig.tight_layout()
fig.savefig("YOUR_HOME_DIR/datasets/evaluation/deepseek_COT_raft/Faithfullness_Score_AVG.png")
plt.show()


In [None]:
# Print a short summary for every evaluation run stored in allResults:
# the model name, the run's confident_link, and the mean of the first metric's
# score over all test results, rounded to two decimals.
for model_name, run in allResults.items():
    first_metric_scores = [result.metrics_data[0].score for result in run.test_results]
    # Compute the mean before printing anything for this model.
    mean_score = round(statistics.fmean(first_metric_scores), 2)
    print(model_name)
    print(run.confident_link)
    print("Avg:", mean_score)

In [None]:
#hengwen-DeepSeek-R1-Distill-Qwen-32B-q4_k_m
#Avg: 0.94
#https://app.confident-ai.com/project/cm8948kaq1y5bh4ws6cui71c1/evaluation/test-runs/cm8hfqlb10vnzxnpht3t9so24

#meditron-7b-COT
#Avg: 0.94
#https://app.confident-ai.com/project/cm8948kaq1y5bh4ws6cui71c1/evaluation/test-runs/cm8hho3aj16dixnphkjp3hq7f