In [None]:
import os
# Hugging Face credentials and cache locations for everything that runs below.
# A token that is already exported in the environment wins over the placeholder,
# so a real credential is never clobbered and never has to be pasted into the notebook.
hf_token = (
    os.environ.get("HF_TOKEN")
    or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
    or "HUGGINGFACE TOKEN HERE"
)
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token
os.environ["HF_TOKEN"] = hf_token
os.environ['HF_HOME'] = 'YOUR_HOME_DIR/.cache/huggingface/'
# Spelling fixed: this was previously set under the misspelled name 'TRANSFOMERS_CACHE'.
os.environ['TRANSFORMERS_CACHE'] = 'YOUR_HOME_DIR/.cache/huggingface/'

In [None]:
from datasets import Dataset
import re

def flatten(xss):
    """Concatenate the items of every iterable in ``xss`` into one flat list.

    Only one level of nesting is removed; the order of the items is preserved.
    """
    flat = []
    for xs in xss:
        flat.extend(xs)
    return flat

def remove_thoughts_section(text):
    """Return the part of ``text`` that follows the last closing reasoning tag.

    Both ``</thoughts>`` and ``</think>`` are recognised. Everything up to and
    including the last occurrence of either tag is dropped; a text containing
    neither tag is returned unchanged. Whitespace after the tag is kept.
    """
    # rpartition yields the whole string as the tail when the tag is absent.
    after_thoughts = text.rpartition("</thoughts>")[2]
    return after_thoughts.rpartition("</think>")[2]

In [None]:
from deepeval.scorer import Scorer
import statistics

# Compute BERTScore (precision / recall / F1) for every model's saved evaluation
# dataset and collect the per-entry scores in allResults[model][metric].
usedDataset = "deepseek_COT_raft"
dsBaseDir = f"YOUR_HOME_DIR/datasets/evaluation/{usedDataset}/"

# Each sub-directory of dsBaseDir is one model's saved dataset. Plain files are
# skipped: the summary figure is written into this same directory further down,
# and on a re-run it would otherwise be handed to Dataset.load_from_disk.
# Sorted so the processing order does not depend on the file system.
models = sorted(
    entry for entry in os.listdir(dsBaseDir)
    if os.path.isdir(os.path.join(dsBaseDir, entry))
)

allResults = {}

# One scorer is enough; it was already shared by all entries of a model.
scorer = Scorer()

for usedModel in models:
    print("Calculating BERTScore for ", usedModel)
    datasetDir = os.path.join(dsBaseDir, usedModel)

    ds = Dataset.load_from_disk(datasetDir)

    results = {}  # metric name -> flat list of scores over all entries of this model
    data = []     # raw bert_score result per entry; reset for every model, so after
                  # the loop it holds the entries of the last model processed
    for e in ds:
        # Score only the final answers: the reasoning section is stripped from both sides.
        bertScore = scorer.bert_score(
            references=remove_thoughts_section(e["expected_output"]),
            predictions=remove_thoughts_section(e["actual_output"]),
            #model="microsoft/deberta-xlarge-mnli"
        )
        data.append(bertScore)
        # Grow the per-metric lists in place instead of copying them on every entry.
        for k, values in bertScore.items():
            results.setdefault(k, []).extend(values)

    for k in results.keys():
        print(f"{k}: {statistics.fmean(results[k])}")

    allResults[usedModel] = results

In [None]:
#llama31-advDS-COT
#bert-precision: 0.35718521179289237
#bert-recall: 0.2912410410235123
#bert-f1: 0.32043151276148096

#meditron-7b-COT
#bert-precision: 0.29772718418009425
#bert-recall: 0.1973740966344366
#bert-f1: 0.24325300633375133

#qwen-32b-COT-q4_K_M
#bert-precision: 0.25752965741011563
#bert-recall: 0.32418028234827273
#bert-f1: 0.29016731679439545

In [None]:
# Reduce every model's per-entry score lists to one mean value per metric.
allAverages = {
    modelName: {
        metric: statistics.fmean(scores)
        for metric, scores in perMetric.items()
    }
    for modelName, perMetric in allResults.items()
}
print(allAverages)

In [None]:
# Hard-coded averages: running this cell replaces whatever `allAverages` held
# before with these fixed numbers.
# Each row is model -> (bert-precision, bert-recall, bert-f1).
allAverages = {
    name: {'bert-precision': p, 'bert-recall': r, 'bert-f1': f}
    for name, (p, r, f) in {
        'meditron-7b-COT': (0.3047684439058815, 0.21074126485488093, 0.2503270188688624),
        'qwen-32b-basicDS': (0.4318103810810313, 0.12030459529416142, 0.2454918550745565),
        'meditron-7b-fp16': (0.03413265044515838, -0.021827182580469822, -0.003499159099990312),
        'meditron-7b-basicDS': (0.4151772189961404, 0.03516647587435282, 0.1870286071064825),
        'llama31-basicDS': (0.4348166682282273, 0.035279964039824446, 0.1904552257382216),
        'llama31-advDS-COT': (0.40585619758586494, 0.30333154602926604, 0.34611568417476146),
        'llama3.1-8b-instruct-fp16': (0.26908153483206976, 0.25759676277485427, 0.2573016523098459),
        'hengwen-DeepSeek-R1-Distill-Qwen-32B-q4_k_m': (0.1308120781821864, 0.26566630444781647, 0.18692373169338977),
        'qwen-32b-COT-q4_K_M': (0.11906920963119032, 0.3157404516728557, 0.20588080978439172),
    }.items()
}

In [None]:
# Store the DeepSeek distill entry under its shorter display name.
# Guarded so that running this cell a second time is a no-op instead of a KeyError.
if "hengwen-DeepSeek-R1-Distill-Qwen-32B-q4_k_m" in allAverages:
    allAverages["DeepSeek-R1-Distill-Qwen-32B"] = allAverages.pop("hengwen-DeepSeek-R1-Distill-Qwen-32B-q4_k_m")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Grouped bar chart of the average BERTScore metrics for every model.
# The models are listed in display order, three per section, matching the
# section captions drawn on the chart.
models = [
    'llama3.1-8b-instruct-fp16', 'meditron-7b-fp16', 'DeepSeek-R1-Distill-Qwen-32B',
    'llama31-basicDS', 'meditron-7b-basicDS', 'qwen-32b-basicDS',
    'llama31-advDS-COT', 'meditron-7b-COT', 'qwen-32b-COT-q4_K_M',
]

print(models)
precision, recall, f1 = [
    [allAverages[m][metric] for m in models]
    for metric in ('bert-precision', 'bert-recall', 'bert-f1')
]

# One slot per model on the x-axis, three bars side by side in each slot.
x = np.arange(len(models))
width = 0.25

fig, ax = plt.subplots(figsize=(12, 6))
for offset, heights, legendLabel in (
    (-width, precision, 'BERT Precision'),
    (0, recall, 'BERT Recall'),
    (width, f1, 'BERT F1 Score'),
):
    ax.bar(x + offset, heights, width, label=legendLabel)

# Vertical separators between the three sections, plus a caption for each.
additionalY = 0.465
for boundary in (2.5, 5.5):
    ax.axvline(x=boundary, color='0.5')

for captionX, caption in (
    ((-0.5 + 2.5)/3, 'Base Models'),
    ((-1 + 2.5)/3 + 3, 'Basic Dataset Models'),
    ((-1 + 2.5)/3 + 6, 'Advanced Dataset Models'),
):
    ax.text(captionX, additionalY, caption, style='italic',
            bbox={'facecolor': '0.8', 'alpha': 0.5, 'pad': 5})

# Axis labels, title and tick labels.
ax.set(xlabel="Models", ylabel="Score", title="BERTScores for Different Models")
ax.set_xticks(x)
ax.set_xticklabels(models, rotation=45, ha="right")
ax.set_ylim(-0.05, 0.56)

# Simulated y-axis break: the tick drawn at 0.55 is labelled "1", and the band
# between 0.5 and 0.55 is marked by two dashed grey lines.
ax.set_yticks([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.55])
ax.set_yticklabels(["0", "0.1", "0.2", "0.3", "0.4", "0.5", "1"])
ax.spines['top'].set_visible(False)
for level in (0.55, 0.5):
    ax.axhline(y=level, color='0.5', linestyle="--")

ax.legend(loc="upper left")

plt.tight_layout()
plt.savefig("YOUR_HOME_DIR/datasets/evaluation/deepseek_COT_raft/BERTScore_allModels.png")
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Per-entry BERTScore bars for the results currently held in `data`.
# `data` is rebuilt for every model by the scoring loop, so what is plotted here
# is the last model that loop processed.
# The entries are plotted in the order they were scored (no sorting), which keeps
# the tick labels equal to the entry's index in the dataset.

# Each metric is stored as a list per entry; element 0 is used.
# NOTE(review): presumably the list has one value because a single
# reference/prediction pair is scored per entry - confirm.
precision = [d["bert-precision"][0] for d in data]
recall = [d["bert-recall"][0] for d in data]
f1 = [d["bert-f1"][0] for d in data]

# Number of groups
n = len(data)

# X locations for the bars
x = np.arange(n)

# Bar width
width = 0.25

# Create bar chart: recall, precision and F1 side by side for every entry
plt.figure(figsize=(8, 5))
plt.bar(x + width, f1, width, label="F1-score", color='r', alpha=0.7)
plt.bar(x, precision, width, label="Precision", color='b', alpha=0.7)
plt.bar(x - width, recall, width, label="Recall", color='y', alpha=0.7)

# Labels and title
# The label used to claim "Sorted by F1-score" although nothing is sorted.
plt.xlabel("Entries (dataset order)")
plt.ylabel("Score")
plt.title("BERT Precision, Recall, and F1-score")
plt.xticks(x, [str(i) for i in range(n)], rotation=45, ha="right")
plt.legend()

# Show the plot
plt.show()

In [None]:
# Print the first entry of the currently loaded dataset: the prompt, followed by
# the expected and the actual answer with their reasoning sections stripped.
test = ds[0]
print("Input")
print(test["input"])
for field in ("expected_output", "actual_output"):
    print("\n\n")
    print(field)
    print(remove_thoughts_section(test[field]))