In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import numpy as np
import evaluate
from nltk.translate.bleu_score import sentence_bleu
from bert_score import score as bert_score
from collections import Counter

In [4]:
# --- Metric objects fetched through Hugging Face `evaluate` ---
rouge = evaluate.load("rouge")
perplexity_metric = evaluate.load("perplexity", module_type="metric")

# --- Evaluation data: keep only rows that carry a reference answer in "output" ---
df = pd.read_csv("final_combined_with_expected_responses.csv").dropna(subset=["output"])

# --- Causal LM (plus tokenizer) used for perplexity scoring, placed on the GPU ---
perplexity_model_name = "EleutherAI/gpt-neo-1.3B"
perplexity_tokenizer = AutoTokenizer.from_pretrained(perplexity_model_name)
perplexity_model = AutoModelForCausalLM.from_pretrained(perplexity_model_name)
perplexity_model = perplexity_model.to("cuda")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.46k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

In [5]:


# Function to compute Perplexity
def compute_perplexity(text):
    """Return the perplexity of a single text under ``perplexity_model``.

    Perplexity is ``exp`` of the mean next-token cross-entropy that the
    module-level ``perplexity_model`` assigns to ``text`` (tokenized with the
    module-level ``perplexity_tokenizer``).

    Args:
        text: The response to score. Anything that is not a non-blank string
            (e.g. the float NaN pandas uses for a missing CSV cell) is treated
            as unscorable.

    Returns:
        float: The perplexity, or ``nan`` when the text is unscorable (not a
        string, blank, or shorter than two tokens so there is nothing to predict).
    """
    if not isinstance(text, str) or not text.strip():
        return float("nan")

    # A single sequence needs no padding; asking for it fails on tokenizers that
    # define no pad token (GPT-2 style tokenizers usually do not).
    encodings = perplexity_tokenizer(text, return_tensors="pt", truncation=True)
    if encodings["input_ids"].shape[1] < 2:
        return float("nan")

    # Follow the model's device instead of assuming "cuda".
    encodings = encodings.to(perplexity_model.device)
    with torch.no_grad():
        outputs = perplexity_model(**encodings, labels=encodings["input_ids"])
    return torch.exp(outputs.loss).item()  # Perplexity is e^loss

# Compute DISTINCT-1 & DISTINCT-2 (Measures response diversity)
def compute_distinct_ngram(texts, n):
    """Return the Distinct-n score of a collection of responses.

    Distinct-n is the number of unique word n-grams divided by the total number
    of word n-grams, pooled over all texts (whitespace tokenization).

    Args:
        texts: Iterable of responses. Entries that are not strings (for example
            the float NaN that pandas yields for a missing CSV cell) are skipped
            instead of raising ``AttributeError``.
        n: Size of the n-grams.

    Returns:
        The ratio as a float, or ``0`` when no n-gram could be formed.
    """
    ngram_counts = Counter()
    total_ngrams = 0
    for text in texts:
        if not isinstance(text, str):
            continue
        words = text.split()
        # Texts shorter than n words give an empty range and contribute nothing.
        ngrams = [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]
        ngram_counts.update(ngrams)
        total_ngrams += len(ngrams)

    return len(ngram_counts) / total_ngrams if total_ngrams > 0 else 0

# Compute BLEU score
def compute_bleu(reference, hypothesis):
    """Return the sentence-level BLEU of ``hypothesis`` against one ``reference``.

    Both texts are whitespace-tokenized and passed to NLTK's ``sentence_bleu``.

    Args:
        reference: Expected response text.
        hypothesis: Generated response text.

    Returns:
        float: The BLEU score. ``0.0`` is returned without calling NLTK when
        either side is not a string (e.g. NaN from a missing CSV cell) or has
        no tokens, since there is nothing to match in those cases.
    """
    if not isinstance(reference, str) or not isinstance(hypothesis, str):
        return 0.0
    ref_tokens = reference.split()
    hyp_tokens = hypothesis.split()
    if not ref_tokens or not hyp_tokens:
        return 0.0
    return sentence_bleu([ref_tokens], hyp_tokens)

# (response column, value of the CSV "model" column) pairs to evaluate, in report order
model_versions = [
    (column, f"generated_{variant}_responses")
    for column, variant in (
        ("response_with_think", "Few_Shot_DeepMental"),
        ("response_with_think", "Fine_Tune_DeepMental"),
        ("response_with_think", "Fine_Tune_Traditional"),
        ("response_without_think", "Few_Shot_DeepMental"),
        ("response_without_think", "Fine_Tune_DeepMental"),
    )
]

In [None]:


# Initialize evaluation storage
evaluation_results = []

# Run evaluation for each (response column, model) pair
for response_column, model_name in model_versions:
    print(f"Evaluating: {model_name} ({response_column})...")

    # Only the reference column ("output") was checked for missing values when
    # the CSV was loaded. A missing generated response arrives as float NaN and
    # would crash the tokenizer, `.split()` and the metric libraries, so those
    # rows are dropped here.
    model_rows = df["model"] == model_name
    df_model = df[model_rows & df[response_column].notna()]
    if df_model.empty:
        print(f"Skipping {model_name} ({response_column}) - No data found.")
        continue

    predictions = df_model[response_column].astype(str).tolist()
    references = df_model["output"].astype(str).tolist()

    # Compute Perplexity (one value per response)
    perplexities = [compute_perplexity(text) for text in predictions]

    # Compute ROUGE scores
    rouge_scores = rouge.compute(predictions=predictions, references=references)

    # Compute BLEU scores (one value per reference/prediction pair)
    bleu_scores = [compute_bleu(ref, pred) for ref, pred in zip(references, predictions)]

    # Compute BERTScore (Precision, Recall, F1)
    P, R, F1 = bert_score(predictions, references, lang="en", model_type="microsoft/deberta-xlarge-mnli")

    # Compute DISTINCT scores
    distinct_1 = compute_distinct_ngram(predictions, 1)
    distinct_2 = compute_distinct_ngram(predictions, 2)

    # Store results
    evaluation_results.append({
        "Model": model_name,
        "Response Type": response_column,
        # nanmean: one unscorable response (NaN perplexity) must not turn the
        # whole average into NaN.
        "Perplexity (Avg)": np.nanmean(perplexities),
        "ROUGE-1": rouge_scores["rouge1"],
        "ROUGE-2": rouge_scores["rouge2"],
        "ROUGE-L": rouge_scores["rougeL"],
        "BLEU": np.mean(bleu_scores),
        "BERTScore Precision": np.mean(P.tolist()),
        "BERTScore Recall": np.mean(R.tolist()),
        "BERTScore F1": np.mean(F1.tolist()),
        "Distinct-1": distinct_1,
        "Distinct-2": distinct_2
    })

# Convert results to DataFrame
df_results = pd.DataFrame(evaluation_results)

# Save evaluation results
df_results.to_csv("evaluation_results.csv", index=False)

print("✅ Evaluation complete! Results saved in 'evaluation_results.csv'")



In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import numpy as np
import evaluate
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score
from collections import Counter
from tqdm import tqdm  # Progress bar for better tracking

2025-02-27 16:39:50.124879: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740674390.138126    5920 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740674390.142052    5920 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-27 16:39:50.157470: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# ROUGE metric from Hugging Face `evaluate`
rouge = evaluate.load("rouge")

# Responses plus expected outputs; rows without a reference in "output" are removed
df = pd.read_csv("final_combined_with_expected_responses.csv")
df = df.loc[~df["output"].isna()]

# GPT-Neo 1.3B and its tokenizer score perplexity; the model lives on the GPU
perplexity_model_name = "EleutherAI/gpt-neo-1.3B"
perplexity_tokenizer = AutoTokenizer.from_pretrained(perplexity_model_name)
perplexity_model = AutoModelForCausalLM.from_pretrained(perplexity_model_name)
perplexity_model = perplexity_model.to("cuda")

In [3]:
# Function to compute Perplexity (Batch Processing)
def compute_perplexity_batch(texts):
    """Return one perplexity per scorable text in ``texts``.

    The module-level ``perplexity_model`` is run once on the padded batch. The
    next-token cross-entropy is then averaged per sequence over real (non-pad)
    positions only. Asking the model for ``outputs.loss`` instead yields a single
    scalar for the whole batch that also counts pad positions, and that 0-d
    value cannot be consumed by ``list.extend``.

    Args:
        texts: Iterable of responses. Entries that are not non-blank strings
            (e.g. NaN from a missing CSV cell) are skipped.

    Returns:
        numpy.ndarray: 1-D float array with the perplexity of every scorable
        text, in input order. Skipped or too-short texts have no entry, so the
        array may be shorter than ``texts`` (empty if nothing was scorable).
    """
    valid_texts = [t for t in texts if isinstance(t, str) and t.strip()]
    if not valid_texts:
        return np.array([], dtype=float)

    # Padding needs a pad token; GPT-2 style tokenizers usually define none, so
    # reuse EOS. Pad positions are masked out of the loss below.
    if perplexity_tokenizer.pad_token is None:
        perplexity_tokenizer.pad_token = perplexity_tokenizer.eos_token

    encoded_inputs = perplexity_tokenizer(
        valid_texts, return_tensors="pt", padding=True, truncation=True
    ).to(perplexity_model.device)
    input_ids = encoded_inputs["input_ids"]
    attention_mask = encoded_inputs["attention_mask"]
    if input_ids.shape[1] < 2:
        # Nothing to predict when every sequence has a single token.
        return np.array([], dtype=float)

    with torch.no_grad():
        logits = perplexity_model(**encoded_inputs).logits

    # Position t predicts token t+1.
    shift_logits = logits[:, :-1, :].float()
    shift_labels = input_ids[:, 1:]
    # Score a transition only when both context and target are real tokens;
    # this is correct for left- and right-padded batches alike.
    pair_mask = (attention_mask[:, 1:] * attention_mask[:, :-1]).to(shift_logits.dtype)

    token_nll = torch.nn.functional.cross_entropy(
        shift_logits.transpose(1, 2), shift_labels, reduction="none"
    )
    token_counts = pair_mask.sum(dim=1)
    scorable = token_counts > 0
    mean_nll = (token_nll * pair_mask).sum(dim=1)[scorable] / token_counts[scorable]
    return torch.exp(mean_nll).cpu().numpy()

# DISTINCT-n: share of unique n-grams among all n-grams of a set of responses
def compute_distinct_ngram(texts, n):
    """Return unique word n-grams divided by total word n-grams over ``texts``.

    Each text is split on whitespace; texts with fewer than ``n`` words are
    ignored. The result is ``0`` when no n-gram was collected.
    """
    seen = Counter()
    total = 0
    for tokens in (text.split() for text in texts):
        window_count = len(tokens) - n + 1
        if window_count <= 0:
            # Too short to hold even one n-gram.
            continue
        seen.update(tuple(tokens[start:start + n]) for start in range(window_count))
        total += window_count

    if total > 0:
        return len(seen) / total
    return 0

# Sentence-level BLEU with NLTK smoothing (method1)
def compute_bleu(reference, hypothesis):
    """Return smoothed sentence BLEU of ``hypothesis`` against ``reference``.

    Both strings are whitespace-tokenized. A hypothesis without tokens scores
    ``0.0`` and is never passed to NLTK.
    """
    gold = reference.split()
    candidate = hypothesis.split()
    if not candidate:
        return 0.0
    smoother = SmoothingFunction().method1
    return sentence_bleu([gold], candidate, smoothing_function=smoother)

# Pairs of (response column, CSV "model" value) to evaluate: three models with
# the "think" part kept, then two with it removed.
model_versions = list(zip(
    ("response_with_think",) * 3 + ("response_without_think",) * 2,
    (
        "generated_Few_Shot_DeepMental_responses",
        "generated_Fine_Tune_DeepMental_responses",
        "generated_Fine_Tune_Traditional_responses",
        "generated_Few_Shot_DeepMental_responses",
        "generated_Fine_Tune_DeepMental_responses",
    ),
))

In [None]:






# Initialize evaluation storage
evaluation_results = []
batch_size = 32  # Process 32 responses at a time

# Run evaluation for each (response column, model) pair
for response_column, model_name in model_versions:
    print(f"Evaluating: {model_name} ({response_column})...")

    # Only "output" was checked for missing values at load time. A missing
    # generated response is a float NaN that would crash the tokenizer and the
    # metric libraries, so such rows are dropped here.
    model_rows = df["model"] == model_name
    df_model = df[model_rows & df[response_column].notna()]
    if df_model.empty:
        print(f"Skipping {model_name} ({response_column}) - No data found.")
        continue

    predictions = df_model[response_column].astype(str).tolist()
    references = df_model["output"].astype(str).tolist()

    # Compute Perplexity (Batch Processing)
    perplexities = []
    for i in tqdm(range(0, len(predictions), batch_size), desc=f"Perplexity for {model_name}"):
        batch = predictions[i:i+batch_size]
        # atleast_1d accepts both a single batch-level value (0-d) and one value
        # per response; a bare 0-d array cannot be passed to extend().
        perplexities.extend(np.atleast_1d(compute_perplexity_batch(batch)).tolist())

    # Compute ROUGE scores
    rouge_scores = rouge.compute(predictions=predictions, references=references)

    # Compute BLEU scores
    bleu_scores = [compute_bleu(ref, pred) for ref, pred in zip(references, predictions)]

    # Compute BERTScore (Precision, Recall, F1)
    P, R, F1 = bert_score(predictions, references, lang="en", model_type="microsoft/deberta-xlarge-mnli")

    # Compute DISTINCT scores
    distinct_1 = compute_distinct_ngram(predictions, 1)
    distinct_2 = compute_distinct_ngram(predictions, 2)

    # Store results
    evaluation_results.append({
        "Model": model_name,
        "Response Type": response_column,
        "Perplexity (Avg)": np.nanmean(perplexities) if perplexities else float("nan"),
        "ROUGE-1": rouge_scores["rouge1"],
        "ROUGE-2": rouge_scores["rouge2"],
        "ROUGE-L": rouge_scores["rougeL"],
        "BLEU": np.mean(bleu_scores),
        "BERTScore Precision": np.mean(P.tolist()),
        "BERTScore Recall": np.mean(R.tolist()),
        "BERTScore F1": np.mean(F1.tolist()),
        "Distinct-1": distinct_1,
        "Distinct-2": distinct_2
    })

# Convert results to DataFrame
df_results = pd.DataFrame(evaluation_results)

# Save evaluation results
df_results.to_csv("evaluation_results.csv", index=False)

print("✅ Evaluation complete! Results saved in 'evaluation_results.csv'")

# Display results. `ace_tools` is only importable inside the ChatGPT sandbox, so
# fall back to a plain-text table everywhere else instead of failing after all
# metrics have already been computed.
try:
    import ace_tools as tools
    tools.display_dataframe_to_user(name="Evaluation Results", dataframe=df_results)
except ImportError:
    print(df_results.to_string(index=False))


In [1]:
# Import necessary libraries
import torch
from unsloth import FastLanguageModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import numpy as np
import evaluate
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score
from collections import Counter
from tqdm import tqdm  # Progress bar for better tracking



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


2025-02-27 17:06:34.325012: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740675994.338655    7578 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740675994.342544    7578 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-27 17:06:34.357959: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the ROUGE metric through Hugging Face `evaluate`
rouge = evaluate.load("rouge")

# Load the dataset with responses and expected outputs
df = pd.read_csv("final_combined_with_expected_responses.csv")

# Keep only rows that have a reference answer in "output".
# NOTE: the generated-response columns are not checked here, so they may still
# contain missing values.
df = df[df["output"].notna()]

# Settings for the Llama 3.2 1B checkpoint loaded through Unsloth.
# NOTE: `model_name` is rebound later by the `for response_column, model_name in
# model_versions` loops, so after those cells it no longer names this checkpoint.
model_name = "unsloth/Llama-3.2-1B"
max_seq_length = 10000
dtype = None  # presumably lets Unsloth pick the dtype automatically — TODO confirm
load_in_4bit = True  # Efficient quantization

# Load model and tokenizer; these two globals are what the perplexity cell uses
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Enable faster inference with Unsloth.
# As the last expression of the cell, the call's return value is rendered as the
# cell output (the model repr shown below the cell).
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    GPU: NVIDIA A100 80GB PCIe. Max memory: 79.151 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048, padding_idx=128004)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), 

In [6]:
# (response column, CSV "model" value) combinations to evaluate
model_versions = [
    # Responses that keep the "think" section: all three models
    *[
        ("response_with_think", name)
        for name in (
            "generated_Few_Shot_DeepMental_responses",
            "generated_Fine_Tune_DeepMental_responses",
            "generated_Fine_Tune_Traditional_responses",
        )
    ],
    # Responses without the "think" section: the two DeepMental models
    *[
        ("response_without_think", name)
        for name in (
            "generated_Few_Shot_DeepMental_responses",
            "generated_Fine_Tune_DeepMental_responses",
        )
    ],
]

In [7]:
# --- Perplexity Computation ---
def compute_perplexity_batch(texts):
    """Return one perplexity per scorable text in ``texts``.

    The module-level ``model`` is run once on the padded batch (tokenized with
    the module-level ``tokenizer``). The next-token cross-entropy is averaged
    per sequence over real (non-pad) positions only. Using ``outputs.loss``
    instead gives one scalar for the whole batch that also counts pad
    positions, so every text would receive the same batch-level value.

    Args:
        texts: List of responses. Entries that are not non-blank strings (e.g.
            NaN from a missing CSV cell) are skipped rather than scored as "".

    Returns:
        list[float]: Perplexity of every scorable text, in input order. Skipped
        or too-short texts have no entry, so the list may be shorter than
        ``texts``; it is empty when ``texts`` is not a non-empty list.
    """
    if not isinstance(texts, list) or len(texts) == 0:
        return []
    valid_texts = [text for text in texts if isinstance(text, str) and text.strip()]
    if not valid_texts:
        return []

    # Padding requires a pad token; fall back to EOS if the tokenizer has none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    encoded_inputs = tokenizer(valid_texts, return_tensors="pt", padding=True, truncation=True).to("cuda")
    input_ids = encoded_inputs["input_ids"]
    attention_mask = encoded_inputs["attention_mask"]
    if input_ids.shape[1] < 2:
        # Single-token sequences leave nothing to predict.
        return []

    with torch.no_grad():
        logits = model(**encoded_inputs).logits

    # Position t predicts token t+1.
    shift_logits = logits[:, :-1, :].float()
    shift_labels = input_ids[:, 1:]
    # Score a transition only when both context and target are real tokens;
    # this holds for left- and right-padded batches alike.
    pair_mask = (attention_mask[:, 1:] * attention_mask[:, :-1]).to(shift_logits.dtype)

    token_nll = torch.nn.functional.cross_entropy(
        shift_logits.transpose(1, 2), shift_labels, reduction="none"
    )
    token_counts = pair_mask.sum(dim=1)
    mean_nll = (token_nll * pair_mask).sum(dim=1) / token_counts.clamp(min=1)
    perplexities = torch.exp(mean_nll).cpu().tolist()
    counts = token_counts.cpu().tolist()
    return [ppl for ppl, count in zip(perplexities, counts) if count > 0]

# Average perplexity for every (response column, model) pair, saved as CSV
perplexity_results = []
for response_column, model_name in model_versions:
    df_model = df.loc[df["model"] == model_name]
    predictions = df_model[response_column].tolist()
    batch_size = 32

    # Score the responses in slices of `batch_size`
    perplexities = []
    batch_starts = range(0, len(predictions), batch_size)
    for start in tqdm(batch_starts, desc=f"Perplexity for {model_name}"):
        perplexities.extend(compute_perplexity_batch(predictions[start:start + batch_size]))

    summary_row = {
        "Model": model_name,
        "Response Type": response_column,
        "Perplexity (Avg)": np.mean(perplexities),
    }
    perplexity_results.append(summary_row)

df_perplexity = pd.DataFrame(perplexity_results)
df_perplexity.to_csv("perplexity_results.csv", index=False)

Perplexity for generated_Few_Shot_DeepMental_responses: 100%|██████████| 36/36 [00:08<00:00,  4.47it/s]
Perplexity for generated_Fine_Tune_DeepMental_responses: 100%|██████████| 36/36 [00:12<00:00,  2.98it/s]
Perplexity for generated_Fine_Tune_Traditional_responses: 100%|██████████| 36/36 [00:07<00:00,  5.10it/s]
Perplexity for generated_Few_Shot_DeepMental_responses: 100%|██████████| 36/36 [00:07<00:00,  5.04it/s]
Perplexity for generated_Fine_Tune_DeepMental_responses: 100%|██████████| 36/36 [00:12<00:00,  2.79it/s]


In [None]:


# Perplexity per (response column, model) pair; models without rows are skipped
perplexity_results = []
for response_column, model_name in model_versions:
    df_model = df.loc[df["model"] == model_name]

    # Nothing to score for this model: say so and move on.
    if df_model.empty:
        print(f"Skipping {model_name} - No data found.")
        continue

    predictions = df_model[response_column].tolist()
    batch_size = 32

    perplexities = []
    for start in tqdm(range(0, len(predictions), batch_size), desc=f"Perplexity for {model_name}"):
        perplexities += list(compute_perplexity_batch(predictions[start:start + batch_size]))

    # An empty score list is reported as infinity instead of averaging nothing.
    if perplexities:
        avg_perplexity = np.mean(perplexities)
    else:
        avg_perplexity = float("inf")

    perplexity_results.append({
        "Model": model_name,
        "Response Type": response_column,
        "Perplexity (Avg)": avg_perplexity,
    })

# Persist the summary table
df_perplexity = pd.DataFrame(perplexity_results)
df_perplexity.to_csv("perplexity_results.csv", index=False)

print("Perplexity computation completed and saved.")


In [8]:
# --- ROUGE Computation ---
rouge_results = []
for response_column, model_name in model_versions:
    df_model = df[df["model"] == model_name]
    # A missing generation is read from the CSV as float NaN, which ROUGE cannot
    # tokenize. Score it as an empty string, the same coercion the BLEU and
    # BERTScore cells of this notebook apply.
    predictions = [p if isinstance(p, str) else "" for p in df_model[response_column].tolist()]
    references = [r if isinstance(r, str) else "" for r in df_model["output"].tolist()]
    rouge_scores = rouge.compute(predictions=predictions, references=references)
    rouge_results.append({
        "Model": model_name, "Response Type": response_column,
        "ROUGE-1": rouge_scores["rouge1"], "ROUGE-2": rouge_scores["rouge2"], "ROUGE-L": rouge_scores["rougeL"]
    })

# One row per (model, response type); saved for the final merge
df_rouge = pd.DataFrame(rouge_results)
df_rouge.to_csv("rouge_results.csv", index=False)

In [9]:

# --- BLEU Computation ---
def compute_bleu(reference, hypothesis):
    """Return smoothed sentence BLEU (NLTK method1) of ``hypothesis`` vs ``reference``.

    Non-string inputs count as empty text. A hypothesis without tokens scores
    ``0.0`` and is never passed to NLTK.
    """
    ref_tokens = reference.split() if isinstance(reference, str) else []
    hyp_tokens = hypothesis.split() if isinstance(hypothesis, str) else []
    if not hyp_tokens:
        return 0.0
    smoother = SmoothingFunction().method1
    return sentence_bleu([ref_tokens], hyp_tokens, smoothing_function=smoother)

# Mean sentence-level BLEU for every (response column, model) pair
bleu_results = []
for response_column, model_name in model_versions:
    df_model = df.loc[df["model"] == model_name]
    predictions = df_model[response_column].tolist()
    references = df_model["output"].tolist()
    # compute_bleu takes (reference, hypothesis), hence references first
    bleu_scores = list(map(compute_bleu, references, predictions))
    bleu_row = {"Model": model_name, "Response Type": response_column}
    bleu_row["BLEU"] = np.mean(bleu_scores)
    bleu_results.append(bleu_row)

df_bleu = pd.DataFrame(bleu_results)
df_bleu.to_csv("bleu_results.csv", index=False)

In [10]:

# --- BERTScore Computation ---
# Mean precision / recall / F1 for every (response column, model) pair.
bert_results = []
for response_column, model_name in model_versions:
    df_model = df.loc[df["model"] == model_name]
    # Anything that is not a string (e.g. a missing value) is scored as ""
    predictions = [p if isinstance(p, str) else "" for p in df_model[response_column].tolist()]
    references = [r if isinstance(r, str) else "" for r in df_model["output"].tolist()]

    P, R, F1 = bert_score(predictions, references, lang="en", model_type="microsoft/deberta-xlarge-mnli")

    bert_row = {"Model": model_name, "Response Type": response_column}
    for label, values in (("Precision", P), ("Recall", R), ("F1", F1)):
        bert_row[f"BERTScore {label}"] = np.mean(values.tolist())
    bert_results.append(bert_row)

df_bert = pd.DataFrame(bert_results)
df_bert.to_csv("bert_results.csv", index=False)



In [11]:
# --- DISTINCT Computation ---
def compute_distinct_ngram(texts, n):
    """Return the Distinct-n score of a collection of responses.

    Distinct-n is the number of unique word n-grams divided by the total number
    of word n-grams, pooled over all texts (whitespace tokenization).

    Args:
        texts: Iterable of responses. ``None`` and NaN entries (missing CSV
            cells) are skipped; stringifying them would add the literal words
            "None" / "nan" to the n-gram counts. Other non-string values are
            still converted with ``str()``.
        n: Size of the n-grams.

    Returns:
        The ratio as a float, or ``0`` when no n-gram could be formed.
    """
    ngram_counts = Counter()
    total_ngrams = 0
    for text in texts:
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            continue
        words = str(text).split()
        if len(words) < n:
            continue
        ngrams = [tuple(words[i:i+n]) for i in range(len(words)-n+1)]
        ngram_counts.update(ngrams)
        total_ngrams += len(ngrams)
    return len(ngram_counts) / total_ngrams if total_ngrams > 0 else 0

# Distinct-1 and Distinct-2 for every (response column, model) pair
distinct_results = []
for response_column, model_name in model_versions:
    df_model = df.loc[df["model"] == model_name]
    predictions = df_model[response_column].tolist()
    distinct_1, distinct_2 = (compute_distinct_ngram(predictions, size) for size in (1, 2))
    distinct_results.append({
        "Model": model_name,
        "Response Type": response_column,
        "Distinct-1": distinct_1,
        "Distinct-2": distinct_2,
    })

df_distinct = pd.DataFrame(distinct_results)
df_distinct.to_csv("distinct_results.csv", index=False)

In [12]:







# --- Combine all results ---
# Join the per-metric tables one after another on (Model, Response Type); the
# default inner join keeps only pairs present in every table.
merge_keys = ["Model", "Response Type"]
df_final = df_perplexity
for metric_table in (df_rouge, df_bleu, df_bert, df_distinct):
    df_final = df_final.merge(metric_table, on=merge_keys)
df_final.to_csv("final_evaluation_results.csv", index=False)


In [None]:
# Import necessary libraries
import torch
import pandas as pd
import numpy as np
import evaluate
from tqdm import tqdm  # Progress bar for better tracking
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score
from torchmetrics.text import Perplexity


# Metric objects: ROUGE and BLEU from Hugging Face `evaluate`, Perplexity from torchmetrics
rouge, bleu = evaluate.load("rouge"), evaluate.load("bleu")

perplexity_metric = Perplexity()

# Dataset of generated responses; rows lacking a reference in "output" are dropped
df = pd.read_csv("final_combined_with_expected_responses.csv").dropna(subset=["output"])

# (response column, CSV "model" value) pairs to evaluate
model_versions = [
    (column, f"generated_{variant}_responses")
    for column, variant in (
        ("response_with_think", "Few_Shot_DeepMental"),
        ("response_with_think", "Fine_Tune_DeepMental"),
        ("response_with_think", "Fine_Tune_Traditional"),
        ("response_without_think", "Few_Shot_DeepMental"),
        ("response_without_think", "Fine_Tune_DeepMental"),
    )
]

# Step 1: Compute Perplexity using TorchMetrics
# torchmetrics' Perplexity consumes token logits plus target token ids, not raw
# strings, so a causal language model has to score every response first. The
# scoring model is the one used in the earlier iterations of this notebook.
from transformers import AutoModelForCausalLM, AutoTokenizer

perplexity_model_name = "EleutherAI/gpt-neo-1.3B"
perplexity_device = "cuda" if torch.cuda.is_available() else "cpu"
perplexity_tokenizer = AutoTokenizer.from_pretrained(perplexity_model_name)
if perplexity_tokenizer.pad_token is None:
    # Batched tokenization needs a pad token; pad positions are masked below.
    perplexity_tokenizer.pad_token = perplexity_tokenizer.eos_token
perplexity_model = AutoModelForCausalLM.from_pretrained(perplexity_model_name).to(perplexity_device)
perplexity_model.eval()

# Targets set to -100 (padding) are excluded from the metric.
perplexity_metric = Perplexity(ignore_index=-100).to(perplexity_device)

perplexity_results = []
batch_size = 32

for response_column, model_name in model_versions:
    print(f"Computing Perplexity: {model_name} ({response_column})...")

    df_model = df[df["model"] == model_name]
    # Missing generations (NaN) and blank strings cannot be scored.
    predictions = [p for p in df_model[response_column].tolist() if isinstance(p, str) and p.strip()]

    perplexity_metric.reset()
    scored_tokens = 0
    for i in tqdm(range(0, len(predictions), batch_size), desc=f"Perplexity for {model_name}"):
        batch = predictions[i:i+batch_size]
        encoded = perplexity_tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(perplexity_device)
        with torch.no_grad():
            logits = perplexity_model(**encoded).logits

        # Position t predicts token t+1; keep a transition only when both the
        # context token and the target token are real (non-pad) tokens.
        attention_mask = encoded["attention_mask"]
        keep = (attention_mask[:, 1:] * attention_mask[:, :-1]).bool()
        targets = encoded["input_ids"][:, 1:].masked_fill(~keep, -100)
        kept = int(keep.sum())
        if kept == 0:
            continue
        perplexity_metric.update(logits[:, :-1, :].float(), targets)
        scored_tokens += kept

    # The metric accumulates over all batches, so this is a token-weighted
    # perplexity of the model's whole response set (NaN if nothing was scored).
    avg_perplexity = perplexity_metric.compute().item() if scored_tokens else float("nan")
    perplexity_results.append({"Model": model_name, "Response Type": response_column, "Perplexity (Avg)": avg_perplexity})

df_perplexity = pd.DataFrame(perplexity_results)
df_perplexity.to_csv("evaluation_perplexity.csv", index=False)
print("✅ Perplexity results saved in 'evaluation_perplexity.csv'")

# Step 2: ROUGE-1 / ROUGE-2 / ROUGE-L via the Hugging Face `evaluate` metric
rouge_results = []
for response_column, model_name in model_versions:
    print(f"Computing ROUGE: {model_name} ({response_column})...")

    df_model = df.loc[df["model"] == model_name]
    predictions, references = (df_model[col].tolist() for col in (response_column, "output"))

    rouge_scores = rouge.compute(predictions=predictions, references=references)

    # Copy the three reported variants into a flat result row
    rouge_row = {"Model": model_name, "Response Type": response_column}
    for label, key in (("ROUGE-1", "rouge1"), ("ROUGE-2", "rouge2"), ("ROUGE-L", "rougeL")):
        rouge_row[label] = rouge_scores[key]
    rouge_results.append(rouge_row)

df_rouge = pd.DataFrame(rouge_results)
df_rouge.to_csv("evaluation_rouge.csv", index=False)
print("✅ ROUGE results saved in 'evaluation_rouge.csv'")

# Step 3: Compute BLEU using Hugging Face `evaluate`
bleu_results = []
for response_column, model_name in model_versions:
    print(f"Computing BLEU: {model_name} ({response_column})...")

    df_model = df[df["model"] == model_name]
    # A missing generation is a float NaN in the frame; score it as empty text
    # so the metric only ever receives strings.
    predictions = df_model[response_column].fillna("").astype(str).tolist()
    references = df_model["output"].fillna("").astype(str).tolist()

    if not predictions:
        # No rows for this model: there is nothing to score.
        bleu_value = float("nan")
    elif not any(p.strip() for p in predictions):
        # Only blank predictions: no n-gram can match, and the corpus-level
        # length ratio inside the metric would be zero.
        bleu_value = 0.0
    else:
        bleu_scores = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
        bleu_value = bleu_scores["bleu"]

    bleu_results.append({"Model": model_name, "Response Type": response_column, "BLEU": bleu_value})

df_bleu = pd.DataFrame(bleu_results)
df_bleu.to_csv("evaluation_bleu.csv", index=False)
print("✅ BLEU results saved in 'evaluation_bleu.csv'")

# Step 4: BERTScore precision / recall / F1 via the `bert_score` package
bert_results = []
for response_column, model_name in model_versions:
    print(f"Computing BERTScore: {model_name} ({response_column})...")

    df_model = df.loc[df["model"] == model_name]
    # Both columns are cast to str before scoring
    predictions, references = (
        df_model[col].astype(str).tolist() for col in (response_column, "output")
    )

    P, R, F1 = bert_score(predictions, references, lang="en", model_type="microsoft/deberta-xlarge-mnli")

    bert_row = {"Model": model_name, "Response Type": response_column}
    for label, values in (("Precision", P), ("Recall", R), ("F1", F1)):
        bert_row[f"BERTScore {label}"] = np.mean(values.tolist())
    bert_results.append(bert_row)

df_bert = pd.DataFrame(bert_results)
df_bert.to_csv("evaluation_bert.csv", index=False)
print("✅ BERTScore results saved in 'evaluation_bert.csv'")

# Step 5: Compute DISTINCT-1 / DISTINCT-2
# No `distinct` metric object is created anywhere in this cell, so the ratio is
# computed directly: unique word n-grams / total word n-grams over all responses.
def _distinct_n(texts, n):
    """Return unique/total word n-grams over whitespace-tokenized ``texts`` (0 if none)."""
    ngrams = [
        tuple(words[i:i + n])
        for words in (text.split() for text in texts)
        for i in range(len(words) - n + 1)
    ]
    return len(set(ngrams)) / len(ngrams) if ngrams else 0


distinct_results = []
for response_column, model_name in model_versions:
    print(f"Computing DISTINCT: {model_name} ({response_column})...")

    df_model = df[df["model"] == model_name]
    # Drop missing generations first; casting NaN to str would add the literal
    # word "nan" to the n-gram counts.
    predictions = df_model[response_column].dropna().astype(str).tolist()

    distinct_results.append({
        "Model": model_name,
        "Response Type": response_column,
        "Distinct-1": _distinct_n(predictions, 1),
        "Distinct-2": _distinct_n(predictions, 2),
    })

df_distinct = pd.DataFrame(distinct_results)
df_distinct.to_csv("evaluation_distinct.csv", index=False)
print("✅ DISTINCT results saved in 'evaluation_distinct.csv'")

# Step 6: Combine all results
# Left joins keep every (Model, Response Type) row of the perplexity table even
# if another metric table has no matching row.
df_final = df_perplexity.merge(df_rouge, on=["Model", "Response Type"], how="left")
df_final = df_final.merge(df_bleu, on=["Model", "Response Type"], how="left")
df_final = df_final.merge(df_bert, on=["Model", "Response Type"], how="left")
df_final = df_final.merge(df_distinct, on=["Model", "Response Type"], how="left")

df_final.to_csv("evaluation_results.csv", index=False)
print("✅ Final evaluation results saved in 'evaluation_results.csv'")

# Display final results. `tools` was never imported in this cell, and
# `ace_tools` only exists inside the ChatGPT sandbox, so import it here and fall
# back to a plain-text table when it is unavailable.
try:
    import ace_tools as tools
    tools.display_dataframe_to_user(name="Final Evaluation Results", dataframe=df_final)
except ImportError:
    print(df_final.to_string(index=False))
