<a href="https://colab.research.google.com/github/ikramMc/PFE/blob/main/model_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install evaluate
!pip install rouge_score
!pip install  sentence-transformers  nltk bert-score

In [None]:
%%capture
!pip install pip3-autoremove
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu124
!pip install unsloth
# !pip install --upgrade transformers==4.52.3

# **Evaluation on test split using NLG and semantic metrics**

In [None]:
# Read the Hugging Face access token stored under the key 'HF_TOKEN' via
# google.colab.userdata; it is passed as `token=` when the models are loaded
# in the evaluation cells further down.
from google.colab import userdata
HF_token=userdata.get('HF_TOKEN')

In [None]:
import pandas as pd
from unsloth import FastLanguageModel
import random
import torch
import gc
import ast
import re
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import single_meteor_score
from bert_score import score as bert_score
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer, util
from rouge_score import rouge_scorer
from unsloth.chat_templates import get_chat_template

# Download the NLTK data packages this notebook relies on, one after another
for nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Read the test split; the "conversations" column is parsed in the next step
data = pd.read_csv("test_with_sub_conversations.csv")

def parse_conversation(conv_str):
    """Parse a serialized conversation into a list of messages.

    The stored string is expected to be a Python-literal sequence of dicts.
    Dicts that are only separated by whitespace (``{...} {...}``) are repaired
    by inserting the missing comma before parsing.

    Args:
        conv_str: The raw cell value. Anything that is not a ``str`` (for
            example the float NaN pandas uses for an empty cell) is treated
            as "no conversation".

    Returns:
        A list with the parsed messages, or ``[]`` when the value is missing,
        cannot be parsed, or does not parse to a sequence.
    """
    if not isinstance(conv_str, str):
        # re.sub would raise TypeError on NaN/None before reaching the try.
        return []

    # NOTE: this also rewrites a "} {" that happens to occur inside message
    # text; kept because the stored format needs the separator repair.
    fixed_str = re.sub(r'\}\s*\{', '}, {', conv_str)
    try:
        parsed = ast.literal_eval(fixed_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # The documented failure modes of ast.literal_eval on malformed input.
        return []

    # "{...}, {...}" without surrounding brackets evaluates to a tuple.
    if isinstance(parsed, tuple):
        parsed = list(parsed)

    # A bare dict (or scalar) must not be returned: len() of a two-key dict is
    # 2, which the caller would mistake for a single-turn conversation.
    return parsed if isinstance(parsed, list) else []

# Parse every serialized conversation; unparseable rows become [] and are
# dropped by the length filters below.
data["parsed_conversations"] = data["conversations"].apply(parse_conversation)

# Multi-turn: at least two user/assistant exchanges; single-turn: exactly one.
multi_turn_convs = data["parsed_conversations"].apply(lambda x: len(x) >= 4)
single_turn_convs = data["parsed_conversations"].apply(lambda x: len(x) == 2)

multi_turn_data = data[multi_turn_convs]["parsed_conversations"].tolist()
single_turn_data = data[single_turn_convs]["parsed_conversations"].tolist()

# Draw the evaluation subsets from a dedicated, seeded generator so every run
# (and every model) is scored on the same conversations, matching the fixed
# seed used for the MMLU sample. A private Random instance leaves the global
# random state untouched.
sample_rng = random.Random(42)
multi_turn_sample = sample_rng.sample(multi_turn_data, min(1000, len(multi_turn_data)))
single_turn_sample = sample_rng.sample(single_turn_data, min(1000, len(single_turn_data)))

print(f"Multi-turn conversations: {len(multi_turn_data)}")
print(f"Single-turn conversations: {len(single_turn_data)}")

# Models to evaluate: display name -> model identifier. Each entry is passed
# to evaluate(), which hands the identifier to FastLanguageModel.from_pretrained.
models = {
    "qwen base": "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
    # Add other models as needed
}

# Initialize evaluation tools
# Sentence-embedding model used by evaluate() for the prediction/reference
# cosine similarity.
embedder = SentenceTransformer("intfloat/e5-large-v2")
# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled.
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
# Accumulates one result dict per evaluate() call; converted to a DataFrame
# once all evaluations have run.
results = []

def calculate_meteor(predictions, references):
    """Return the mean METEOR score over paired predictions and references.

    Both texts are lower-cased and word-tokenized before scoring. A pair whose
    scoring raises is reported on stdout and counted as 0.0. With no pairs at
    all the result is 0.0.
    """
    per_pair = []
    for hypothesis, target in zip(predictions, references):
        try:
            # Tokenize prediction first, then reference
            hypothesis_tokens = nltk.word_tokenize(hypothesis.lower())
            target_tokens = nltk.word_tokenize(target.lower())
            # single_meteor_score takes the reference tokens first
            value = single_meteor_score(target_tokens, hypothesis_tokens)
        except Exception as e:
            print(f"Error calculating METEOR: {e}")
            value = 0.0
        per_pair.append(value)

    if not per_pair:
        return 0.0
    return sum(per_pair) / len(per_pair)

def evaluate(model_name, model_path, conversations, mode="multi"):
    """Generate a reply for every user turn and score it against the reference.

    Loads the model at ``model_path``, replays each conversation turn by turn
    (the history up to and including a user message is the prompt, the
    assistant message that follows is the reference) and averages ROUGE-1/2/L,
    embedding cosine similarity, BLEU, BERTScore and METEOR over all pairs.

    Args:
        model_name: Label stored in the result row and used in log output.
        model_path: Identifier passed to ``FastLanguageModel.from_pretrained``;
            it is also searched for training hyper-parameters encoded in the
            name (rank, alpha, batch, ...).
        conversations: Iterable of conversations, each a sequence of
            ``{"role": ..., "content": ...}`` dicts.
        mode: Label stored in the result row (the notebook uses "multi" and
            "single").

    Returns:
        None. One result dict is appended to the module-level ``results`` list.

    Note:
        Generation uses sampling (``do_sample=True``, temperature 0.7), so the
        scores differ from run to run. A turn whose generation fails is scored
        with an empty prediction. When no valid user/assistant pair exists,
        every metric is reported as 0.0 and ``total_samples`` is 0.
    """
    print(f"üîç Evaluating {model_name} on {mode}-turn")

    # Load model and tokenizer
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=5000,
        dtype=torch.float16,
        load_in_4bit=True,
        token=HF_token
    )

    # Apply chat template
    tokenizer = get_chat_template(
        tokenizer,
        chat_template="qwen-2.5",
    )

    FastLanguageModel.for_inference(model)

    predictions, references = [], []

    for convo in conversations:
        for i in range(0, len(convo), 2):
            # Only score well-formed (user, assistant) pairs.
            if i + 1 >= len(convo) or convo[i]["role"] != "user" or convo[i + 1]["role"] != "assistant":
                continue

            # Prepare conversation history up to current user message
            history = convo[:i + 1]
            prompt = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
            reference = convo[i + 1]["content"]

            # Bind both names before the try: if tokenization or generation
            # raises, the cleanup below must still find them. Without this the
            # `del` raised UnboundLocalError and aborted the whole evaluation
            # instead of falling back to an empty prediction.
            inputs = outputs = None
            prediction = ""

            try:
                with torch.no_grad():
                    # Tokenize input
                    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
                    input_length = inputs['input_ids'].shape[1]

                    print(f"Prompt: {prompt[:200]}...")  # Show first 200 chars
                    print("=" * 50)

                    # Generate response
                    outputs = model.generate(
                        **inputs,
                        max_new_tokens=200,
                        do_sample=True,
                        temperature=0.7,
                        top_p=0.9,
                        pad_token_id=tokenizer.eos_token_id
                    )

                    # Extract only the generated part (new tokens)
                    decoded_response = tokenizer.decode(outputs[0, input_length:], skip_special_tokens=True)
                    prediction = decoded_response.strip()

                    print(f"Generated: {prediction}")
                    print(f"Reference: {reference}")
                    print("-" * 50)

            except Exception as e:
                # Best effort: keep going and score this turn as empty.
                print(f"Error during generation: {e}")
                prediction = ""

            predictions.append(prediction)
            references.append(reference)

            # Clean up GPU memory
            del inputs, outputs
            gc.collect()
            torch.cuda.empty_cache()

    has_samples = bool(predictions)
    if not has_samples:
        print(f"No valid user/assistant pairs for {model_name} ({mode}-turn); metrics default to 0.0")

    # Calculate ROUGE scores
    rouge_results = {"rouge1": [], "rouge2": [], "rougeL": []}
    for pred, ref in zip(predictions, references):
        scores = scorer.score(ref, pred)
        for key in rouge_results:
            rouge_results[key].append(scores[key])

    # Average ROUGE scores (precision, recall, f-measure); 0.0 when there is
    # nothing to average instead of dividing by zero.
    avg_rouge_scores = {}
    for key, key_scores in rouge_results.items():
        count = len(key_scores)
        for field in ("precision", "recall", "fmeasure"):
            total = sum(getattr(s, field) for s in key_scores)
            avg_rouge_scores[f"{key}_{field}"] = total / count if count else 0.0

    # Calculate embedding similarity (diagonal = each prediction vs. its own reference)
    if has_samples:
        embeddings_preds = embedder.encode(predictions, convert_to_tensor=True)
        embeddings_refs = embedder.encode(references, convert_to_tensor=True)
        similarities = util.cos_sim(embeddings_preds, embeddings_refs).diagonal()
        avg_similarity = similarities.mean().item()
    else:
        avg_similarity = 0.0

    # Calculate BLEU scores
    smoothing = SmoothingFunction().method1
    bleu_scores = []
    for pred, ref in zip(predictions, references):
        try:
            bleu_score = sentence_bleu([nltk.word_tokenize(ref)], nltk.word_tokenize(pred), smoothing_function=smoothing)
            bleu_scores.append(bleu_score)
        except Exception as e:
            print(f"Error calculating BLEU: {e}")
            bleu_scores.append(0.0)

    avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0

    # Calculate BERTScore (Precision, Recall, F1)
    avg_bertscore_precision = avg_bertscore_recall = avg_bertscore_f1 = 0.0
    if has_samples:
        try:
            P, R, F1 = bert_score(predictions, references, lang="en", verbose=False)
            avg_bertscore_precision = P.mean().item()
            avg_bertscore_recall = R.mean().item()
            avg_bertscore_f1 = F1.mean().item()
        except Exception as e:
            print(f"Error calculating BERTScore: {e}")
            avg_bertscore_precision = avg_bertscore_recall = avg_bertscore_f1 = 0.0

    # Calculate METEOR score
    avg_meteor = calculate_meteor(predictions, references)

    # Parse model metadata (if available). The pattern follows the naming used
    # for the fine-tuned checkpoints, spelling included, so it must stay as is.
    metadata = re.findall(r'r(\d+)_alpah(\d+)_batch(\d+)_gradient(\d+)_Ler([\de\.-]+)(_cste)?_?fulldataset(?:_ctfman)?_(\d+)?', model_path)
    if metadata:
        rank, alpha, batch, gradient, lr, constant, epoch = metadata[0]
        scheduler = "constant" if constant else "cosine"
        dataset = "ctfman" if "ctfman" in model_path else "fulldataset"
    else:
        rank = alpha = batch = gradient = lr = epoch = ""
        scheduler = dataset = ""

    # Store results
    result_entry = {
        "model_name": model_name,
        "mode": mode,
        "rank": rank,
        "alpha": alpha,
        "gradient": gradient,
        "batch": batch,
        "learning_rate": lr,
        "scheduler": scheduler,
        "dataset": dataset,
        "epoch": epoch,
        **avg_rouge_scores,
        "embedding_similarity": avg_similarity,
        "bleu": avg_bleu,
        "bertscore_precision": avg_bertscore_precision,
        "bertscore_recall": avg_bertscore_recall,
        "bertscore_f1": avg_bertscore_f1,
        "meteor": avg_meteor,
        "sample_prediction": predictions[0] if predictions else "",
        "sample_reference": references[0] if references else "",
        "total_samples": len(predictions)
    }

    results.append(result_entry)

    # Clean up model from memory
    del model, tokenizer
    gc.collect()
    torch.cuda.empty_cache()

# Run evaluation: every configured model is scored on both samples; each call
# appends one row to `results`.
for name, path in models.items():
    evaluate(name, path, multi_turn_sample, mode="multi")
    evaluate(name, path, single_turn_sample, mode="single")

# Save results (built and written once; the notebook previously repeated this
# step after the summary print with identical content)
final_df = pd.DataFrame(results)
final_df.to_csv("model_evaluation_results.csv", index=False)

print("Evaluation completed!")
print("Results saved to model_evaluation_results.csv")
print("\nSample results:")
print(final_df[['model_name', 'mode', 'rouge1_fmeasure', 'bleu', 'bertscore_f1', 'meteor', 'embedding_similarity']].head())

# **MMLU Evaluation**

In [None]:
import pandas as pd
import torch
import gc
import random
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset, concatenate_datasets
from tqdm import tqdm  # ← Import tqdm for progress bar

# === Load all MMLU subcategories ===
# NOTE(review): trust_remote_code=True presumably lets code shipped with the
# dataset repository run locally - confirm it is still required.
mmlu_dataset = load_dataset("cais/mmlu", "all", trust_remote_code=True)

# Concatenate all test sets
# NOTE(review): only one dataset (the filtered 'test' split) is passed to
# concatenate_datasets, so the concatenation looks redundant - confirm before
# simplifying, since the sampled rows below depend on this dataset.
all_subcategories = mmlu_dataset['test']
full_test_dataset = concatenate_datasets([
    # Keep only rows that actually carry answer choices.
    all_subcategories.filter(lambda x: x["choices"] is not None)
])

# Sample 1000 examples randomly
# The shuffle seed is fixed, and select(range(1000)) takes the first 1000 rows
# of the shuffled data; this needs at least 1000 rows after filtering.
sampled_dataset = full_test_dataset.shuffle(seed=42).select(range(1000))

# === Model setup ===
# Display name -> model identifier. This rebinds the `models` name used by the
# earlier evaluation cell.
models = {
    "mistral base": "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "mistral fine-tuned": "kimxxxx/mistral_r64_a128_g8_gas8_lr9e-5_4500tk_droplast_3epoch"
}

# === Format MMLU prompt ===
def format_mmlu_prompt(example):
    """Render one MMLU example as a four-option multiple-choice prompt.

    Reads ``example['question']`` and the first four entries of
    ``example['choices']`` and labels them A-D. The prompt ends with
    ``Answer:`` and has no trailing newline.
    """
    prompt_lines = [f"Question: {example['question']}"]
    options = example['choices']
    # Index explicitly so exactly four options are required and any extras are ignored.
    prompt_lines.extend(
        f"{label}. {options[position]}" for position, label in enumerate("ABCD")
    )
    prompt_lines.append("Answer:")
    return "\n".join(prompt_lines)

# === Evaluate function ===
def evaluate_on_mmlu(model_name, model_path, dataset):
    """Score a model on MMLU-style multiple-choice questions.

    For every example the prompt from ``format_mmlu_prompt`` is fed to the
    model, up to five new tokens are generated, and the first standalone
    letter A-D in the generated text is compared with the gold answer.

    Args:
        model_name: Label used in progress and log output.
        model_path: Identifier passed to ``FastLanguageModel.from_pretrained``.
        dataset: Sized iterable of examples with ``question``, ``choices``,
            ``answer`` (index 0-3 into A-D) and ``subject`` fields.

    Returns:
        pandas.DataFrame with one row per example: index, subject, question,
        extracted prediction letter ("" when none was found, "ERROR" when the
        example raised), gold letter, correctness flag and the upper-cased
        generated text (or the error message).
    """
    import re  # local import: this cell does not import `re` itself

    print(f"üß™ Evaluating {model_name} on MMLU (1000 samples)...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=2048,
        dtype=torch.float16,
        load_in_4bit=True,
        token=HF_token
    )
    # NOTE(review): the "qwen-2.5" template is applied even though this cell's
    # `models` are Mistral checkpoints; the prompt below is tokenized as plain
    # text, so the template is not used for it - confirm whether this call is
    # still wanted.
    tokenizer = get_chat_template(tokenizer, chat_template="qwen-2.5")
    FastLanguageModel.for_inference(model)

    answer_letters = ["A", "B", "C", "D"]
    # A choice letter standing on its own ("B", "B.", "(C)"), not a letter
    # inside a word such as the "A" in "ANSWER".
    standalone_letter = re.compile(r"\b([ABCD])\b")

    correct = 0
    results = []

    for idx, example in enumerate(tqdm(dataset, desc=f"Evaluating {model_name}", ncols=100)):
        prompt = format_mmlu_prompt(example)
        correct_answer = example["answer"]

        try:
            with torch.no_grad():
                inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
                input_length = inputs["input_ids"].shape[1]
                outputs = model.generate(**inputs, max_new_tokens=5)
                # Decode only the newly generated tokens. Decoding the whole
                # sequence would include the prompt, and the answer letter
                # would then be picked out of the question text instead of
                # the model's reply.
                prediction = tokenizer.decode(outputs[0, input_length:], skip_special_tokens=True)
                prediction = prediction.strip().upper()

            found = standalone_letter.search(prediction)
            letter = found.group(1) if found else ""

            is_correct = (letter == answer_letters[correct_answer])
            correct += int(is_correct)

            results.append({
                "index": idx,
                "subject": example["subject"],
                "question": example["question"],
                "prediction": letter,
                "correct_answer": answer_letters[correct_answer],
                "is_correct": is_correct,
                "raw_output": prediction
            })

        except Exception as e:
            # Best effort: record the failure as a wrong answer and continue.
            print(f"‚ö†Ô∏è Error on item {idx}: {e}")
            results.append({
                "index": idx,
                "subject": example["subject"],
                "question": example["question"],
                "prediction": "ERROR",
                "correct_answer": answer_letters[correct_answer],
                "is_correct": False,
                "raw_output": str(e)
            })

        gc.collect()
        torch.cuda.empty_cache()

    # Guard against an empty dataset instead of dividing by zero.
    total = len(dataset)
    accuracy = correct / total * 100 if total else 0.0
    print(f"‚úÖ {model_name} Accuracy on MMLU (1000 samples): {accuracy:.2f}%")
    return pd.DataFrame(results)

# === Run the MMLU evaluation for each configured model, one CSV per model ===
for name, path in models.items():
    df = evaluate_on_mmlu(name, path, sampled_dataset)
    # File name: the model label with spaces replaced by underscores, lower-cased.
    output_csv = f"{name.replace(' ', '_').lower()}_mmlu_1000sample_results.csv"
    df.to_csv(output_csv, index=False)
