#### Install Unsloth on Colab

In [None]:
%%capture
# Install cell. %%capture (which has to stay on the first line of the cell) hides the pip output.
import os, re
# No environment variable name contains "COLAB_": not running in Colab, so a plain install is used.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # v is the leading numeric part of the installed torch version (e.g. "2.8.0" from "2.8.0+cu126");
    # it selects which xformers build is pinned below.
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    # --no-deps: install these packages without letting pip pull in (or replace) their dependencies.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth

# Pinned transformers / trl versions; these lines run in both branches above.
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2
# Metric packages imported by the evaluation cell (rouge_score, bert_score, evaluate); versions are not pinned.
# NOTE(review): sentence_transformers and spacy are also imported (optionally) by the evaluation cell but are
# not installed here; the semantic-similarity metric is skipped when sentence_transformers is missing —
# confirm they are preinstalled in the target environment if that metric is needed.
!pip install rouge-score
!pip install bert-score
! pip install evaluate

Complete Evaluation Script with Comparison Visualizations

---


Evaluates both the base and fine-tuned models and generates comparison charts.


In [None]:
from unsloth import FastVisionModel
from datasets import load_dataset
import torch
import gc
import numpy as np
import json
import matplotlib.pyplot as plt
from datetime import datetime
from pathlib import Path
from collections import defaultdict

# NLP and evaluation imports with safety checks
# Optional evaluation dependencies.
# Each import is attempted once; the matching *_AVAILABLE flag tells the evaluator
# whether the corresponding metric can be computed. `except Exception` (rather than a
# bare `except:`) lets KeyboardInterrupt / SystemExit propagate, while still degrading
# gracefully when a package is missing or fails to import for any other reason.
try:
    from rouge_score import rouge_scorer
    ROUGE_AVAILABLE = True
except Exception:
    ROUGE_AVAILABLE = False
    print("⚠ rouge_score not available")

try:
    from bert_score import score as bert_score
    BERTSCORE_AVAILABLE = True
except Exception:
    BERTSCORE_AVAILABLE = False
    print("⚠ bert_score not available")

try:
    # `load` is used later to fetch the METEOR metric via evaluate.load('meteor').
    from evaluate import load
    METEOR_AVAILABLE = True
except Exception:
    METEOR_AVAILABLE = False
    print("⚠ evaluate (METEOR) not available")

try:
    from sentence_transformers import SentenceTransformer, util
    SENTENCE_TRANSFORMER_AVAILABLE = True
except Exception:
    SENTENCE_TRANSFORMER_AVAILABLE = False
    print("⚠ sentence_transformers not available")

try:
    import spacy
    SPACY_AVAILABLE = True
except Exception:
    SPACY_AVAILABLE = False
    print("⚠ spacy not available")

try:
    from sklearn.metrics import f1_score
    SKLEARN_AVAILABLE = True
except Exception:
    SKLEARN_AVAILABLE = False
    print("⚠ sklearn not available")

class MedicalEvaluator:
    """Comprehensive medical report evaluator with visualization.

    Scores generated reports against reference reports with optional standard NLP
    metrics (ROUGE, BERTScore, METEOR, sentence-embedding similarity) and with
    keyword-based medical metrics. Every evaluated model's metrics are appended to
    ``self.eval_results`` and written to JSON; ``plot_comparison`` draws base vs
    fine-tuned charts.
    """

    def __init__(self, output_dir="evaluation_results"):
        """Create the output directory and define the keyword vocabularies.

        Args:
            output_dir: Directory where JSON results and plots are written
                (created, including parents, if missing).
        """
        self.eval_results = []
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True, parents=True)

        # Load spacy if available (self.nlp is not used by the methods of this class).
        self.nlp = None
        if SPACY_AVAILABLE:
            try:
                self.nlp = spacy.load("en_core_web_sm")
            except Exception:
                print("⚠ Could not load spacy model")

        # Medical terminology. Entries such as "bronch" or "periton" are stems that
        # are meant to match any word starting with them.
        self.anatomical_terms = {
            "chest": ["lung", "heart", "chest", "thorax", "rib", "sternum",
                     "mediastinum", "pleura", "diaphragm", "trachea", "bronch"],
            "abdomen": ["liver", "kidney", "spleen", "pancreas", "gallbladder",
                       "stomach", "intestine", "bowel", "periton"],
            "head": ["brain", "skull", "sinus", "orbit", "temporal", "frontal",
                    "parietal", "occipital", "cerebral"],
            "spine": ["vertebra", "disc", "spinal", "cervical", "thoracic",
                     "lumbar", "sacral", "coccyx"]
        }

        self.pathology_terms = {
            "normal": ["normal", "unremarkable", "clear", "negative",
                      "no abnormality", "within normal limits", "intact"],
            "abnormal": ["abnormal", "lesion", "mass", "nodule", "opacity",
                        "consolidation", "effusion", "pneumonia", "fracture",
                        "edema", "inflammation", "infection", "tumor",
                        "hemorrhage", "atelectasis"]
        }

        self.severity_terms = ["mild", "moderate", "severe", "marked",
                              "significant", "minimal", "extensive", "diffuse"]

        self.negation_phrases = [
            "no evidence of", "no definite signs of", "without evidence of",
            "no sign of", "not seen", "negative for", "absence of",
            "unremarkable for", "free of", "no demonstration of",
            "there is no", "cannot be identified", "not identified",
            "rule out", "r/o", "unlikely"
        ]

        self.location_terms = ["left", "right", "bilateral", "upper", "lower",
                              "anterior", "posterior", "medial", "lateral",
                              "proximal", "distal", "central", "peripheral"]

    # ------------------------------------------------------------------
    # Small private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _has_term(text_lower, term):
        """Return True if ``term`` occurs in ``text_lower`` at the start of a word.

        A plain substring test would report "normal" inside "abnormal" or "rib"
        inside "described". Requiring a non-word character (or the string start)
        before the term removes those false hits while still letting stems such
        as "bronch" match "bronchial".
        """
        import re  # local import keeps the class independent of other cells' imports
        return re.search(r"(?<!\w)" + re.escape(term), text_lower) is not None

    @staticmethod
    def _mean(values):
        """Arithmetic mean as a plain float; 0.0 for an empty sequence (np.mean gives NaN)."""
        return float(np.mean(values)) if len(values) > 0 else 0.0

    @staticmethod
    def _set_f1(pred_set, ref_set):
        """F1 overlap of two sets; 1.0 when both are empty, 0.0 when only one is."""
        if not pred_set and not ref_set:
            return 1.0
        if not pred_set or not ref_set:
            return 0.0
        overlap = len(pred_set & ref_set)
        if overlap == 0:
            return 0.0
        precision = overlap / len(pred_set)
        recall = overlap / len(ref_set)
        return 2 * precision * recall / (precision + recall)

    @staticmethod
    def _overlap_match(pred_set, ref_set):
        """True when both sets are empty, or the reference is non-empty and shares a term."""
        if not ref_set:
            return not pred_set
        return bool(pred_set & ref_set)

    @staticmethod
    def _entity_terms(entities):
        """All anatomical, pathology, severity and location terms found in one text."""
        return (
            set(entities["anatomical"])
            | {term for term, _ in entities["pathological"]}
            | set(entities["severity"])
            | set(entities["locations"])
        )

    # ------------------------------------------------------------------
    # Medical keyword metrics
    # ------------------------------------------------------------------
    def extract_medical_entities(self, text):
        """Extract and classify medical entities from text.

        Args:
            text: Report text (``None`` is treated as an empty string).

        Returns:
            dict with keys ``anatomical``, ``severity``, ``negations``, ``locations``
            (lists of matched terms) and ``pathological`` (list of
            ``(term, category)`` tuples, category being "normal" or "abnormal").
        """
        text_lower = (text or "").lower()
        found = self._has_term

        return {
            "anatomical": [
                term
                for terms in self.anatomical_terms.values()
                for term in terms
                if found(text_lower, term)
            ],
            "pathological": [
                (term, category)
                for category, terms in self.pathology_terms.items()
                for term in terms
                if found(text_lower, term)
            ],
            "severity": [t for t in self.severity_terms if found(text_lower, t)],
            "negations": [p for p in self.negation_phrases if found(text_lower, p)],
            "locations": [t for t in self.location_terms if found(text_lower, t)],
        }

    def compute_medical_metrics(self, preds, refs):
        """Compute medical-specific evaluation metrics.

        Args:
            preds: Generated report texts.
            refs: Reference report texts, aligned index-by-index with ``preds``.

        Returns:
            dict with the seven medical scores. All scores are 0.0 when there are
            no prediction/reference pairs. ``pathology_classification_f1`` stays
            0.0 when sklearn is unavailable or when the references contain only
            one of the two classes (normal / abnormal).
        """
        medical_scores = {
            "anatomical_f1": 0.0,
            "pathology_classification_f1": 0.0,
            "negation_handling_accuracy": 0.0,
            "severity_matching_accuracy": 0.0,
            "location_accuracy": 0.0,
            "medical_terminology_coverage": 0.0,
            "report_completeness": 0.0
        }

        pred_entities = [self.extract_medical_entities(p) for p in preds]
        ref_entities = [self.extract_medical_entities(r) for r in refs]
        pairs = list(zip(pred_entities, ref_entities))
        n_pairs = len(pairs)
        if n_pairs == 0:
            return medical_scores

        # 1. Anatomical Structure F1
        medical_scores["anatomical_f1"] = self._mean([
            self._set_f1(set(pred_ent["anatomical"]), set(ref_ent["anatomical"]))
            for pred_ent, ref_ent in pairs
        ])

        # 2. Pathology Classification (binary: does the text mention an abnormal finding?)
        if SKLEARN_AVAILABLE:
            pred_pathology = [
                int(any(cat == "abnormal" for _, cat in pred_ent["pathological"]))
                for pred_ent, _ in pairs
            ]
            ref_pathology = [
                int(any(cat == "abnormal" for _, cat in ref_ent["pathological"]))
                for _, ref_ent in pairs
            ]
            if len(set(ref_pathology)) > 1:
                medical_scores["pathology_classification_f1"] = float(f1_score(
                    ref_pathology, pred_pathology, average='weighted'
                ))

        # 3. Negation Handling: both texts agree on whether any negation phrase is present
        negation_matches = sum(
            1 for pred_ent, ref_ent in pairs
            if bool(pred_ent["negations"]) == bool(ref_ent["negations"])
        )
        medical_scores["negation_handling_accuracy"] = negation_matches / n_pairs

        # 4. Severity Matching
        severity_matches = sum(
            1 for pred_ent, ref_ent in pairs
            if self._overlap_match(set(pred_ent["severity"]), set(ref_ent["severity"]))
        )
        medical_scores["severity_matching_accuracy"] = severity_matches / n_pairs

        # 5. Location Accuracy
        location_matches = sum(
            1 for pred_ent, ref_ent in pairs
            if self._overlap_match(set(pred_ent["locations"]), set(ref_ent["locations"]))
        )
        medical_scores["location_accuracy"] = location_matches / n_pairs

        # 6. Medical Terminology Coverage: share of the reference's terms found in the prediction
        terminology_scores = []
        for pred_ent, ref_ent in pairs:
            pred_terms = self._entity_terms(pred_ent)
            ref_terms = self._entity_terms(ref_ent)
            if not ref_terms:
                score = 1.0 if not pred_terms else 0.5
            else:
                score = len(pred_terms & ref_terms) / len(ref_terms)
            terminology_scores.append(score)
        medical_scores["medical_terminology_coverage"] = self._mean(terminology_scores)

        # 7. Report Completeness: mean recall over the entity groups present in the reference
        completeness_scores = []
        for pred_ent, ref_ent in pairs:
            score = 0
            total = 0

            ref_anat = set(ref_ent["anatomical"])
            if ref_anat:
                total += 1
                score += len(set(pred_ent["anatomical"]) & ref_anat) / len(ref_anat)

            ref_path = {term for term, _ in ref_ent["pathological"]}
            if ref_path:
                total += 1
                pred_path = {term for term, _ in pred_ent["pathological"]}
                score += len(pred_path & ref_path) / len(ref_path)

            completeness_scores.append(score / total if total > 0 else 1.0)
        medical_scores["report_completeness"] = self._mean(completeness_scores)

        return medical_scores

    # ------------------------------------------------------------------
    # Standard NLP metrics
    # ------------------------------------------------------------------
    def compute_comprehensive_metrics(self, preds, refs):
        """Compute both standard and medical-specific metrics.

        Optional metrics are only computed when their package imported
        successfully. If a metric raises, the reason is printed and 0.0 is
        recorded so the remaining metrics are still produced.

        Args:
            preds: Generated report texts.
            refs: Reference report texts, aligned with ``preds``.

        Returns:
            dict mapping metric name to score.
        """
        metrics = {}

        # Standard NLP metrics
        if ROUGE_AVAILABLE:
            rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
            rouge_scores = [rouge.score(ref, pred) for pred, ref in zip(preds, refs)]
            metrics['rouge1_f1'] = self._mean([s['rouge1'].fmeasure for s in rouge_scores])
            metrics['rouge2_f1'] = self._mean([s['rouge2'].fmeasure for s in rouge_scores])
            metrics['rougeL_f1'] = self._mean([s['rougeL'].fmeasure for s in rouge_scores])

        # BERTScore
        if BERTSCORE_AVAILABLE:
            try:
                P, R, F1 = bert_score(preds, refs, lang='en', verbose=False)
                metrics['bertscore_f1'] = F1.mean().item()
            except Exception as exc:
                print(f"⚠ BERTScore failed, recording 0.0: {exc}")
                metrics['bertscore_f1'] = 0.0

        # METEOR
        if METEOR_AVAILABLE:
            try:
                meteor = load('meteor')
                metrics['meteor_score'] = meteor.compute(predictions=preds, references=refs)['meteor']
            except Exception as exc:
                print(f"⚠ METEOR failed, recording 0.0: {exc}")
                metrics['meteor_score'] = 0.0

        # Semantic Similarity
        if SENTENCE_TRANSFORMER_AVAILABLE:
            try:
                embedder = SentenceTransformer('all-MiniLM-L6-v2')
                pred_emb = embedder.encode(preds)
                ref_emb = embedder.encode(refs)
                sem_sim = [util.cos_sim(p, r).item() for p, r in zip(pred_emb, ref_emb)]
                metrics['semantic_similarity'] = self._mean(sem_sim)
            except Exception as exc:
                print(f"⚠ Semantic similarity failed, recording 0.0: {exc}")
                metrics['semantic_similarity'] = 0.0

        # Medical-specific metrics
        medical_metrics = self.compute_medical_metrics(preds, refs)
        metrics.update(medical_metrics)

        return metrics

    # ------------------------------------------------------------------
    # Model evaluation
    # ------------------------------------------------------------------
    def _generate_prediction(self, model, tokenizer, sample, temperature):
        """Generate the report text for one sample (first image + text instruction)."""
        user_content = sample["messages"][0]["content"]
        images = [c["image"] for c in user_content if c["type"] == "image"]
        instruction = [c["text"] for c in user_content if c["type"] == "text"][0]

        # Prepare input
        messages = [
            {"role": "user", "content": [
                {"type": "image"},
                {"type": "text", "text": instruction}
            ]}
        ]

        text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
        inputs = tokenizer(images[0], text, add_special_tokens=False, return_tensors="pt").to("cuda")

        # Generate prediction
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=temperature,
                min_p=0.1,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # Drop the prompt tokens so only the newly generated text is decoded.
        prompt_len = inputs.input_ids.shape[1]
        return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    def evaluate_model(self, model, tokenizer, test_samples, model_name, args, num_samples=None):
        """Evaluate model on test samples.

        Every sample contributes exactly one (prediction, reference) pair, so the
        two lists can never drift out of alignment: a sample whose reference text
        cannot be read is skipped, and a sample whose generation fails is scored
        with an empty prediction.

        Args:
            model: Vision-language model; switched to inference mode via
                ``FastVisionModel.for_inference``.
            tokenizer: Processor used for the chat template, input encoding and decoding.
            test_samples: Sequence of samples with a ``messages`` list (user turn
                holding image/text parts, assistant turn holding the reference text).
            model_name: Label stored in the metrics and used for the JSON filename.
            args: Mapping of generation options; only ``temperature`` is read
                (default 1.5). ``None`` is treated as empty.
            num_samples: If given, only the first ``num_samples`` samples are evaluated.

        Returns:
            dict of metrics plus metadata (model name, sample counts, timestamp).
        """
        print(f"\n{'='*60}")
        print(f"Evaluating Model: {model_name}")
        print(f"{'='*60}\n")

        # Prepare model for inference
        FastVisionModel.for_inference(model)

        temperature = (args or {}).get("temperature", 1.5)
        preds, refs = [], []
        failed_count = 0
        skipped_count = 0
        samples_to_eval = test_samples if num_samples is None else test_samples[:num_samples]

        # Generate predictions
        for i, sample in enumerate(samples_to_eval):
            if i % 10 == 0:
                print(f"Processing {i}/{len(samples_to_eval)}...")

            # Read the reference first: without it the sample cannot be scored at all.
            try:
                ref = sample["messages"][1]["content"][0]["text"].strip()
            except Exception as e:
                print(f"⚠ Skipping sample {i} (no reference text): {e}")
                skipped_count += 1
                continue

            try:
                generated = self._generate_prediction(model, tokenizer, sample, temperature)
            except Exception as e:
                print(f"⚠ Error on sample {i}: {e}")
                generated = ""
                failed_count += 1

            refs.append(ref)
            preds.append(generated)

        # Compute metrics
        print(f"\nComputing evaluation metrics...")
        metrics = self.compute_comprehensive_metrics(preds, refs)

        # Add metadata
        metrics.update({
            "model_name": model_name,
            "test_samples_count": len(samples_to_eval),
            "failed_samples_count": failed_count,
            "skipped_samples_count": skipped_count,
            "timestamp": datetime.now().isoformat(),
        })

        # Save results
        self.eval_results.append(metrics)
        self.save_results(f"{model_name}_evaluation.json")

        # Print summary
        self.print_summary(metrics)

        # Cleanup. `del` only drops this function's reference; the caller must drop
        # its own reference for the weights to be released. Collect garbage first so
        # that empty_cache() can return the freed blocks.
        del model
        gc.collect()
        torch.cuda.empty_cache()

        return metrics

    # ------------------------------------------------------------------
    # Visualization
    # ------------------------------------------------------------------
    @staticmethod
    def _plot_grouped_bars(ax, labels, base_vals, ft_vals, title, width=0.35):
        """Draw side-by-side base / fine-tuned bars for one metric group on ``ax``."""
        x = np.arange(len(labels))
        ax.bar(x - width/2, base_vals, width, label='Base Model', color='#FF6B6B', alpha=0.8)
        ax.bar(x + width/2, ft_vals, width, label='Fine-tuned', color='#4ECDC4', alpha=0.8)
        ax.set_ylabel('Score', fontsize=11)
        ax.set_title(title, fontsize=12, fontweight='bold')
        ax.set_xticks(x)
        ax.set_xticklabels(labels, rotation=45, ha='right')
        ax.legend()
        ax.grid(True, alpha=0.3, axis='y')
        ax.set_ylim([0, 1])

    def plot_comparison(self, base_metrics, finetuned_metrics, exp_name="comparison"):
        """Generate comparison visualizations between base and fine-tuned models.

        Draws four panels (standard NLP metrics, medical metrics, relative
        improvement, radar overview), saves them as a PNG in the output directory
        and shows the figure. Missing metrics are plotted as 0.

        Args:
            base_metrics: Metrics dict of the base model.
            finetuned_metrics: Metrics dict of the fine-tuned model.
            exp_name: Label used in the figure title and the output filename.

        Returns:
            Path of the saved PNG.
        """
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle(f'Model Comparison: Base vs Fine-tuned ({exp_name})',
                    fontsize=16, fontweight='bold')

        # 1. Standard NLP Metrics Comparison
        nlp_metrics = ['rouge1_f1', 'rouge2_f1', 'rougeL_f1', 'bertscore_f1',
                      'meteor_score', 'semantic_similarity']
        nlp_labels = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BERTScore', 'METEOR', 'Sem-Sim']
        self._plot_grouped_bars(
            axes[0, 0], nlp_labels,
            [base_metrics.get(m, 0) for m in nlp_metrics],
            [finetuned_metrics.get(m, 0) for m in nlp_metrics],
            'Standard NLP Metrics',
        )

        # 2. Medical-Specific Metrics Comparison
        medical_metrics = ['anatomical_f1', 'pathology_classification_f1',
                          'negation_handling_accuracy', 'severity_matching_accuracy',
                          'location_accuracy', 'medical_terminology_coverage',
                          'report_completeness']
        medical_labels = ['Anatomical', 'Pathology', 'Negation', 'Severity',
                         'Location', 'Terminology', 'Completeness']
        self._plot_grouped_bars(
            axes[0, 1], medical_labels,
            [base_metrics.get(m, 0) for m in medical_metrics],
            [finetuned_metrics.get(m, 0) for m in medical_metrics],
            'Medical-Specific Metrics',
        )

        # 3. Improvement Percentage (reported as 0 when the base score is 0)
        ax3 = axes[1, 0]
        all_metrics = nlp_metrics + medical_metrics
        all_labels = nlp_labels + medical_labels

        improvements = []
        for metric in all_metrics:
            base_val = base_metrics.get(metric, 0)
            ft_val = finetuned_metrics.get(metric, 0)
            if base_val > 0:
                improvement = ((ft_val - base_val) / base_val) * 100
            else:
                improvement = 0
            improvements.append(improvement)

        colors = ['green' if imp > 0 else 'red' for imp in improvements]
        ax3.barh(all_labels, improvements, color=colors, alpha=0.7)
        ax3.set_xlabel('Improvement (%)', fontsize=11)
        ax3.set_title('Relative Improvement Over Base Model', fontsize=12, fontweight='bold')
        ax3.axvline(x=0, color='black', linestyle='-', linewidth=0.8)
        ax3.grid(True, alpha=0.3, axis='x')

        # 4. Radar Chart for Overall Comparison
        # Select key metrics for radar
        radar_metrics = ['rouge1_f1', 'bertscore_f1', 'anatomical_f1',
                        'pathology_classification_f1', 'report_completeness']
        radar_labels = ['ROUGE-1', 'BERTScore', 'Anatomical', 'Pathology', 'Completeness']

        base_radar = [base_metrics.get(m, 0) for m in radar_metrics]
        ft_radar = [finetuned_metrics.get(m, 0) for m in radar_metrics]

        # Repeat the first point so the polygons close.
        angles = np.linspace(0, 2 * np.pi, len(radar_labels), endpoint=False).tolist()
        base_radar += base_radar[:1]
        ft_radar += ft_radar[:1]
        angles += angles[:1]

        # The grid slot was created as a cartesian axes; remove it explicitly before
        # adding the polar one, otherwise both are drawn on top of each other.
        axes[1, 1].remove()
        ax4 = fig.add_subplot(2, 2, 4, projection='polar')
        ax4.plot(angles, base_radar, 'o-', linewidth=2, label='Base Model', color='#FF6B6B')
        ax4.fill(angles, base_radar, alpha=0.25, color='#FF6B6B')
        ax4.plot(angles, ft_radar, 'o-', linewidth=2, label='Fine-tuned', color='#4ECDC4')
        ax4.fill(angles, ft_radar, alpha=0.25, color='#4ECDC4')
        ax4.set_xticks(angles[:-1])
        ax4.set_xticklabels(radar_labels)
        ax4.set_ylim(0, 1)
        ax4.set_title('Overall Performance Comparison', fontsize=12, fontweight='bold', pad=20)
        ax4.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
        ax4.grid(True)

        fig.tight_layout()

        # Save figure
        output_path = self.output_dir / f"{exp_name}_model_comparison.png"
        output_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(output_path, dpi=300, bbox_inches='tight')
        print(f"\n✓ Comparison plots saved to: {output_path}")
        plt.show()
        plt.close(fig)  # release the figure so repeated calls do not accumulate memory

        return output_path

    # ------------------------------------------------------------------
    # Persistence and reporting
    # ------------------------------------------------------------------
    def save_results(self, filename):
        """Save evaluation results.

        Writes the whole ``self.eval_results`` list (every model evaluated so far,
        not only the latest) as JSON to ``output_dir / filename``.

        Args:
            filename: File name relative to the output directory. It may contain
                sub-directories (e.g. when a model name includes "/"); they are
                created as needed.
        """
        filepath = self.output_dir / filename
        filepath.parent.mkdir(parents=True, exist_ok=True)
        with open(filepath, 'w', encoding='utf-8') as f:
            # default=float converts numeric scalars json cannot serialise natively.
            json.dump(self.eval_results, f, indent=2, default=float)
        print(f"\n✓ Results saved to {filepath}")

    def print_summary(self, metrics):
        """Print evaluation summary.

        Args:
            metrics: Metrics dict as returned by ``evaluate_model``; must contain
                ``model_name`` and ``test_samples_count``. Missing scores print as 0.
        """
        print(f"\n{'='*60}")
        print("EVALUATION RESULTS")
        print(f"{'='*60}")
        print(f"Model: {metrics['model_name']}")
        print(f"Samples: {metrics['test_samples_count']}")

        print(f"\nStandard NLP Metrics:")
        print(f"  ROUGE-1 F1:       {metrics.get('rouge1_f1', 0):.4f}")
        print(f"  ROUGE-2 F1:       {metrics.get('rouge2_f1', 0):.4f}")
        print(f"  ROUGE-L F1:       {metrics.get('rougeL_f1', 0):.4f}")
        print(f"  BERTScore F1:     {metrics.get('bertscore_f1', 0):.4f}")
        print(f"  METEOR:           {metrics.get('meteor_score', 0):.4f}")
        print(f"  Semantic Sim:     {metrics.get('semantic_similarity', 0):.4f}")

        print(f"\nMedical-Specific Metrics:")
        print(f"  Anatomical F1:    {metrics.get('anatomical_f1', 0):.4f}")
        print(f"  Pathology F1:     {metrics.get('pathology_classification_f1', 0):.4f}")
        print(f"  Negation Acc:     {metrics.get('negation_handling_accuracy', 0):.4f}")
        print(f"  Severity Acc:     {metrics.get('severity_matching_accuracy', 0):.4f}")
        print(f"  Location Acc:     {metrics.get('location_accuracy', 0):.4f}")
        print(f"  Terminology Cov:  {metrics.get('medical_terminology_coverage', 0):.4f}")
        print(f"  Completeness:     {metrics.get('report_completeness', 0):.4f}")

        print(f"{'='*60}\n")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
# Runs the full evaluation; the captured output below shows it loading the test data and
# evaluating the base model first.
# NOTE(review): run_full_evaluation is not defined in the cells visible here — confirm the
# cell that defines it runs before this one on a fresh kernel.
run_full_evaluation()

Loading test data...


README.md:   0%|          | 0.00/567 [00:00<?, ?B/s]

data/train-00000-of-00004.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

data/train-00001-of-00004.parquet:   0%|          | 0.00/188M [00:00<?, ?B/s]

data/train-00002-of-00004.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

data/train-00003-of-00004.parquet:   0%|          | 0.00/191M [00:00<?, ?B/s]

data/val-00000-of-00001.parquet:   0%|          | 0.00/109M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/215M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2069 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/296 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/590 [00:00<?, ? examples/s]


STEP 1: Evaluating BASE MODEL
==((====))==  Unsloth 2025.10.1: Fast Mllama patching. Transformers: 4.55.4.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]


Evaluating Model: base_model

Processing 0/590...
Processing 10/590...
Processing 20/590...
Processing 30/590...
Processing 40/590...
Processing 50/590...
Processing 60/590...
Processing 70/590...
Processing 80/590...
Processing 90/590...
Processing 100/590...
Processing 110/590...
Processing 120/590...
Processing 130/590...
Processing 140/590...
Processing 150/590...
Processing 160/590...
Processing 170/590...
Processing 180/590...
Processing 190/590...
Processing 200/590...
Processing 210/590...
Processing 220/590...
Processing 230/590...
Processing 240/590...
Processing 250/590...
Processing 260/590...
Processing 270/590...
Processing 280/590...
Processing 290/590...
Processing 300/590...
Processing 310/590...
Processing 320/590...
Processing 330/590...
Processing 340/590...
Processing 350/590...
