# BART Fine Tuning

# Import Statements

In [None]:
!pip install googletrans==4.0.0-rc1


In [None]:
!pip install pytorch-lightning
import torch
import pandas as pd
import numpy as np
import pytorch_lightning as pl
from transformers import BartTokenizer, BartForConditionalGeneration
from torch.utils.data import DataLoader, TensorDataset
#translation and eval
from googletrans import Translator
from sklearn.model_selection import train_test_split
import os

In [None]:
# Mount Google Drive at /content/drive; every data and model path used in
# this notebook lives underneath that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Imports are repeated so this cell can be executed on its own
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

# Checkpoint that every fine-tuning experiment starts from
model_path = '/content/drive/MyDrive/266 Final Project/Our Models/BART_All_Data'

# Prefer the GPU when one is present, otherwise stay on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer, then load the model and place it on the chosen device
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path).to(device)


# Data Processing

In [None]:
# Load lyrics data: every .csv in the song folder is read and stacked into
# a single DataFrame (lyrics_df).
print("Loading lyrics data from Google Drive...")
df_list = []
lyrics_folder_path = "/content/drive/My Drive/266 Final Project/Song Files"
# os.listdir returns entries in arbitrary order. Sorting fixes the row order of
# lyrics_df, which the seeded train_test_split relies on to be reproducible.
for filename in sorted(os.listdir(lyrics_folder_path)):
    if filename.endswith('.csv'):
        file_path = os.path.join(lyrics_folder_path, filename)
        df = pd.read_csv(file_path)
        df_list.append(df)

# pd.concat raises a generic ValueError on an empty list; fail with a message
# that names the folder instead.
if not df_list:
    raise ValueError(f"No .csv files found in {lyrics_folder_path}")

lyrics_df = pd.concat(df_list, ignore_index=True)

In [None]:
# Load poetry data: the three PoemSum splits are read in the order listed
# below and stacked into one DataFrame (poem_df).
print("\nLoading poetry data...")
poetry_files = dict(
    test="/content/drive/My Drive/266 Final Project/PoemSum Model/poemsum_test.csv",
    train="/content/drive/My Drive/266 Final Project/PoemSum Model/poemsum_train.csv",
    valid="/content/drive/My Drive/266 Final Project/PoemSum Model/poemsum_valid.csv",
)

poem_list = []
for dataset_type in poetry_files:
    print(f"Loading poetry {dataset_type} dataset")
    poem_list.append(pd.read_csv(poetry_files[dataset_type]))

poem_df = pd.concat(poem_list, ignore_index=True)


In [None]:
# Split datasets: both corpora use the same 80/20 ratio and the same seed
print("\nSplitting datasets...")
split_options = {"test_size": 0.2, "random_state": 42}
train_val_df, test_df = train_test_split(lyrics_df, **split_options)
train_val_poem, test_poem = train_test_split(poem_df, **split_options)

# Report the size of each partition
for corpus, kept, held_out in (
    ("Lyrics", train_val_df, test_df),
    ("Poetry", train_val_poem, test_poem),
):
    print(f"{corpus} data split - Training+Validation: {len(kept)}, Test: {len(held_out)}")

# Data Augmentation Experiment Functions

In [None]:
# Data Augmentation Functions
def backtranslate(text: str, src_lang: str = "en", tgt_lang: str = "fr") -> str:
    """Paraphrase ``text`` by translating it to ``tgt_lang`` and back.

    This is a best-effort augmentation: any failure (client construction,
    network, service response) is reported on stdout and the input is
    returned unchanged.

    Args:
        text: Text to paraphrase.
        src_lang: Language code of ``text``.
        tgt_lang: Pivot language code used for the round trip.

    Returns:
        The back-translated text, or ``text`` itself when there is nothing to
        translate or the round trip fails or yields an empty result.
    """
    # Skip the two network round trips when there is nothing to translate
    # (non-string values such as NaN, empty or whitespace-only strings).
    if not isinstance(text, str) or not text.strip():
        return text

    try:
        # Built inside the try so a client construction failure also falls
        # back to the original text.
        translator = Translator()
        translated = translator.translate(text, src=src_lang, dest=tgt_lang).text
        back_translated = translator.translate(translated, src=tgt_lang, dest=src_lang).text
    except Exception as e:
        print(f"Backtranslation error: {e}")
        return text

    # An empty result would wipe out the training example; keep the original.
    return back_translated or text

def synonym_replacement(text: str, synonym_dict: dict) -> str:
    """Swap each whitespace-separated word for its entry in ``synonym_dict``.

    Words without an entry are kept as they are. The result is re-joined with
    single spaces, so runs of whitespace in ``text`` are collapsed.
    """
    swapped = (synonym_dict.get(token, token) for token in text.split())
    return " ".join(swapped)


# Data Module

In [None]:
class BARTDataModule(pl.LightningDataModule):
    """Tokenize source/target text pairs up front and serve them as batches.

    Each batch is a ``(input_ids, attention_mask, labels)`` tuple, which is
    the order ``BARTLitModel`` unpacks in its training/validation steps.

    Args:
        train_df: DataFrame with ``source`` and ``target`` columns.
        val_df: Optional DataFrame with the same columns.
        tokenizer: Tokenizer called on lists of strings.
        batch_size: Batch size for both loaders.
        max_length: Truncation length applied to sources and targets.
        augment: When true, training sources are replaced by an augmented
            version (back-translation with probability 0.3, otherwise the
            synonym replacement).
    """

    def __init__(self, train_df, val_df=None, tokenizer=None, batch_size=16, max_length=512, augment=False):
        super().__init__()
        self.train_df = train_df
        self.val_df = val_df  # Validation data is optional
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_length = max_length
        self.augment = augment

    def setup(self, stage=None):
        """Apply the optional augmentation and tokenize the data."""
        train_df = self.train_df
        if self.augment:
            synonym_dict = {"example": "sample", "text": "content"}  # Example synonym dictionary
            # Augment a copy: writing into self.train_df would alter the
            # caller's frame and make a second setup() call augment text that
            # was already augmented.
            train_df = train_df.copy()
            train_df['source'] = train_df['source'].apply(
                lambda x: backtranslate(x) if np.random.rand() < 0.3 else synonym_replacement(x, synonym_dict)
            )
        self.train_encodings = self._encode_data(train_df)
        if self.val_df is not None:
            self.val_encodings = self._encode_data(self.val_df)

    def _tokenize(self, texts):
        """Tokenize a list of strings into padded, truncated tensors."""
        return self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

    def _encode_data(self, df):
        """Return input ids, attention mask and label ids for ``df``.

        ``df`` itself is not modified; targets are converted to strings on
        the fly so non-string values can be tokenized.
        """
        sources = df['source'].tolist()
        targets = df['target'].astype(str).tolist()

        source_encodings = self._tokenize(sources)
        target_encodings = self._tokenize(targets)  # tokenized once only
        return {
            'input_ids': source_encodings['input_ids'],
            'attention_mask': source_encodings['attention_mask'],
            'labels': target_encodings['input_ids']
        }

    @staticmethod
    def _as_dataset(encodings):
        """Wrap encoded tensors in the tuple order the model steps expect."""
        return TensorDataset(
            encodings['input_ids'],
            encodings['attention_mask'],
            encodings['labels']
        )

    def train_dataloader(self):
        """Shuffled loader over the encoded training data."""
        return DataLoader(self._as_dataset(self.train_encodings), batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Loader over the encoded validation data, or None when there is none."""
        # NOTE(review): None is returned when no validation frame was given;
        # confirm the installed Lightning version accepts that.
        if self.val_df is None:
            return None
        return DataLoader(self._as_dataset(self.val_encodings), batch_size=self.batch_size)


# Model Definition

In [None]:
# Define Model
class BARTLitModel(pl.LightningModule):
    """Lightning wrapper that fine-tunes a seq2seq model on (source, target) batches.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``loss`` and ``logits``.
        learning_rate: AdamW learning rate.
        use_label_smoothing: Train with ``label_smoothing_loss`` instead of
            the loss returned by the model.
        smoothing: Probability mass spread uniformly over the vocabulary when
            label smoothing is enabled.
    """

    # Label value excluded from the loss
    IGNORE_INDEX = -100

    def __init__(self, model, learning_rate=2e-5, use_label_smoothing=False, smoothing=0.1):
        super().__init__()
        self.model = model
        self.learning_rate = learning_rate
        self.use_label_smoothing = use_label_smoothing
        self.smoothing = smoothing

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return its raw output object."""
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        return outputs

    def _mask_padding(self, labels):
        """Replace padding ids in ``labels`` with ``IGNORE_INDEX``.

        The batches carry padded target ids as labels; without this step the
        padding positions are scored like real tokens. Labels are returned
        unchanged when the model exposes no ``config.pad_token_id``.
        """
        config = getattr(self.model, "config", None)
        pad_token_id = getattr(config, "pad_token_id", None)
        if pad_token_id is None:
            return labels
        return labels.masked_fill(labels == pad_token_id, self.IGNORE_INDEX)

    def label_smoothing_loss(self, logits, labels):
        """Label-smoothed cross entropy averaged over non-ignored positions.

        Per token this equals ``-(smoothed_one_hot * log_probs).sum(-1)`` with
        ``smoothed_one_hot = one_hot * (1 - smoothing) + smoothing / vocab``,
        but it is computed with a gather so no tensor of one-hot floats the
        size of the logits has to be allocated. Positions whose label is
        ``IGNORE_INDEX`` do not contribute; when no label is ignored the
        result is the plain mean over all positions.
        """
        vocab_size = logits.size(-1)
        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)

        keep = labels != self.IGNORE_INDEX
        # gather needs valid indices; ignored positions are zeroed out below
        safe_labels = labels.masked_fill(~keep, 0)

        target_term = -log_probs.gather(-1, safe_labels.unsqueeze(-1)).squeeze(-1)
        uniform_term = -log_probs.sum(dim=-1) / vocab_size
        per_token = (1 - self.smoothing) * target_term + self.smoothing * uniform_term

        weights = keep.to(per_token.dtype)
        return (per_token * weights).sum() / weights.sum().clamp(min=1)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        input_ids, attention_mask, labels = batch
        labels = self._mask_padding(labels)
        outputs = self(input_ids, attention_mask, labels)
        loss = outputs.loss

        # Apply label smoothing if enabled
        if self.use_label_smoothing:
            loss = self.label_smoothing_loss(outputs.logits, labels)

        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        input_ids, attention_mask, labels = batch
        labels = self._mask_padding(labels)
        outputs = self(input_ids, attention_mask, labels)
        val_loss = outputs.loss

        self.log('val_loss', val_loss)
        return val_loss

    def configure_optimizers(self):
        """AdamW over all parameters with the configured learning rate."""
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)

# Training Functions

In [None]:
# Experiment Runner
def run_experiment(
    experiment_name,
    train_df,
    val_df,
    model,
    tokenizer,
    batch_size,
    max_length,
    learning_rate,
    drive_path,
    augment=False,
    use_label_smoothing=False,
    max_epochs=3
):
    """Fine-tune ``model`` for one experiment and save the result.

    The model and tokenizer are written to ``fine_tuned_model`` and
    ``fine_tuned_tokenizer`` inside ``<drive_path>/<experiment name with
    spaces replaced by underscores>``.

    Args:
        experiment_name: Human-readable name; also determines the folder name.
        train_df: Training DataFrame with ``source`` / ``target`` columns.
        val_df: Validation DataFrame with the same columns, or None.
        model: Model to fine-tune. It is trained in place.
        tokenizer: Tokenizer used for encoding and saved next to the model.
        batch_size: Batch size for both loaders.
        max_length: Truncation length for sources and targets.
        learning_rate: Optimizer learning rate.
        drive_path: Root folder for experiment outputs.
        augment: Enable the data module's source augmentation.
        use_label_smoothing: Train with the label-smoothing loss.
        max_epochs: Number of training epochs (defaults to the previous
            hard-coded value of 3).
    """
    print(f"Running Experiment: {experiment_name}")

    # Set up experiment directory in Google Drive
    experiment_dir = os.path.join(drive_path, experiment_name.replace(" ", "_"))
    os.makedirs(experiment_dir, exist_ok=True)

    # The data module gets its own copies so column edits made while
    # encoding or augmenting never reach the caller's frames.
    data_module = BARTDataModule(
        train_df=train_df.copy(),
        val_df=val_df.copy() if val_df is not None else None,
        tokenizer=tokenizer,
        batch_size=batch_size,
        max_length=max_length,
        augment=augment
    )

    # Initialize Lightning module with label smoothing logic
    bart_model = BARTLitModel(model=model, learning_rate=learning_rate, use_label_smoothing=use_label_smoothing)

    use_gpu = torch.cuda.is_available()

    # Set up Trainer. One device is requested on both GPU and CPU: the old
    # CPU fallback passed devices=None, which is not a valid device count.
    trainer = pl.Trainer(
        max_epochs=max_epochs,
        devices=1,
        accelerator="gpu" if use_gpu else "cpu",
        gradient_clip_val=1.0,
        log_every_n_steps=10,
        default_root_dir=experiment_dir,
        enable_checkpointing=False  # Only the final weights are saved below
    )

    # Fine-tune the model
    trainer.fit(bart_model, datamodule=data_module)

    # Save the model and tokenizer
    model_save_path = os.path.join(experiment_dir, "fine_tuned_model")
    tokenizer_save_path = os.path.join(experiment_dir, "fine_tuned_tokenizer")
    bart_model.model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(tokenizer_save_path)

    print(f"Experiment '{experiment_name}' results saved to {experiment_dir}")

In [None]:
# Give both lyric splits the generic column names used by the training code
column_names = {"Lyrics": "source", "Combined Annotations": "target"}
for frame in (train_val_df, test_df):
    frame.rename(columns=column_names, inplace=True)


In [None]:
# Sanity-check the training sources before tokenization
source_column = train_val_df['source']
print(source_column.isnull().sum())  # Number of null entries
print(source_column.apply(lambda value: isinstance(value, str)).value_counts())  # str vs non-str counts


In [None]:
def is_tokenizable(text):
    """Return True when the global tokenizer accepts ``text``.

    The tokenizer is invoked with padding, truncation to 512 tokens and
    tensor output; a ValueError raised by that call marks the value as not
    tokenizable. Any other exception propagates.
    """
    encode_options = dict(padding=True, truncation=True, max_length=512, return_tensors='pt')
    try:
        tokenizer(text, **encode_options)
    except ValueError:
        return False
    else:
        return True


In [None]:
# Flag every training row whose source text the tokenizer accepts
train_val_df['is_valid'] = train_val_df['source'].apply(is_tokenizable)

# Keep only the flagged rows and drop the helper column from the result
tokenizable_train_rows = train_val_df.loc[train_val_df['is_valid']]
valid_train_val_df = tokenizable_train_rows.drop(columns='is_valid')


In [None]:
# Same validity filter for the held-out test split
test_df['is_valid'] = test_df['source'].apply(is_tokenizable)
tokenizable_test_rows = test_df.loc[test_df['is_valid']]
valid_test_df = tokenizable_test_rows.drop(columns='is_valid')


In [None]:
# Report how many rows the tokenizability filter removed from each split
for split_label, before, after in (
    ("training", train_val_df, valid_train_val_df),
    ("test", test_df, valid_test_df),
):
    print(f"Original {split_label} rows: {len(before)}")
    print(f"Valid {split_label} rows: {len(after)}")
    print(f"Removed rows: {len(before) - len(after)}")


In [None]:
# Define Experiments
# The label-smoothing experiment could not be run due to memory limits; it and
# the base run are kept here, commented out, as a record of their settings.
experiments = [
    #{"name": "Base Fine-Tuning", "batch_size": 16, "max_length": 512, "learning_rate": 2e-5},
    {"name": "Hyperparameter Tuning", "batch_size": 8, "max_length": 256, "learning_rate": 5e-5},
    {"name": "Data Augmentation", "batch_size": 16, "max_length": 512, "learning_rate": 2e-5, "augment": True},
    #{"name": "Loss Function Experiment", "batch_size": 16, "max_length": 512, "learning_rate": 2e-5, "use_label_smoothing": True}
]

# Set Drive Path
drive_path = "/content/drive/MyDrive/266 Final Project/Our Models/BART Fine Tuned"

# run_experiment trains `model` in place. Keep a CPU copy of the starting
# weights so every experiment begins from the same checkpoint rather than
# from the previous experiment's result.
base_state_dict = {
    name: tensor.detach().cpu().clone()
    for name, tensor in model.state_dict().items()
}

# Run Experiments
for exp in experiments:
    # Reset to the starting checkpoint before each run
    model.load_state_dict(base_state_dict)

    # Pass a subset of the training data as validation data. The split is
    # redone per experiment so each run receives fresh frames.
    train_data, val_data = train_test_split(valid_train_val_df, test_size=0.2, random_state=42)
    run_experiment(
        experiment_name=exp["name"],
        train_df=train_data,
        val_df=val_data,
        model=model,
        tokenizer=tokenizer,
        batch_size=exp["batch_size"],
        max_length=exp["max_length"],
        learning_rate=exp["learning_rate"],
        drive_path=drive_path,
        augment=exp.get("augment", False),
        use_label_smoothing=exp.get("use_label_smoothing", False)
    )

## Evaluation

In [None]:
!pip install bert_score
from transformers import BartForConditionalGeneration, BartTokenizer
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import Dict, List, Tuple
from bert_score import score
from rouge_score import rouge_scorer
import torch


In [None]:
# Checking test_df: preview the first rows
test_df.head()

In [None]:
# Missing values per column
print("NaN values in test_df:")
print(test_df.isna().sum())

# Column dtypes
print("\nData types:")
print(test_df.dtypes)

# Clean the text columns: missing values become '' and everything is a str
for text_column in ('source', 'target'):
    test_df[text_column] = test_df[text_column].fillna('').astype(str)

# Count rows whose text is empty after cleaning
print("\nNumber of empty lyrics:", test_df['source'].eq('').sum())
print("Number of empty annotations:", test_df['target'].eq('').sum())

In [None]:
# Second cleaning pass with before/after diagnostics
print("Initial data shape:", test_df.shape)

# Targets: fill missing values, then force the string type
test_df['target'] = test_df['target'].fillna('').astype(str)

# Sources: force the string type
test_df['source'] = test_df['source'].astype(str)

# Remove rows where either source or target is empty (optional)
# test_df = test_df[test_df['source'].str.strip() != '']
# test_df = test_df[test_df['target'].str.strip() != '']

# Re-check the cleaned frame
print("\nAfter cleaning:")
print("Number of empty lyrics:", test_df['source'].eq('').sum())
print("Number of empty annotations:", test_df['target'].eq('').sum())
print("Final data shape:", test_df.shape)

# Show the first two text pairs
print("\nSample data check:")
print(test_df[['source', 'target']].head(2))

In [None]:
def evaluate_supervised_model(
    model: BartForConditionalGeneration,
    tokenizer: BartTokenizer,
    test_data: pd.DataFrame,
    batch_size: int = 16
) -> Tuple[Dict[str, float], List[Dict]]:
    """
    Evaluate supervised lyrics model comparing against Genius annotations using BART

    For each batch of ``test_data`` a summary is generated per ``source`` text
    and scored against the lyric itself (content coverage, token-overlap
    similarity) and against the ``target`` annotation (ROUGE-1/2/L F-measure
    and BERTScore F1).

    Args:
        model: Model used for generation; moved to the GPU when available.
        tokenizer: Tokenizer matching ``model``.
        test_data: DataFrame with string ``source`` and ``target`` columns.
        batch_size: Number of rows generated per forward pass.

    Returns:
        ``(metrics, examples)`` where ``metrics`` maps ``avg_*`` names to the
        mean of each score over all rows and ``examples`` holds the first
        five rows with their texts and individual scores.
    """
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    evaluation_results = {
        'content_coverage': [],
        'semantic_similarity': [],
        'rouge1_scores': [],
        'rouge2_scores': [],
        'rougeL_scores': [],
        'bert_scores': []
    }

    examples = []

    for idx in tqdm(range(0, len(test_data), batch_size)):
        batch_lyrics = test_data['source'].iloc[idx:idx + batch_size].tolist()
        batch_annotations = test_data['target'].iloc[idx:idx + batch_size].tolist()

        # Generate summaries (BART-specific encoding)
        # NOTE(review): the training data module in this notebook tokenizes
        # the raw source text without this instruction prefix - confirm the
        # prefix is intended at evaluation time.
        inputs = tokenizer(
            [f"summarize lyrics and capture meaning: {lyric}" for lyric in batch_lyrics],
            padding=True,
            truncation=True,
            max_length=1024,
            return_tensors="pt"
        ).to(device)

        # do_sample=True means generation samples tokens, so scores can
        # differ between runs unless the random seed is fixed.
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=150,
                min_length=50,
                num_beams=4,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                no_repeat_ngram_size=3,
                length_penalty=1.0,
                repetition_penalty=1.2
            )

        generated_summaries = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # BERTScore (between generated summaries and reference annotations).
        # One call scores the whole batch, so every row gets its own F1
        # instead of inheriting the value of an earlier row.
        _, _, bert_f1 = score(generated_summaries, batch_annotations, lang='en', verbose=False)
        batch_bert_scores = bert_f1.tolist()

        # Evaluate each summary against its lyric and its reference annotation
        for original_lyric, generated_summary, reference_annotation, bert_f1_score in zip(
            batch_lyrics, generated_summaries, batch_annotations, batch_bert_scores
        ):
            # Content Coverage (between summary and lyrics)
            coverage_score = calculate_content_coverage(original_lyric, generated_summary)
            evaluation_results['content_coverage'].append(coverage_score)

            # Semantic Similarity (between summary and lyrics)
            semantic_score = calculate_semantic_similarity(original_lyric, generated_summary)
            evaluation_results['semantic_similarity'].append(semantic_score)

            # ROUGE Scores (between generated summary and reference annotation)
            rouge_scores = calculate_rouge_scores([generated_summary, reference_annotation])
            evaluation_results['rouge1_scores'].append(rouge_scores['rouge1'])
            evaluation_results['rouge2_scores'].append(rouge_scores['rouge2'])
            evaluation_results['rougeL_scores'].append(rouge_scores['rougeL'])

            evaluation_results['bert_scores'].append(bert_f1_score)

            # Store the first five rows as qualitative examples
            if len(examples) < 5:
                examples.append({
                    'lyrics': original_lyric,
                    'reference_annotation': reference_annotation,
                    'generated_summary': generated_summary,
                    'metrics': {
                        'content_coverage': coverage_score,
                        'semantic_similarity': semantic_score,
                        'rouge1': rouge_scores['rouge1'],
                        'rouge2': rouge_scores['rouge2'],
                        'rougeL': rouge_scores['rougeL'],
                        'bert_score': bert_f1_score
                    }
                })

        # Memory cleanup (idx advances in steps of batch_size, so this runs
        # only on batches whose start offset is a multiple of 5)
        if idx % 5 == 0:
            torch.cuda.empty_cache()

    # Aggregate results
    metrics = {
        'avg_content_coverage': np.mean(evaluation_results['content_coverage']),
        'avg_semantic_similarity': np.mean(evaluation_results['semantic_similarity']),
        'avg_rouge1': np.mean(evaluation_results['rouge1_scores']),
        'avg_rouge2': np.mean(evaluation_results['rouge2_scores']),
        'avg_rougeL': np.mean(evaluation_results['rougeL_scores']),
        'avg_bert_score': np.mean(evaluation_results['bert_scores'])
    }

    return metrics, examples

def calculate_rouge_scores(texts: List[str]) -> Dict[str, float]:
    """Return ROUGE-1, ROUGE-2 and ROUGE-L F-measures for a pair of texts.

    ``texts[0]`` and ``texts[1]`` are the two texts being compared; stemming
    is enabled.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    first_text, second_text = texts[0], texts[1]
    results = scorer.score(first_text, second_text)
    return {rouge_type: results[rouge_type].fmeasure for rouge_type in rouge_types}

def calculate_content_coverage(lyrics: str, summary: str) -> float:
    """Fraction of distinct lyric words that also occur in the summary.

    Words are lower-cased and split on whitespace. Returns 0.0 when either
    argument is a float (e.g. NaN), when the lyrics contain no words, or when
    processing fails.
    """
    if any(isinstance(value, float) for value in (lyrics, summary)):
        return 0.0

    try:
        lyric_words = set(str(lyrics).lower().split())
        summary_words = set(str(summary).lower().split())
        if not lyric_words:
            return 0.0
        return len(lyric_words & summary_words) / len(lyric_words)
    except Exception as e:
        print(f"Error processing lyrics/summary: {e}")
        return 0.0

def calculate_semantic_similarity(lyrics: str, summary: str) -> float:
    """Token-overlap similarity: shared distinct words over all distinct words.

    Words are lower-cased and split on whitespace. Returns 0.0 when either
    argument is a float (e.g. NaN), when neither text has any words, or when
    processing fails.
    """
    if any(isinstance(value, float) for value in (lyrics, summary)):
        return 0.0

    try:
        lyric_words = set(str(lyrics).lower().split())
        summary_words = set(str(summary).lower().split())
        shared_count = len(lyric_words & summary_words)
        total_count = len(lyric_words | summary_words)
        if total_count > 0:
            return shared_count / total_count
        return 0.0
    except Exception as e:
        print(f"Error processing lyrics/summary: {e}")
        return 0.0

def print_evaluation_results(metrics: Dict[str, float], examples: List[Dict]):
    """Print the averaged metrics followed by each stored example.

    Lyrics are shown truncated to their first 200 characters; all scores are
    printed with three decimals.
    """
    summary_rows = (
        ("Average Content Coverage", 'avg_content_coverage'),
        ("Average Semantic Similarity", 'avg_semantic_similarity'),
        ("Average ROUGE-1", 'avg_rouge1'),
        ("Average ROUGE-2", 'avg_rouge2'),
        ("Average ROUGE-L", 'avg_rougeL'),
        ("Average BERTScore", 'avg_bert_score'),
    )

    print("\nEvaluation Results:")
    for label, key in summary_rows:
        print(f"{label}: {metrics[key]:.3f}")

    print("\nExample Generations:")
    for number, example in enumerate(examples, start=1):
        print(f"\nExample {number}:")
        lyric_preview = example['lyrics'][:200]
        print(f"Original Lyrics (truncated): {lyric_preview}...")
        print(f"\nReference Annotation: {example['reference_annotation']}")
        print(f"\nGenerated Summary: {example['generated_summary']}")
        print("\nMetrics:")
        for metric_name, metric_value in example['metrics'].items():
            print(f"{metric_name}: {metric_value:.3f}")

In [None]:
def run_evaluation(model_path: str, tokenizer_path: str, test_df: pd.DataFrame, save_dir: str):
    """
    Evaluate one saved model on ``test_df`` and write the results as CSV.

    Loads the tokenizer and model from the given paths, runs
    ``evaluate_supervised_model`` with a batch size of 16, then writes
    ``evaluation_metrics.csv`` and ``evaluation_examples.csv`` into
    ``save_dir`` (created if missing). Returns ``(metrics, examples)``.
    """
    tokenizer = BartTokenizer.from_pretrained(tokenizer_path)
    model = BartForConditionalGeneration.from_pretrained(model_path)
    print(f"Model loaded from {model_path}, tokenizer loaded from {tokenizer_path}!")

    metrics, examples = evaluate_supervised_model(model, tokenizer, test_data=test_df, batch_size=16)

    # Make sure the output folder is there before writing
    os.makedirs(save_dir, exist_ok=True)

    # Averaged metrics: a single-row table
    metrics_file = os.path.join(save_dir, "evaluation_metrics.csv")
    pd.DataFrame([metrics]).to_csv(metrics_file, index=False)
    print(f"Saved evaluation metrics to: {metrics_file}")

    # Example generations: one row per stored example
    examples_file = os.path.join(save_dir, "evaluation_examples.csv")
    pd.DataFrame(examples).to_csv(examples_file, index=False)
    print(f"Saved evaluation examples to: {examples_file}")

    return metrics, examples

# Evaluation code for one song

In [None]:
import os
from transformers import BartForConditionalGeneration, BartTokenizer
import pandas as pd

# Define paths
root_path = "/content/drive/MyDrive/266 Final Project/Our Models/BART Fine Tuned"
output_path = "/content/drive/MyDrive/266 Final Project/Evaluation Results"

# List of experiment directories
experiment_dirs = [
    os.path.join(root_path, "Base_Fine-Tuning"),
    os.path.join(root_path, "Data_Augmentation"),
    os.path.join(root_path, "Hyperparameter_Tuning")
]

# Ensure output directory exists
os.makedirs(output_path, exist_ok=True)

# Assuming `test_df` is already loaded as a DataFrame
# Replace with the actual loading method if needed (e.g., pd.read_csv)

# Find song ID 141615 in the test dataset
song_id = 141615
song_row = test_df[test_df['Song ID'] == song_id]

if song_row.empty:
    print(f"Song ID {song_id} not found in the test dataset.")
else:
    # An earlier cell renames 'Lyrics' to 'source' in place; accept either
    # name so this cell works before and after that rename.
    lyrics_column = 'source' if 'source' in song_row.columns else 'Lyrics'
    lyrics = song_row.iloc[0][lyrics_column]
    print(f"Running evaluation for Song ID {song_id}...\n")

    # Iterate over experiment directories
    for experiment_dir in experiment_dirs:
        print(f"Evaluating model from: {experiment_dir}")

        # run_experiment saves into these two subfolders, not into the
        # experiment folder itself.
        model_dir = os.path.join(experiment_dir, "fine_tuned_model")
        tokenizer_dir = os.path.join(experiment_dir, "fine_tuned_tokenizer")
        if not (os.path.isdir(model_dir) and os.path.isdir(tokenizer_dir)):
            print(f"Skipping {experiment_dir}: no saved model/tokenizer found.")
            continue

        # Load tokenizer and model
        tokenizer = BartTokenizer.from_pretrained(tokenizer_dir)
        model = BartForConditionalGeneration.from_pretrained(model_dir)

        # Tokenize the lyrics
        inputs = tokenizer(lyrics, return_tensors="pt", max_length=1024, truncation=True)

        # Generate annotation
        outputs = model.generate(inputs['input_ids'], max_length=150, num_beams=5, early_stopping=True)
        generated_annotation = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Print the generated annotation
        print(f"Generated Annotation for Song ID {song_id} from {os.path.basename(experiment_dir)}:\n{generated_annotation}\n")


# Evaluation code for test df

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from tqdm import tqdm
from typing import Dict, List, Tuple
from bert_score import score
from rouge_score import rouge_scorer

# Where the fine-tuned models live and where evaluation output is written
root_path = "/content/drive/MyDrive/266 Final Project/Our Models/BART Fine Tuned"
output_path = "/content/drive/MyDrive/266 Final Project/Evaluation Results"

# One folder per experiment under root_path
experiment_dirs = [
    os.path.join(root_path, folder_name)
    for folder_name in ("Base_Fine-Tuning", "Data_Augmentation", "Hyperparameter_Tuning")
]

# Create the output folder when it is missing
os.makedirs(output_path, exist_ok=True)

def run_evaluation(model_path: str, tokenizer_path: str, test_df: pd.DataFrame, save_dir: str):
    """Evaluate the model stored at ``model_path`` and save the results.

    Runs ``evaluate_supervised_model`` on ``test_df`` with a batch size of 16
    and writes ``evaluation_metrics.csv`` and ``evaluation_examples.csv``
    into ``save_dir`` (created if missing). Returns ``(metrics, examples)``.
    """
    # Tokenizer first, then the model
    tokenizer = BartTokenizer.from_pretrained(tokenizer_path)
    model = BartForConditionalGeneration.from_pretrained(model_path)
    print(f"Model loaded from {model_path}, tokenizer loaded from {tokenizer_path}!")

    metrics, examples = evaluate_supervised_model(
        model=model,
        tokenizer=tokenizer,
        test_data=test_df,
        batch_size=16
    )

    os.makedirs(save_dir, exist_ok=True)

    # Write the two result tables one after the other: the single-row metrics
    # table, then the per-example table.
    outputs_to_save = (
        ("Metrics", [metrics], "evaluation_metrics.csv"),
        ("Examples", examples, "evaluation_examples.csv"),
    )
    for label, records, file_name in outputs_to_save:
        csv_path = os.path.join(save_dir, file_name)
        pd.DataFrame(records).to_csv(csv_path, index=False)
        print(f"{label} saved to {csv_path}")

    return metrics, examples

# Evaluate every experiment folder; a failure in one is reported and the
# loop moves on to the next.
for experiment_dir in experiment_dirs:
    experiment_name = os.path.basename(experiment_dir)
    model_path, tokenizer_path = (
        os.path.join(experiment_dir, subfolder)
        for subfolder in ("fine_tuned_model", "fine_tuned_tokenizer")
    )
    save_dir = os.path.join(output_path, experiment_name)

    print(f"\nEvaluating experiment: {experiment_name}")

    try:
        metrics, examples = run_evaluation(model_path, tokenizer_path, test_df, save_dir)
    except Exception as e:
        print(f"Error evaluating {experiment_name}: {e}")