# Imports and Setup

In [None]:
# Install required packages
!pip install -q transformers evaluate rouge_score bert-score datasets
!pip install psutil gputil

# Import required libraries
import evaluate
from transformers import (
    PegasusTokenizer,
    PegasusForConditionalGeneration,
    PegasusConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
import torch
import pandas as pd
import numpy as np
import re
from pprint import pprint
from datasets import Dataset, Features, Value
from tqdm import tqdm
from bert_score import score
from typing import Dict, List, Tuple
import logging
from rouge_score import rouge_scorer

import gc
import psutil
import GPUtil
from typing import Optional
import os

# Setup logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Add environment configurations
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver d

In [None]:
# Setup device and mount drive
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

from google.colab import drive
drive.mount('/content/drive')

Using device: cuda
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Set random seeds
torch.manual_seed(42)
np.random.seed(42)
os.environ["WANDB_DISABLED"] = "true"

# Memory Manager Class

In [None]:
class MemoryManager:
    """Memory monitoring and management"""
    @staticmethod
    def get_gpu_memory_info():
        try:
            gpu = GPUtil.getGPUs()[0]
            return {
                'total': gpu.memoryTotal,
                'used': gpu.memoryUsed,
                'free': gpu.memoryFree
            }
        except Exception:
            return None

    @staticmethod
    def clear_memory():
        """Aggressive memory cleanup"""
        gc.collect()
        torch.cuda.empty_cache()
        if torch.cuda.is_available():
            try:
                torch.cuda.reset_peak_memory_stats()
            except Exception:
                pass

    @staticmethod
    def check_memory_status() -> bool:
        """Check if memory usage is within safe limits"""
        if torch.cuda.is_available():
            gpu_info = MemoryManager.get_gpu_memory_info()
            if gpu_info and (gpu_info['free'] < 4000):
                return False
        return True

# Data Pre-Processor Class

In [None]:
class MultiSourceDataPreprocessor:
    def __init__(
        self,
        lyrics_folder: str,
        poem_train_path: str,
        poem_valid_path: str,
        model_name: str = "google/pegasus-cnn_dailymail"
    ):
        self.lyrics_folder = lyrics_folder
        self.poem_train_path = poem_train_path
        self.poem_valid_path = poem_valid_path
        self.model_name = model_name
        self.tokenizer = PegasusTokenizer.from_pretrained(model_name)
        logger.info("Initialized MultiSourceDataPreprocessor")

    def load_all_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
      """Load and preprocess all data sources"""
      logger.info("Loading all data sources...")
      # Load lyrics data
      lyrics_df = self._load_lyrics_data()

      # Load poem data
      poem_train_df = pd.read_csv(self.poem_train_path)
      poem_valid_df = pd.read_csv(self.poem_valid_path)

      # Clean and standardize each dataset
      lyrics_df = self._process_lyrics_data(lyrics_df)
      poem_train_df = self._process_poem_data(poem_train_df)
      poem_valid_df = self._process_poem_data(poem_valid_df)

      logger.info(f"Loaded {len(lyrics_df)} lyrics, {len(poem_train_df)} training poems, "
                  f"and {len(poem_valid_df)} validation poems")
      return lyrics_df, poem_train_df, poem_valid_df

    @staticmethod
    def _clean_text(text: str) -> str:
      """Clean and standardize text"""
      if pd.isna(text):
          return ""
      text = str(text)
      # Remove extra whitespace
      text = re.sub(r'\s+', ' ', text)
      # Remove special characters but keep basic punctuation
      text = re.sub(r'[^\w\s.,!?\'"-]', ' ', text)
      return text.strip()

    def _load_lyrics_data(self) -> pd.DataFrame:
      """Load and combine lyrics CSV files"""
      df_list = []
      for filename in os.listdir(self.lyrics_folder):
          if filename.endswith('.csv'):
              file_path = os.path.join(self.lyrics_folder, filename)
              df = pd.read_csv(file_path)
              df_list.append(df)
      return pd.concat(df_list, ignore_index=True)

    def _process_lyrics_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Process lyrics dataset following T5 approach"""
        # Handle NaN and filter as in T5
        df = df.dropna(subset=['Lyrics', 'Combined Annotations'])
        df = df[df['Lyrics'].apply(lambda x: isinstance(x, str))]
        df = df[df['Combined Annotations'].apply(lambda x: isinstance(x, str))]

        # Create processed dataframe with consistent column naming
        processed_df = pd.DataFrame({
            'text': df['Lyrics'].apply(self._clean_text),
            'summary': df['Combined Annotations'].apply(self._clean_text),
            'source': 'lyrics'
        })

        return processed_df

    def _process_poem_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Process poem dataset following T5 approach"""
        # Clean and filter
        df = df.dropna(subset=['ctext', 'text'])

        # Standardize structure to match lyrics format
        processed_df = pd.DataFrame({
            'text': df['ctext'].apply(self._clean_text),
            'summary': df['text'].apply(self._clean_text),
            'source': 'poem'
        })

        return processed_df

    def prepare_combined_dataset(
        self,
        lyrics_df: pd.DataFrame,
        poem_train_df: pd.DataFrame,
        poem_valid_df: pd.DataFrame,
        test_size: float = 0.2
    ) -> Dict[str, Dataset]:
        """Prepare combined dataset with explicit sampling and type handling"""
        logger.info("Preparing combined dataset...")

        try:
        # Create copies and convert to strings
            dfs = {
                'lyrics': lyrics_df.copy(),
                'poem_train': poem_train_df.copy(),
                'poem_valid': poem_valid_df.copy()
            }

            # Process each dataframe
            for df in dfs.values():
                for col in ['text', 'summary', 'source']:
                    if col in df.columns:
                        df[col] = df[col].astype('string')  # Use string dtype instead of object

            # Split lyrics data
            lyrics_train_size = int(len(dfs['lyrics']) * 0.8)
            indices = np.arange(len(dfs['lyrics']))
            np.random.shuffle(indices)

            train_indices = indices[:lyrics_train_size]
            val_indices = indices[lyrics_train_size:]

            # Create training and validation sets
            train_df = pd.concat([
                dfs['lyrics'].iloc[train_indices],
                dfs['poem_train']
            ]).reset_index(drop=True)

            val_df = pd.concat([
                dfs['lyrics'].iloc[val_indices],
                dfs['poem_valid']
            ]).reset_index(drop=True)

            # Convert to datasets without features specification
            train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
            val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

            # Apply tokenization
            tokenize_fn = self._tokenize_function
            train_dataset = train_dataset.map(
                tokenize_fn,
                batched=True,
                remove_columns=train_dataset.column_names,
                load_from_cache_file=False,
                desc="Tokenizing training data"
            )

            val_dataset = val_dataset.map(
                tokenize_fn,
                batched=True,
                remove_columns=val_dataset.column_names,
                load_from_cache_file=False,
                desc="Tokenizing validation data"
            )

            logger.info(f"Prepared {len(train_dataset)} training and {len(val_dataset)} validation examples")

            return {
                'train': train_dataset,
                'validation': val_dataset
            }


        except Exception as e:
          logger.error(f"Error in prepare_combined_dataset: {str(e)}")
          raise

    # modified for memory-efficient data processing
    def _tokenize_function(self, examples: Dict) -> Dict:
        """Memory-efficient tokenization"""
        try:
            source = examples.get("source", [])
            if isinstance(source, np.ndarray):
                source = source.tolist()

            if not isinstance(source, list):
                source = [source]

            # Process in smaller chunks
            chunk_size = 32  # Reduced from default
            texts = examples.get("text", [])
            summaries = examples.get("summary", [])

            all_inputs = []
            all_labels = []

            for i in range(0, len(texts), chunk_size):
                chunk_texts = texts[i:i + chunk_size]
                chunk_summaries = summaries[i:i + chunk_size]
                chunk_sources = source[i:i + chunk_size]

                prefixes = [
                    "summarize lyrics: " if str(s) == "lyrics" else "summarize poem: "
                    for s in chunk_sources
                ]

                tokenizer_inputs = [
                    f"{prefix}{str(text)}"
                    for prefix, text in zip(prefixes, chunk_texts)
                ]

                # Tokenize with memory-efficient settings - fixed this part
                inputs = self.tokenizer(
                    tokenizer_inputs,
                    max_length=512,
                    truncation=True,
                    padding='max_length',
                    return_tensors="pt"
                )

                labels = self.tokenizer(
                    [str(s) for s in chunk_summaries],
                    max_length=128,
                    truncation=True,
                    padding='max_length',
                    return_tensors="pt"
                )

                # Convert tensors to lists for storage
                input_dict = {
                    k: v.tolist() for k, v in inputs.items()
                }
                label_dict = {
                    k: v.tolist() for k, v in labels.items()
                }

                all_inputs.append(input_dict)
                all_labels.append(label_dict["input_ids"])

                # Clear memory
                del inputs, labels
                torch.cuda.empty_cache()

            # Combine all chunks
            final_inputs = {
                k: [item for d in all_inputs for item in d[k]]
                for k in all_inputs[0].keys()
            }
            final_inputs["labels"] = [item for sublist in all_labels for item in sublist]

            return final_inputs

        except Exception as e:
            logger.error(f"Error in tokenize_function: {e}")
            raise

# Pegasus Model Manager Class

In [None]:
class PegasusModelManager:
    def __init__(
        self,
        model_name: str = "google/pegasus-cnn_dailymail",
        output_dir: str = "models/pegasus",
        device: str = "cuda" if torch.cuda.is_available() else "cpu"
    ):
        """Initialize Pegasus model manager"""
        self.model_name = model_name
        self.output_dir = output_dir
        self.device = device
        self.model = None
        self.tokenizer = None
        logger.info(f"Initialized PegasusModelManager with {model_name}")

    # In setup_model method, add model configuration
    def setup_model(self) -> Tuple[PegasusForConditionalGeneration, PegasusTokenizer]:
        """Initialize model and tokenizer with memory optimizations"""
        logger.info(f"Loading model {self.model_name}...")

        # Configure model for memory efficiency
        model_config = PegasusConfig.from_pretrained(
            self.model_name,
            gradient_checkpointing=True,
            use_cache=False  # Disable caching during training
        )

        self.model = PegasusForConditionalGeneration.from_pretrained(
            self.model_name,
            config=model_config
        )
        self.tokenizer = PegasusTokenizer.from_pretrained(self.model_name)

        # Initialize weights that weren't loaded
        if hasattr(self.model, 'init_weights'):
          self.model.init_weights()

        if self.device == "cuda":
            self.model = self.model.to(self.device)

        logger.info("Model and tokenizer loaded successfully")
        return self.model, self.tokenizer

    def get_training_args(self) -> TrainingArguments:
        """Get ultra-conservative training arguments"""
        return TrainingArguments(
            output_dir=self.output_dir,
            evaluation_strategy="steps",
            eval_steps=2000,              # Further reduced evaluation frequency
            per_device_train_batch_size=1,  # Minimum batch size
            per_device_eval_batch_size=1,   # Minimum batch size
            gradient_accumulation_steps=16,  # Increased accumulation
            learning_rate=5e-5,
            num_train_epochs=3,
            warmup_steps=200,             # Reduced warmup
            weight_decay=0.01,
            logging_steps=500,            # Reduced logging frequency
            save_steps=4000,              # Reduced save frequency (round multiple of 2000)
            save_total_limit=1,           # Minimum saves
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            fp16=True,
            optim="adafactor",
            gradient_checkpointing=True,
            ddp_find_unused_parameters=False,
            dataloader_pin_memory=False,
            max_grad_norm=0.5,            # Added gradient clipping
            torch_compile=False,          # Disable torch compilation
            report_to="none"
        )


    def save_model_and_tokenizer(self) -> None:
        """Save model and tokenizer"""
        try:
            model_path = os.path.join(self.output_dir, "fine_tuned_pegasus")
            tokenizer_path = os.path.join(self.output_dir, "pegasus_tokenizer")

            logger.info(f"Saving model to {model_path}...")
            self.model.save_pretrained(model_path)

            logger.info(f"Saving tokenizer to {tokenizer_path}...")
            self.tokenizer.save_pretrained(tokenizer_path)

            logger.info("Model and tokenizer saved successfully!")
        except Exception as e:
            logger.error(f"Error saving model: {e}")
            raise

# Evaluator Class

In [None]:
class PegasusEvaluator:
    def __init__(
        self,
        model: PegasusForConditionalGeneration,
        tokenizer: PegasusTokenizer,
        device: str = "cuda" if torch.cuda.is_available() else "cpu"
    ):
        self.model = model.to(device)
        self.tokenizer = tokenizer
        self.device = device
        self.rouge_scorer = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'],
            use_stemmer=True
        )
        logger.info("Initialized PegasusEvaluator")

    def evaluate(
        self,
        test_data: pd.DataFrame,
        batch_size: int = 8
    ) -> Tuple[Dict[str, float], List[Dict]]:
        """Evaluate model on test data"""
        logger.info("Starting evaluation...")
        self.model.eval()
        evaluation_results = {
            'content_coverage': [],
            'semantic_similarity': [],
            'rouge1': [],
            'rouge2': [],
            'rougeL': [],
            'bert_scores': []
        }
        examples = []

        for idx in tqdm(range(0, len(test_data), batch_size), desc="Evaluating"):
            batch = test_data.iloc[idx:idx + batch_size]

            # Generate summaries
            inputs = self.tokenizer(
                batch['text'].tolist(),
                max_length=1024,
                truncation=True,
                padding=True,
                return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                generated_ids = self.model.generate(
                    input_ids=inputs['input_ids'],
                    attention_mask=inputs['attention_mask'],
                    max_length=256,
                    min_length=50,
                    num_beams=4,
                    length_penalty=1.0,
                    no_repeat_ngram_size=3
                )

            generated_summaries = self.tokenizer.batch_decode(
                generated_ids,
                skip_special_tokens=True
            )

            # Calculate metrics for each example
            for i, summary in enumerate(generated_summaries):
                text = batch['text'].iloc[i]
                reference = batch['summary'].iloc[i]
                metrics = self._calculate_metrics(text, summary, reference)

                for key, value in metrics.items():
                    evaluation_results[key].append(value)

                if len(examples) < 5:
                    examples.append({
                        'text': text,
                        'generated_summary': summary,
                        'reference_summary': reference,
                        'metrics': metrics
                    })

            torch.cuda.empty_cache()

        final_metrics = {
            f'avg_{key}': np.mean(values)
            for key, values in evaluation_results.items()
        }

        logger.info("Evaluation completed")
        return final_metrics, examples

    def _calculate_metrics(
        self,
        text: str,
        generated_summary: str,
        reference_summary: str
    ) -> Dict[str, float]:
        """Calculate all evaluation metrics"""
        # ROUGE scores
        rouge_scores = self.rouge_scorer.score(generated_summary, reference_summary)

        # BERTScore
        _, _, bert_f1 = score(
            [generated_summary],
            [reference_summary],
            lang='en',
            verbose=False
        )

        return {
            'rouge1': rouge_scores['rouge1'].fmeasure,
            'rouge2': rouge_scores['rouge2'].fmeasure,
            'rougeL': rouge_scores['rougeL'].fmeasure,
            'bert_scores': bert_f1.mean().item(),
            'content_coverage': self._calculate_content_coverage(text, generated_summary),
            'semantic_similarity': self._calculate_semantic_similarity(text, generated_summary)
        }

    @staticmethod
    def _calculate_content_coverage(text: str, summary: str) -> float:
        text_tokens = set(text.lower().split())
        summary_tokens = set(summary.lower().split())
        overlap = len(text_tokens.intersection(summary_tokens))
        return overlap / len(text_tokens) if text_tokens else 0.0

    @staticmethod
    def _calculate_semantic_similarity(text: str, summary: str) -> float:
        text_tokens = set(text.lower().split())
        summary_tokens = set(summary.lower().split())
        intersection = len(text_tokens.intersection(summary_tokens))
        union = len(text_tokens.union(summary_tokens))
        return intersection / union if union > 0 else 0.0

    def print_evaluation_results(
        self,
        metrics: Dict[str, float],
        examples: List[Dict],
        save_path: str = None
    ):
        """Print and optionally save evaluation results"""
        output = []
        output.append("\nEvaluation Results:")
        for metric, value in metrics.items():
            output.append(f"{metric}: {value:.3f}")

        output.append("\nExample Generations:")
        for i, example in enumerate(examples, 1):
            output.append(f"\nExample {i}:")
            output.append(f"Original Text (truncated): {example['text'][:200]}...")
            output.append(f"\nGenerated Summary: {example['generated_summary']}")
            output.append(f"\nReference Summary: {example['reference_summary']}")
            output.append("\nMetrics:")
            for metric, value in example['metrics'].items():
                output.append(f"{metric}: {value:.3f}")

        # Print results
        print('\n'.join(output))

        # Save results if path provided
        if save_path:
            with open(save_path, 'w') as f:
                f.write('\n'.join(output))
            logger.info(f"Results saved to {save_path}")

# Safe Trainer Class

In [None]:
# Add this class before your train_model function
class SafeTrainer(Trainer):
    """Memory-safe trainer implementation"""
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.memory_manager = MemoryManager()

    def training_step(self, *args, **kwargs):
        if not self.memory_manager.check_memory_status():
            self.memory_manager.clear_memory()
        return super().training_step(*args, **kwargs)

    def evaluation_step(self, *args, **kwargs):
        if not self.memory_manager.check_memory_status():
            self.memory_manager.clear_memory()
        return super().evaluation_step(*args, **kwargs)

# Training Functions

In [None]:
def train_model(
    model: PegasusForConditionalGeneration,
    tokenizer: PegasusTokenizer,
    datasets: Dict[str, Dataset],
    training_args: TrainingArguments,
    max_attempts: int = 3
) -> Tuple[Trainer, Dict[str, float]]:
    """Train model with memory safeguards"""
    logger.info("Setting up memory-safe training...")

    for attempt in range(max_attempts):
        try:
            # Clear memory before starting
            MemoryManager.clear_memory()

            # Setup data collator with memory limits
            data_collator = DataCollatorForSeq2Seq(
                tokenizer=tokenizer,
                model=model,
                padding=True,
                max_length=512
            )

            # Add compute_metrics function here, before trainer initialization
            def compute_metrics(eval_pred):
                predictions, labels = eval_pred
                decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
                labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
                decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

                rouge_scorer_obj = rouge_scorer.RougeScorer(
                    ['rouge1', 'rouge2', 'rougeL'],
                    use_stemmer=True
                )

                rouge_scores = {
                    'rouge1': [],
                    'rouge2': [],
                    'rougeL': []
                }

                for pred, label in zip(decoded_preds, decoded_labels):
                    scores = rouge_scorer_obj.score(pred, label)
                    rouge_scores['rouge1'].append(scores['rouge1'].fmeasure)
                    rouge_scores['rouge2'].append(scores['rouge2'].fmeasure)
                    rouge_scores['rougeL'].append(scores['rougeL'].fmeasure)

                return {
                    'rouge1': np.mean(rouge_scores['rouge1']),
                    'rouge2': np.mean(rouge_scores['rouge2']),
                    'rougeL': np.mean(rouge_scores['rougeL'])
                }

            # Initialize safe trainer
            trainer = SafeTrainer(
                model=model,
                args=training_args,
                train_dataset=datasets['train'],
                eval_dataset=datasets['validation'],
                tokenizer=tokenizer,
                data_collator=data_collator,
                compute_metrics=compute_metrics
            )

            # Train with memory monitoring
            logger.info("Starting training with memory monitoring...")
            trainer.train()

            # Final evaluation
            logger.info("Running final evaluation...")
            final_metrics = trainer.evaluate()

            return trainer, final_metrics

        except RuntimeError as e:
            if "out of memory" in str(e) and attempt < max_attempts - 1:
                logger.warning(f"OOM error in attempt {attempt + 1}, adjusting parameters...")

                # Don't let batch size go below 1
                new_batch_size = max(1, training_args.per_device_train_batch_size // 2)
                training_args.per_device_train_batch_size = new_batch_size
                training_args.per_device_eval_batch_size = new_batch_size

                # Increase gradient accumulation to compensate
                training_args.gradient_accumulation_steps *= 2

                # Additional memory optimization
                MemoryManager.clear_memory()
                if hasattr(model, 'config'):
                    model.config.use_cache = False
            else:
                raise

def run_training_pipeline():
    """Execute complete training and evaluation pipeline"""
    try:
        # 1. Initialize preprocessor and prepare data
        logger.info("Initializing preprocessor...")
        preprocessor = MultiSourceDataPreprocessor(
            lyrics_folder="drive/My Drive/266 Final Project/Cleaned Song Files",
            poem_train_path="drive/My Drive/266 Final Project/PoemSum Model/poemsum_train.csv",
            poem_valid_path="drive/My Drive/266 Final Project/PoemSum Model/poemsum_valid.csv"
        )

        # 2. Load and preprocess data
        lyrics_df, poem_train_df, poem_valid_df = preprocessor.load_all_data()
        datasets = preprocessor.prepare_combined_dataset(
            lyrics_df,
            poem_train_df,
            poem_valid_df
        )

        # 3. Initialize model manager and setup model
        logger.info("Setting up model...")
        model_manager = PegasusModelManager(
            output_dir="drive/My Drive/266 Final Project/Our Models/Pegasus"
        )
        model, tokenizer = model_manager.setup_model()

        # 4. Get training arguments
        training_args = model_manager.get_training_args()

        # 5. Train model
        trainer, training_metrics = train_model(
            model=model,
            tokenizer=tokenizer,
            datasets=datasets,
            training_args=training_args
        )

        # 6. Save model
        logger.info("Saving model...")
        model_manager.save_model_and_tokenizer()

        # 7. Set up evaluator and run evaluation
        logger.info("Running final evaluation...")
        evaluator = PegasusEvaluator(model, tokenizer)
        test_metrics, examples = evaluator.evaluate(
            test_data=lyrics_df.sample(n=50, random_state=42)
        )

        # 8. Save and print results
        results_path = os.path.join(model_manager.output_dir, "evaluation_results.txt")
        evaluator.print_evaluation_results(
            test_metrics,
            examples,
            save_path=results_path
        )

        return {
            'training_metrics': training_metrics,
            'test_metrics': test_metrics,
            'examples': examples
        }

    except Exception as e:
        logger.error(f"Error in training pipeline: {e}")
        raise
    finally:
        torch.cuda.empty_cache()

# Main Execution

In [None]:
if __name__ == "__main__":
    try:
        # Configure logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )

        # Set random seeds
        torch.manual_seed(42)
        np.random.seed(42)

        # Disable wandb
        os.environ["WANDB_DISABLED"] = "true"

        # Run complete pipeline
        logger.info("Starting training and evaluation pipeline...")
        results = run_training_pipeline()

        # Print final results
        logger.info("\nTraining and evaluation completed successfully!")
        logger.info("\nTraining Metrics:")
        pprint(results['training_metrics'])
        logger.info("\nTest Metrics:")
        pprint(results['test_metrics'])

    except Exception as e:
        logger.error(f"Error in main execution: {e}")
        raise

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Tokenizing training data:   0%|          | 0/4911 [00:00<?, ? examples/s]

Tokenizing validation data:   0%|          | 0/927 [00:00<?, ? examples/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Step,Training Loss,Validation Loss




  super().__init__(*args, **kwargs)


Step,Training Loss,Validation Loss




  super().__init__(*args, **kwargs)


Step,Training Loss,Validation Loss




ERROR:__main__:Error in training pipeline: CUDA out of memory. Tried to allocate 260.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 209.06 MiB is free. Process 137180 has 14.54 GiB memory in use. Of the allocated memory 14.32 GiB is allocated by PyTorch, and 93.78 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
ERROR:__main__:Error in main execution: CUDA out of memory. Tried to allocate 260.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 209.06 MiB is free. Process 137180 has 14.54 GiB memory in use. Of the allocated memory 14.32 GiB is allocated by PyTorch, and 93.78 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragment

OutOfMemoryError: CUDA out of memory. Tried to allocate 260.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 209.06 MiB is free. Process 137180 has 14.54 GiB memory in use. Of the allocated memory 14.32 GiB is allocated by PyTorch, and 93.78 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Old Code below

In [None]:
# 1. import dependencies
import pandas as pd
from datasets import Dataset
from transformers import PegasusTokenizer, PegasusForConditionalGeneration


In [None]:
# Train on all artists
# Initialize an empty list to store DataFrames
df_list = []

folder_path = "drive/My Drive/266 Final Project/Cleaned Song Files"
# Iterate through each file in the directory
for filename in os.listdir(folder_path):
  # Check if the file is a CSV file
  if filename.endswith('.csv'):
    # Construct the full file path
    file_path = os.path.join(folder_path, filename)
    # Read the CSV file and append it to the list
    df = pd.read_csv(file_path)
    df_list.append(df)

# Concatenate all DataFrames in the list into a single DataFrame
df = pd.concat(df_list, ignore_index=True)

In [None]:
# Add poem code
poem_train = pd.read_csv("/content/drive/My Drive/266 Final Project/PoemSum Model/poemsum_train.csv")
poem_train = poem_train.rename(columns={'text': 'Combined Annotations', 'ctext': 'Lyrics'})
poem_train.head()
poem_valid = pd.read_csv("/content/drive/My Drive/266 Final Project/PoemSum Model/poemsum_valid.csv")
poem_valid = poem_valid.rename(columns={'text': 'Combined Annotations', 'ctext': 'Lyrics'})
poem_valid.head()

Unnamed: 0,Title,Poet,Combined Annotations,Lyrics,Poem Link
0,"Dear John, Dear Coltrane by Michael S. Harper",Michael S. Harper,"'Dear John, Dear Coltrane' by Michael S. Harpe...","a love supreme, a love supreme\na love supreme...",https://www.poetryfoundation.org/poems/42827/d...
1,Parrot by Stevie Smith,Stevie Smith,‘Parrot‘ depicts the declining health of a won...,The old sick green parrot\nHigh in a dingy cag...,https://revise.wales/pastPapers/A-level/Englis...
2,Dust of Snow by Robert Frost,Robert Frost,"The simplicity, in the end, is the key element...",The way a crow\nShook down on me\nThe dust of ...,https://www.poetryfoundation.org/poems/44262/d...
3,Suburban Sonnet by Gwen Harwood,Gwen Harwood,'Suburban Sonnet' by Gwen Harwood is a poem ab...,"She practises a fugue, though it can matter\nt...",https://genius.com/Gwen-harwood-suburban-sonne...
4,Unending Love by Rabindranath Tagore,Rabindranath Tagore,'Unending Love' by Rabindranath Tagore is a he...,"I seem to have loved you in numberless forms, ...",https://allpoetry.com/Unending-Love


In [None]:
# Load and combine files
print("Initial DataFrame:")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns}")

# Step 1: Drop NaN values
df1 = df.dropna(subset=['Lyrics', 'Combined Annotations'])
print("\nAfter dropping NaN values:")
print(f"Shape: {df1.shape}")
print(f"Rows removed: {df.shape[0] - df1.shape[0]}")

# Step 2: Filter for string Lyrics
df2 = df1[df1['Lyrics'].apply(lambda x: isinstance(x, str))]
print("\nAfter filtering Lyrics for strings:")
print(f"Shape: {df2.shape}")
print(f"Rows removed: {df1.shape[0] - df2.shape[0]}")

# Print a sample of non-string Lyrics
non_string_lyrics = df1[~df1['Lyrics'].apply(lambda x: isinstance(x, str))]['Lyrics'].head()
print(f"Sample of non-string Lyrics:\n{non_string_lyrics}")

# Step 3: Filter for string Combined Annotations
df3 = df2[df2['Combined Annotations'].apply(lambda x: isinstance(x, str))]
print("\nAfter filtering Combined Annotations for strings:")
print(f"Shape: {df3.shape}")
print(f"Rows removed: {df2.shape[0] - df3.shape[0]}")

# Convert Song ID to string
df3['Song ID'] = df3['Song ID'].astype(str)

Initial DataFrame:
Shape: (3187, 7)
Columns: Index(['Song ID', 'Title', 'Lyrics URL', 'Combined Annotations',
       'Wikipedia Annotation', 'Lyrics', 'generated_annotation'],
      dtype='object')

After dropping NaN values:
Shape: (3129, 7)
Rows removed: 58

After filtering Lyrics for strings:
Shape: (3129, 7)
Rows removed: 0
Sample of non-string Lyrics:
Series([], Name: Lyrics, dtype: object)

After filtering Combined Annotations for strings:
Shape: (3129, 7)
Rows removed: 0


In [None]:
# 2. Load the tokenizer and model
model_name = "google/pegasus-cnn_dailymail"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [None]:
# 3. Tokenization function
def tokenize_function(examples):
  # Tokenize input (documents) and target (summaries)
  inputs = tokenizer(examples["Lyrics"], max_length=1024, truncation=True, padding="max_length")
  labels = tokenizer(examples["Combined Annotations"], max_length=128, truncation=True, padding="max_length")
  inputs["labels"] = labels["input_ids"]
  return inputs

In [None]:
# 4. Create and Tokenize Dataset
try:
    # Create dataset
    all_data = pd.concat([df3, poem_train, poem_valid])
    dataset = Dataset.from_pandas(df3, preserve_index=False)
    print("\nDataset created successfully")
    train_test_split = dataset.train_test_split(test_size=0.8)  # 80% training, 20% evaluation
    train_dataset = train_test_split['train']
    eval_dataset = train_test_split['test']
    print("\nDataset split into training and evaluation sets")
    train_tokenized_datasets = train_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=['Song ID', 'Title', 'Lyrics URL', 'Combined Annotations',
                       'Wikipedia Annotation', 'Lyrics']
    )
    eval_tokenized_datasets = eval_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=['Song ID', 'Title', 'Lyrics URL', 'Combined Annotations',
                       'Wikipedia Annotation', 'Lyrics']
    )

except Exception as e:
    print(f"\nError during dataset creation/tokenization: {str(e)}")
    if 'dataset' in locals():
        print("\nDataset info:")
        print(dataset.info())


Dataset created successfully

Dataset split into training and evaluation sets


Map:   0%|          | 0/625 [00:00<?, ? examples/s]

Map:   0%|          | 0/2504 [00:00<?, ? examples/s]

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model="google/pegasus-xsum")
os.environ["WANDB_DISABLED"] = "true"

In [None]:
logging.set_verbosity_debug()


In [None]:
# This was to make the model run successfully
train_dataset = train_dataset.select(range(2))
# eval_dataset = eval_dataset.select(range(2))
model.gradient_checkpointing_enable()

In [None]:
# 5. Define training arguments
training_args = TrainingArguments(
    output_dir="drive/My Drive/266 Final Project/Our Models/Pegasus",       # Directory to save the model checkpoints
    evaluation_strategy="no",
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch size
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=2,
    report_to="none"  # Disable WandB and other integrations
)
print("Passed training")
# 6. Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=None  # Skip heavy metrics computation
)

# 7. Train the model
trainer.train()

print("Passed Training")

# 8. Save the fine-tuned model
model.save_pretrained("drive/My Drive/266 Final Project/Our Models/Pegasus/fine_tuned_pegasus")
tokenizer.save_pretrained("drive/My Drive/266 Final Project/Our Models/Pegasus/pegasus_tokenizer")

PyTorch: setting up devices


Passed training


  trainer = Trainer(
Currently training with a batch size of: 1
The following columns in the training set don't have a corresponding argument in `PegasusForConditionalGeneration.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `PegasusForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 1
  Number of trainable parameters = 568,699,904


Step,Training Loss


Saving model checkpoint to drive/My Drive/266 Final Project/Our Models/Pegasus/checkpoint-1
Configuration saved in drive/My Drive/266 Final Project/Our Models/Pegasus/checkpoint-1/config.json
Configuration saved in drive/My Drive/266 Final Project/Our Models/Pegasus/checkpoint-1/generation_config.json
Model weights saved in drive/My Drive/266 Final Project/Our Models/Pegasus/checkpoint-1/model.safetensors
tokenizer config file saved in drive/My Drive/266 Final Project/Our Models/Pegasus/checkpoint-1/tokenizer_config.json
Special tokens file saved in drive/My Drive/266 Final Project/Our Models/Pegasus/checkpoint-1/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




Passed Training


Configuration saved in drive/My Drive/266 Final Project/Our Models/Pegasus/fine_tuned_pegasus/config.json
Configuration saved in drive/My Drive/266 Final Project/Our Models/Pegasus/fine_tuned_pegasus/generation_config.json
Model weights saved in drive/My Drive/266 Final Project/Our Models/Pegasus/fine_tuned_pegasus/model.safetensors
tokenizer config file saved in drive/My Drive/266 Final Project/Our Models/Pegasus/pegasus_tokenizer/tokenizer_config.json
Special tokens file saved in drive/My Drive/266 Final Project/Our Models/Pegasus/pegasus_tokenizer/special_tokens_map.json


('drive/My Drive/266 Final Project/Our Models/Pegasus/pegasus_tokenizer/tokenizer_config.json',
 'drive/My Drive/266 Final Project/Our Models/Pegasus/pegasus_tokenizer/special_tokens_map.json',
 'drive/My Drive/266 Final Project/Our Models/Pegasus/pegasus_tokenizer/spiece.model',
 'drive/My Drive/266 Final Project/Our Models/Pegasus/pegasus_tokenizer/added_tokens.json')

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Specify the paths to your saved model and tokenizer
model_path = "drive/My Drive/266 Final Project/Our Models/Pegasus/checkpoint-918"
tokenizer_path = "drive/My Drive/266 Final Project/Our Models/Pegasus/checkpoint-918"

# Load the fine-tuned model
model = PegasusForConditionalGeneration.from_pretrained(model_path, local_files_only=True)

# Load the tokenizer
tokenizer = PegasusTokenizer.from_pretrained(tokenizer_path)

# Ensure the model is set to evaluation mode
model.eval()

# Print confirmation
print("Model and tokenizer loaded successfully!")

Model and tokenizer loaded successfully!


In [None]:
# Install required package
!pip install bert-score
!pip install rouge-score # rouge-score is the correct package name, not rouge_score.

from bert_score import score
import torch
from sklearn.model_selection import train_test_split
from typing import Dict, List, Tuple
from tqdm import tqdm
import torch
import numpy as np
from rouge_score import rouge_scorer
from transformers import PegasusForConditionalGeneration, PegasusTokenizer



# Evaluation Attempt 2

In [None]:
# Load saved model

model_path = "/content/drive/My Drive/266 Final Project/Our Models/Pegasus [OLD]/fine_tuned_pegasus"
tokenizer_path = "/content/drive/My Drive/266 Final Project/Our Models/Pegasus [OLD]/pegasus_tokenizer"
tokenizer = PegasusTokenizer.from_pretrained(tokenizer_path)
model = PegasusForConditionalGeneration.from_pretrained(model_path)

print("Model and tokenizer loaded successfully!")

# We already have our test set from data preparation
print(f"Test set size: {len(eval_dataset)}")
print(list(eval_dataset.features))

Model and tokenizer loaded successfully!
Test set size: 2504
['input_ids', 'attention_mask', 'labels']


In [None]:
from typing import Dict, List, Tuple
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from bert_score import score
from rouge_score import rouge_scorer
from torch.utils.data import Dataset

def evaluate_pegasus_model(
    model: PegasusForConditionalGeneration,
    tokenizer: PegasusTokenizer,
    test_data: pd.DataFrame,
    batch_size: int = 8  # Smaller batch size for Pegasus due to its size
) -> Tuple[Dict[str, float], List[Dict]]:
    """
    Evaluate Pegasus lyrics model comparing against reference annotations
    """
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    evaluation_results = {
        'content_coverage': [],
        'semantic_similarity': [],
        'rouge1_scores': [],
        'rouge2_scores': [],
        'rougeL_scores': [],
        'bert_scores': []
    }

    examples = []
    previous_bert_score = 0.0

    for idx in tqdm(range(0, len(test_data), batch_size)):
        batch_lyrics = test_data['Lyrics'].iloc[idx:idx + batch_size].tolist()
        batch_annotations = test_data['Combined Annotations'].iloc[idx:idx + batch_size].tolist()

        # Pegasus-specific encoding
        inputs = tokenizer(
            batch_lyrics,
            padding=True,
            truncation=True,
            max_length=1024,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=128,
                min_length=30,
                num_beams=4,
                length_penalty=0.8,
                no_repeat_ngram_size=3,
                early_stopping=True
            )

            generated_summaries = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # Evaluate each summary
        for i in range(len(generated_summaries)):
            original_lyric = batch_lyrics[i]
            generated_summary = generated_summaries[i]
            reference_annotation = batch_annotations[i]

            # Calculate metrics
            coverage_score = calculate_content_coverage(original_lyric, generated_summary)
            semantic_score = calculate_semantic_similarity(original_lyric, generated_summary)
            rouge_scores = calculate_rouge_scores([generated_summary, reference_annotation])

            # Store scores
            evaluation_results['content_coverage'].append(coverage_score)
            evaluation_results['semantic_similarity'].append(semantic_score)
            evaluation_results['rouge1_scores'].append(rouge_scores['rouge1'])
            evaluation_results['rouge2_scores'].append(rouge_scores['rouge2'])
            evaluation_results['rougeL_scores'].append(rouge_scores['rougeL'])

            # Calculate BERTScore less frequently
            if i % 8 == 0:
                P, R, F1 = score([generated_summary], [reference_annotation], lang='en', verbose=False)
                previous_bert_score = F1.mean().item()
            evaluation_results['bert_scores'].append(previous_bert_score)

            # Store example generations
            if len(examples) < 5:
                examples.append({
                    'lyrics': original_lyric,
                    'reference_annotation': reference_annotation,
                    'generated_summary': generated_summary,
                    'metrics': {
                        'content_coverage': coverage_score,
                        'semantic_similarity': semantic_score,
                        'rouge1': rouge_scores['rouge1'],
                        'rouge2': rouge_scores['rouge2'],
                        'rougeL': rouge_scores['rougeL'],
                        'bert_score': previous_bert_score
                    }
                })

        # Memory management
        if idx % 4 == 0:  # More frequent cleanup for Pegasus
            torch.cuda.empty_cache()

    # Calculate average metrics
    metrics = {
        'avg_content_coverage': np.mean(evaluation_results['content_coverage']),
        'avg_semantic_similarity': np.mean(evaluation_results['semantic_similarity']),
        'avg_rouge1': np.mean(evaluation_results['rouge1_scores']),
        'avg_rouge2': np.mean(evaluation_results['rouge2_scores']),
        'avg_rougeL': np.mean(evaluation_results['rougeL_scores']),
        'avg_bert_score': np.mean(evaluation_results['bert_scores'])
    }

    return metrics, examples

def run_pegasus_evaluation(model_path: str, test_df: pd.DataFrame):
    """Run complete Pegasus evaluation pipeline"""
    tokenizer = PegasusTokenizer.from_pretrained(model_path)
    model = PegasusForConditionalGeneration.from_pretrained(model_path)
    print("Pegasus model and tokenizer loaded successfully!")

    metrics, examples = evaluate_pegasus_model(
        model,
        tokenizer,
        test_data=test_df,
        batch_size=8
    )

    print_evaluation_results(metrics, examples)
    return metrics, examples

# You can reuse these helper functions from the BART evaluation
def calculate_rouge_scores(texts: List[str]) -> Dict[str, float]:
    """Calculate ROUGE scores between texts"""
    rouge_scorer_obj = rouge_scorer.RougeScorer(
        ['rouge1', 'rouge2', 'rougeL'],
        use_stemmer=True
    )
    score = rouge_scorer_obj.score(texts[0], texts[1])
    return {
        'rouge1': score['rouge1'].fmeasure,
        'rouge2': score['rouge2'].fmeasure,
        'rougeL': score['rougeL'].fmeasure
    }

def calculate_content_coverage(lyrics: str, summary: str) -> float:
    """Calculate content coverage between lyrics and summary"""
    if isinstance(lyrics, float) or isinstance(summary, float):
        return 0.0

    try:
        lyrics_tokens = set(str(lyrics).lower().split())
        summary_tokens = set(str(summary).lower().split())
        overlap = len(lyrics_tokens.intersection(summary_tokens))
        coverage = overlap / len(lyrics_tokens) if lyrics_tokens else 0.0
        return coverage
    except Exception as e:
        print(f"Error processing lyrics/summary: {e}")
        return 0.0

def calculate_semantic_similarity(lyrics: str, summary: str) -> float:
    """Calculate semantic similarity using token overlap"""
    if isinstance(lyrics, float) or isinstance(summary, float):
        return 0.0

    try:
        lyrics_tokens = set(str(lyrics).lower().split())
        summary_tokens = set(str(summary).lower().split())
        intersection = len(lyrics_tokens.intersection(summary_tokens))
        union = len(lyrics_tokens.union(summary_tokens))
        return intersection / union if union > 0 else 0.0
    except Exception as e:
        print(f"Error processing lyrics/summary: {e}")
        return 0.0

def print_evaluation_results(metrics: Dict[str, float], examples: List[Dict]):
    """Print evaluation results and examples"""
    print("\nPegasus Model Evaluation Results:")
    print(f"Average Content Coverage: {metrics['avg_content_coverage']:.3f}")
    print(f"Average Semantic Similarity: {metrics['avg_semantic_similarity']:.3f}")
    print(f"Average ROUGE-1: {metrics['avg_rouge1']:.3f}")
    print(f"Average ROUGE-2: {metrics['avg_rouge2']:.3f}")
    print(f"Average ROUGE-L: {metrics['avg_rougeL']:.3f}")
    print(f"Average BERTScore: {metrics['avg_bert_score']:.3f}")

    print("\nExample Generations:")
    for i, example in enumerate(examples, 1):
        print(f"\nExample {i}:")
        print(f"Original Lyrics (truncated): {example['lyrics'][:200]}...")
        print(f"\nReference Annotation: {example['reference_annotation']}")
        print(f"\nGenerated Summary: {example['generated_summary']}")
        print("\nMetrics:")
        for metric, value in example['metrics'].items():
            print(f"{metric}: {value:.3f}")

In [None]:
print(eval_dataset.to_pandas().head())

   Song ID                     Title  \
0  3424957            Scorpio Rising   
1  2834074  Ladies Don’t Play Guitar   
2   329328      My Name Is Not Susan   
3    94551       Casimir Pulaski Day   
4  4752818   California Gurls (Demo)   

                                          Lyrics URL  \
0  https://genius.com/Soccer-mommy-scorpio-rising...   
1  https://genius.com/Tennis-ladies-dont-play-gui...   
2  https://genius.com/Whitney-houston-my-name-is-...   
3  https://genius.com/Sufjan-stevens-casimir-pula...   
4  https://genius.com/Katy-perry-california-gurls...   

                                Combined Annotations  \
0  The eighth track off Soccer Mommy’s   (2018), ...   
1                                                  ?   
2  “My Name Is Not Susan” was the fourth single f...   
3  This song details the events that surround the...   
4  “California Gurls (Demo)” is the demo version ...   

                                Wikipedia Annotation  \
0  No Wikipedia annotation fo

In [None]:
# accept a loaded model
def run_pegasus_evaluation(model, tokenizer, test_df: pd.DataFrame):
    """Run complete Pegasus evaluation pipeline"""
    print("Using provided Pegasus model and tokenizer")

    metrics, examples = evaluate_pegasus_model(
        model,
        tokenizer,
        test_data=test_df,
        batch_size=8
    )

    print_evaluation_results(metrics, examples)
    return metrics, examples

test_df = eval_dataset.to_pandas()

# run the evaluation
metrics, examples = run_pegasus_evaluation(
    model,
    tokenizer,
    test_df
)

Using provided Pegasus model and tokenizer


  0%|          | 0/313 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 1/313 [00:37<3:16:02, 37.70s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  1%|          | 2/313 [00:45<1:43:25, 19.95s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  1%|          | 3/313 [00


Pegasus Model Evaluation Results:
Average Content Coverage: 0.200
Average Semantic Similarity: 0.171
Average ROUGE-1: 0.192
Average ROUGE-2: 0.041
Average ROUGE-L: 0.124
Average BERTScore: 0.826

Example Generations:

Example 1:
Original Lyrics (truncated): Kiss you in the park
We'll meet up after dark
In your car with the backseat southern summer
Ignored all the missed calls from your mother
And kiss me in the park
We'll meet up after dark

And we'...

Reference Annotation: The eighth track off Soccer Mommy’s   (2018), “Scorpio Rising” serves as a climactic realization that the singer will lose her love to another, and it will be her fault.   described it as a song that begins “deceptively basic… [until] it starts to unfurl into this story about relinquishing a love you want badly but know you cannot maintain” and pointed out that it is “resigned and sad, yes, but it isn’t bitter.”

Generated Summary: Kiss You In The Park is a song about a young girl who is trying to find her wa


