# Read in Project Dependencies

In [None]:
# import needed dependencies for testing PoemSum model
!pip install pytorch-lightning transformers torch

In [None]:
# Import needed dependencies while avoiding conflicts
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer,
    AdamW
)
import re
import os
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import textwrap
def print_summary(text, width=70):
    """Print ``text`` re-flowed so that no output line is wider than ``width``."""
    wrapped_lines = textwrap.wrap(text, width=width)
    print("\n".join(wrapped_lines))


# Class Modules

In [None]:
# Custom Dataset class from PoemSum model
class LyricsSummaryDataset(Dataset):
    """Tokenise the ``text`` / ``summary`` columns of a DataFrame for seq2seq training.

    Each item is a dict holding the raw strings plus fixed-length tensors:
    ``text_input_ids`` / ``text_attention_mask`` for the encoder input and
    ``labels`` / ``labels_attention_mask`` for the target summary.  Padding
    positions in ``labels`` are replaced by ``-100``.

    Args:
        data: DataFrame with ``"text"`` and ``"summary"`` columns.
        tokenizer: Tokenizer used for both columns.
        text_max_token_len: Length every input is padded/truncated to.
        summary_max_token_len: Length every summary is padded/truncated to.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: T5Tokenizer,
        text_max_token_len: int = 1024,  # CM reduced from 2000
        # Was left at 10000, which pads every summary to 10000 tokens;
        # 256 matches LyricsSummaryDataModule's default.
        summary_max_token_len: int = 256
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def __len__(self):
        return len(self.data)

    def _encode(self, text, max_length: int):
        """Tokenise one string to exactly ``max_length`` tokens (pad + truncate)."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]

        text_encoding = self._encode(data_row["text"], self.text_max_token_len)
        summary_encoding = self._encode(data_row["summary"], self.summary_max_token_len)

        # Use the tokenizer's own pad id instead of assuming it is 0.
        pad_token_id = self.tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = 0

        # Work on a copy so the tokenizer output is not modified in place.
        labels = summary_encoding["input_ids"].clone()
        labels[labels == pad_token_id] = -100

        return dict(
            text=data_row["text"],
            summary=data_row["summary"],
            text_input_ids=text_encoding["input_ids"].flatten(),
            text_attention_mask=text_encoding["attention_mask"].flatten(),
            labels=labels.flatten(),
            labels_attention_mask=summary_encoding["attention_mask"].flatten()
        )

# Lightning Data Module from Poem Sum
class LyricsSummaryDataModule(pl.LightningDataModule):
    """Lightning data module serving the training and validation DataFrames.

    ``setup`` wraps both frames in ``LyricsSummaryDataset``; the two loader
    methods return ``DataLoader`` objects sharing the same batch size, worker
    count and ``pin_memory=True``.  Only the training loader shuffles.
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 8,
        text_max_token_len: int = 1024,  # CM increased from 512 to 1024
        summary_max_token_len: int = 256,
        num_workers: int = 2  # CM added parameter for multi-processing
    ):
        super().__init__()
        # Data sources.
        self.train_df, self.val_df = train_df, val_df
        # Tokenisation settings forwarded to every dataset.
        self.tokenizer = tokenizer
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len
        # Loader settings.
        self.batch_size = batch_size
        self.num_workers = num_workers

    def _build_dataset(self, frame):
        """Wrap one DataFrame in a dataset using this module's token limits."""
        return LyricsSummaryDataset(
            frame,
            self.tokenizer,
            self.text_max_token_len,
            self.summary_max_token_len
        )

    def _build_loader(self, dataset, shuffle):
        """Create a DataLoader with the shared batch/worker/pinning settings."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
            pin_memory=True  # CM added for GPU efficiency
        )

    def setup(self, stage=None):
        self.train_dataset = self._build_dataset(self.train_df)
        self.val_dataset = self._build_dataset(self.val_df)

    def train_dataloader(self):
        return self._build_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        return self._build_loader(self.val_dataset, shuffle=False)

# Model Class
class LyricsSummaryModel(pl.LightningModule):
    """LightningModule wrapping a pretrained ``T5ForConditionalGeneration``.

    Args:
        model_name: Name or path passed to ``from_pretrained``.
        learning_rate: Learning rate handed to the AdamW optimizer.
    """

    def __init__(self, model_name='t5-small', learning_rate=0.00001):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(model_name, return_dict=True)
        self.learning_rate = learning_rate

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``."""
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )
        return output.loss, output.logits

    def _shared_step(self, batch, metric_name):
        """Compute the loss for one batch and log it under ``metric_name``."""
        loss, _ = self(
            batch["text_input_ids"],
            batch["text_attention_mask"],
            batch["labels_attention_mask"],
            batch["labels"]
        )
        self.log(metric_name, loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        return self._shared_step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        return self._shared_step(batch, "val_loss")

    def configure_optimizers(self):
        # torch.optim.AdamW replaces the deprecated transformers.AdamW class.
        # eps and weight_decay are set explicitly to keep the values the
        # transformers implementation used by default (1e-6 and 0.0).
        return torch.optim.AdamW(
            self.parameters(),
            lr=self.learning_rate,
            eps=1e-6,
            weight_decay=0.0
        )

# Data Preparation

The data-preparation approach is modified from our previous models because the poem data has a different structure.

* We maintain the original PoemSum splits (train/valid) to preserve their intended use
* Lyrics are split first before combining with poem data
* Clear labeling of input type ("lyrics" vs "poem") in the prompts
* Both lyrics and poem data are formatted consistently for the model
* Test set contains only lyrics for evaluating transfer of interpretive ability

* Added clear filtering for both lyrics and poem data
* Added data quality verification steps
* Improved status reporting with breakdowns by data type
* Added final verification check
* Maintained separate clean test set for evaluation

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load lyrics data
df_list = []
folder_path = "/content/drive/My Drive/266 Final Project/Cleaned Song Files"
# os.listdir returns entries in arbitrary order. Sorting keeps the row order of
# lyrics_df stable, so the seeded train/val/test splits are reproducible.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)
        df_list.append(df)

# Fail with a clear message instead of pd.concat's "No objects to concatenate".
if not df_list:
    raise FileNotFoundError(f"No .csv files found in {folder_path}")

# Concatenate lyrics data
lyrics_df = pd.concat(df_list, ignore_index=True)

In [None]:
# Load poem data keeping original splits for proper training
poemsum_dir = "/content/drive/My Drive/266 Final Project/PoemSum Model"
poem_train = pd.read_csv(f"{poemsum_dir}/poemsum_train.csv")
poem_valid = pd.read_csv(f"{poemsum_dir}/poemsum_valid.csv")
poem_test = pd.read_csv(f"{poemsum_dir}/poemsum_test.csv")

# Print initial dataset sizes
for size_label, frame in (
    ("Lyrics dataset", lyrics_df),
    ("Poem train", poem_train),
    ("Poem validation", poem_valid),
    ("Poem test", poem_test),
):
    print(f"{size_label} size: {len(frame)}")

In [None]:
# Basic data cleaning
def clean_text(text):
    """Normalise one raw text cell to a single-line string.

    Values that ``pd.isna`` reports as missing become ``""``.  Anything else is
    converted with ``str``, every run of whitespace is collapsed to a single
    space, and leading/trailing whitespace is removed.
    """
    if not pd.isna(text):
        collapsed = re.sub(r'\s+', ' ', str(text))
        return collapsed.strip()
    return ""


In [None]:
# Split lyrics data before combining with poems
# First hold out 20% of the songs as the test set, then carve 10% of the
# remainder off as the validation set (same seed for both splits).
train_val_lyrics, test_lyrics = train_test_split(lyrics_df, test_size=0.2, random_state=42)
train_lyrics, val_lyrics = train_test_split(train_val_lyrics, test_size=0.1, random_state=42)

In [None]:
# Clean and filter training data
# Lyrics are used as their own training target, so blank lyrics would become
# empty summaries. Drop missing AND whitespace-only lyrics, as is done for poems.
train_lyrics_has_text = (
    train_lyrics['Lyrics'].notna()
    & (train_lyrics['Lyrics'].astype(str).str.strip() != '')
)
train_lyrics_filtered = train_lyrics[train_lyrics_has_text]
poem_train_filtered = poem_train[
    (poem_train['ctext'].notna()) &
    (poem_train['ctext'].astype(str).str.strip() != '') &
    (poem_train['text'].notna()) &
    (poem_train['text'].str.strip() != '')
]

In [None]:
# Format training data with filtered data
train_lyric_texts = [clean_text(lyric) for lyric in train_lyrics_filtered['Lyrics']]
train_poem_texts = [clean_text(poem) for poem in poem_train_filtered['ctext']]
train_poem_summaries = [clean_text(summary) for summary in poem_train_filtered['text']]

train_data = pd.DataFrame({
    # Prompted model inputs: lyrics rows first, then poem rows.
    'text': (
        [f"summarize lyrics and capture meaning: {lyric}" for lyric in train_lyric_texts]
        + [f"summarize poem and capture meaning: {poem}" for poem in train_poem_texts]
    ),
    # Targets in the same row order: lyrics are self-supervised (the target is
    # the cleaned lyrics themselves), poems use their reference summaries.
    'summary': train_lyric_texts + train_poem_summaries,
})

In [None]:
# Clean and filter validation data
# Drop missing AND whitespace-only lyrics / poem bodies so that no validation
# row ends up with an empty input or an empty target.
val_lyrics_has_text = (
    val_lyrics['Lyrics'].notna()
    & (val_lyrics['Lyrics'].astype(str).str.strip() != '')
)
val_lyrics_filtered = val_lyrics[val_lyrics_has_text]
poem_valid_filtered = poem_valid[
    (poem_valid['ctext'].notna()) &
    (poem_valid['ctext'].astype(str).str.strip() != '') &
    (poem_valid['text'].notna()) &
    (poem_valid['text'].str.strip() != '')
]

In [None]:
# Format validation data
val_lyric_texts = [clean_text(lyric) for lyric in val_lyrics_filtered['Lyrics']]
val_poem_texts = [clean_text(poem) for poem in poem_valid_filtered['ctext']]
val_poem_summaries = [clean_text(summary) for summary in poem_valid_filtered['text']]

val_data = pd.DataFrame({
    # Prompted model inputs: lyrics rows first, then poem rows.
    'text': (
        [f"summarize lyrics and capture meaning: {lyric}" for lyric in val_lyric_texts]
        + [f"summarize poem and capture meaning: {poem}" for poem in val_poem_texts]
    ),
    # Targets in the same row order as the inputs above.
    'summary': val_lyric_texts + val_poem_summaries,
})

In [None]:
# Keep test_lyrics separate for final evaluation
test_lyrics = test_lyrics[test_lyrics['Lyrics'].notna()].copy()
test_lyrics['Lyrics'] = test_lyrics['Lyrics'].apply(clean_text)
# Rows whose lyrics are empty after whitespace normalisation give the model
# nothing to summarise, so drop them from the evaluation set as well.
test_lyrics = test_lyrics[test_lyrics['Lyrics'] != ''].copy()

In [None]:
# Print final dataset statistics with more detail
print("\nProcessed Dataset Statistics:")
print(f"Training set size: {len(train_data)}")
print(f" - Lyrics: {len(train_lyrics_filtered)}")
print(f" - Poems: {len(poem_train_filtered)}")
print(f"Validation set size: {len(val_data)}")
print(f" - Lyrics: {len(val_lyrics_filtered)}")
print(f" - Poems: {len(poem_valid_filtered)}")
print(f"Test set size: {len(test_lyrics)}")

mean_text_chars = train_data['text'].str.len().mean()
mean_summary_chars = train_data['summary'].str.len().mean()
print(f"\nAverage lengths:")
print(f"Training text inputs: {mean_text_chars:.1f} characters")
print(f"Training summaries: {mean_summary_chars:.1f} characters")

# Count the empty strings once and reuse the counts for the report and the check.
empty_text_count = int((train_data['text'] == '').sum())
empty_summary_count = int((train_data['summary'] == '').sum())
print("\nQuality checks:")
print(f"Empty entries in training text: {empty_text_count}")
print(f"Empty entries in training summaries: {empty_summary_count}")

# Final verification of data quality
if empty_text_count == 0 and empty_summary_count == 0:
    print("\nData quality check passed: No empty entries in training data.")
else:
    print("\nWarning: Empty entries found in training data after filtering!")

# Create and Train T5 model

* Removed data processing steps since they're now handled in data preparation
* Accepts pre-processed train_data and val_data directly
* Increased batch size and epochs for better training
* Updated save path to commit model to drive
* Added error handling for save operations

In [None]:
def create_baseline_model(
    train_data,
    val_data,
    save_dir="checkpoints",
    drive_path='/content/drive/My Drive/266 Final Project/Our Models/Lyrics + Poem Data'
):
    """Fine-tune t5-small on the prepared data and save the result.

    Args:
        train_data: DataFrame with ``text`` and ``summary`` columns used for training.
        val_data: DataFrame with the same columns used for validation.
        save_dir: Root directory handed to the Lightning trainer.
        drive_path: Directory the fine-tuned model and tokenizer are written to.

    Returns:
        Tuple ``(model, tokenizer, trainer)``.  Saving is best-effort: a failure
        is reported on stdout and the trained objects are still returned.
    """
    # 1. Initialize model and tokenizer
    print("Initializing model and tokenizer...")
    MODEL_NAME = 't5-small'
    tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
    model = LyricsSummaryModel(MODEL_NAME)

    # 2. Set up data module with pre-processed data
    print("Setting up data module...")
    data_module = LyricsSummaryDataModule(
        train_df=train_data,
        val_df=val_data,
        tokenizer=tokenizer,
        batch_size=4,  # CM increased from 2
        text_max_token_len=1024,
        summary_max_token_len=256
    )

    # 3. Set up trainer
    print("Configuring trainer...")
    trainer = pl.Trainer(
        max_epochs=3,  # Increased from 1
        accumulate_grad_batches=4,  # Increased from 2
        gradient_clip_val=1.0,
        precision=16 if torch.cuda.is_available() else 32,
        enable_checkpointing=True,
        default_root_dir=save_dir
    )

    # 4. Train model
    print("Starting training...")
    trainer.fit(model, data_module)

    # 5. Save model and tokenizer
    print("Saving model and tokenizer...")
    try:
        # Directory creation is inside the try so that an unavailable Drive
        # mount cannot raise after training and lose the trained objects.
        os.makedirs(drive_path, exist_ok=True)
        model.model.save_pretrained(drive_path)
        tokenizer.save_pretrained(drive_path)
        print(f"Model and tokenizer successfully saved to {drive_path}")
    except Exception as e:
        print(f"Failed to save model and tokenizer: {e}")

    return model, tokenizer, trainer

In [None]:
# Train model using pre-processed data
model, tokenizer, trainer = create_baseline_model(train_data, val_data)

# (SKIP) Generate Song Summary

In [None]:
# Function to generate a summary for a single song
def generate_song_summary(model, tokenizer, data, song_index, max_length=150):
    """Generate a summary for a single song.

    Args:
        model: Wrapper object whose ``.model`` attribute is the generation model.
        tokenizer: Tokenizer matching that model.
        data: DataFrame with ``Lyrics`` and ``generated_annotation`` columns.
        song_index: Positional row index of the song in ``data``.
        max_length: Maximum number of tokens to generate.  The minimum length
            stays at 100 tokens, lowered to ``max_length`` when that is smaller.

    Returns:
        The decoded summary string.
    """
    song = data.iloc[song_index]

    # Prepare input text
    # input_text = f"summarize lyrics and capture meaning: {data.iloc[song_index]['Lyrics']} Title: {data.iloc[song_index]['Title']}"

    # Create the input text with both lyrics and annotation
    input_text = (
        f"summarize the meaning of the song lyrics: {song['Lyrics']} \n"
        f"context: incorporate relevant details from: {song['generated_annotation']}" # context should not have annotation input, start with lyrics only, context should be wiki data
    )

    # Encode the text.  A single sequence needs no padding, so it is only
    # truncated; the attention mask is kept and passed on to generate().
    encoded = tokenizer(
        input_text,
        max_length=5000,
        truncation=True,
        return_tensors="pt"
    )

    # Run on whichever device the model's weights currently live on.
    device = next(model.model.parameters()).device
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # Inference only: disable dropout and gradient tracking.
    model.model.eval()
    with torch.no_grad():
        summary_ids = model.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,  # previously fixed at 300, which ignored the argument
            min_length=min(100, max_length),
            num_beams=5,          # consider reducing to 2-3; switch from beam search to top p sampling (help model not stick to abs max; recent class notebooks have doc on parameters)
            #temperature=0.9,      # Temperature controls randomness; slightly lower encourages more focus
            length_penalty=0.5,   # Reduce penalty if output is too short or cut off
            early_stopping=True,
            no_repeat_ngram_size=2 # To prevent repetition; 2 is restrictive
        )

    # Decode summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

- Version with lyrics only
- How much does it improve when we add Wiki data?
- Try experimenting with .generate parameters
- Three approaches
  - Lyrics
  - Lyrics + Wiki

- Goal involves summarization and commentary
  - need label data to reflect goal
  - challenge is data formatting in available label data
  - does using Wiki improve output
  - real data is slightly different from our goal
  - the model will train on this data, so won't learn our exact goal
  - framing challenge


In [None]:
# Usage example:
"""
generate_song_summary(model, tokenizer, df, song_index=0)
"""

# NOTE(review): `df` is not a dedicated dataset here. It is the loop variable
# left over from the lyrics-loading cell, i.e. the last CSV that was read.
# The call also needs `df` to have a 'generated_annotation' column — confirm.
summary = generate_song_summary(model, tokenizer, df, song_index=1)

# Show the song title followed by the wrapped summary.
print(df.iloc[1]['Title'])
print_summary(summary)


In [None]:
print_summary(df.iloc[1]['generated_annotation'])


# Evaluation

The main differences from the approach used in the Lyrics+Genius model are:

* Removed comparison to annotations since we're evaluating interpretive ability
* Added metrics specific to summary quality (length, vocabulary diversity)
* Simplified evaluation to focus on the model's ability to generate meaningful summaries
* Modified output format to match new evaluation approach

In [None]:
# Load saved model
# NOTE: this import rebinds the name T5Tokenizer to the non-fast tokenizer
# class; the dependency cell imported T5TokenizerFast under the same name.
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_path = '/content/drive/My Drive/266 Final Project/Our Models/Lyrics + Poem Data'
tokenizer = T5Tokenizer.from_pretrained(model_path)
# NOTE: from here on `model` is a bare T5ForConditionalGeneration, not the
# LyricsSummaryModel wrapper returned by create_baseline_model. Code that
# accesses `model.model` (as generate_song_summary does) needs the wrapper.
model = T5ForConditionalGeneration.from_pretrained(model_path)

print("Model and tokenizer loaded successfully!")

# We already have our test_lyrics from data preparation
print(f"Test set size: {len(test_lyrics)}")
print(test_lyrics.columns.tolist())

In [None]:
# Install required packages
!pip install -q bert-score rouge-score

import torch
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple
from transformers import BartTokenizer, BartForConditionalGeneration
from rouge_score import rouge_scorer
from bert_score import score
from tqdm import tqdm

def evaluate_supervised_model(
    model: T5ForConditionalGeneration,
    tokenizer: T5Tokenizer,
    test_data: pd.DataFrame,
    batch_size: int = 16
) -> Tuple[Dict[str, float], List[Dict]]:
    """
    Evaluate supervised lyrics model comparing against Genius annotations

    Summaries are generated batch by batch from the ``Lyrics`` column and
    compared with the lyrics (content coverage, token-overlap similarity) and
    with the ``Combined Annotations`` column (ROUGE, BERTScore).  BERTScore is
    computed for every summary in one call after generation has finished.
    Generation uses ``do_sample=True``, so scores vary between runs.

    Args:
        model: Model used for generation; moved to GPU when one is available.
        tokenizer: Tokenizer matching ``model``.
        test_data: DataFrame with ``Lyrics`` and ``Combined Annotations`` columns.
        batch_size: Number of songs generated per forward pass.

    Returns:
        ``(metrics, examples)``: the averaged metrics and up to five example
        records.  Averages are NaN when ``test_data`` is empty.
    """
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    evaluation_results = {
        'content_coverage': [],
        'semantic_similarity': [],
        'rouge1_scores': [],
        'rouge2_scores': [],
        'rougeL_scores': [],
        'bert_scores': []
    }

    examples = []
    all_summaries = []
    all_references = []

    for batch_number, idx in enumerate(tqdm(range(0, len(test_data), batch_size))):
        batch = test_data.iloc[idx:idx + batch_size]
        batch_lyrics = batch['Lyrics'].tolist()
        # Missing annotations arrive as NaN floats, which the text metrics
        # cannot tokenise; treat them as empty references instead.
        batch_annotations = [
            "" if pd.isna(annotation) else str(annotation)
            for annotation in batch['Combined Annotations'].tolist()
        ]

        # Generate summaries
        inputs = tokenizer(
            [f"summarize lyrics and capture meaning: {lyric}" for lyric in batch_lyrics],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=150,
                min_length=50,
                num_beams=4,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                no_repeat_ngram_size=3,
                length_penalty=1.0,
                repetition_penalty=1.2
            )

            generated_summaries = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # Evaluate each summary against its lyrics and reference annotation
        for original_lyric, generated_summary, reference_annotation in zip(
            batch_lyrics, generated_summaries, batch_annotations
        ):
            # Content Coverage (between summary and lyrics)
            coverage_score = calculate_content_coverage(original_lyric, generated_summary)
            evaluation_results['content_coverage'].append(coverage_score)

            # Semantic Similarity (between summary and lyrics)
            semantic_score = calculate_semantic_similarity(original_lyric, generated_summary)
            evaluation_results['semantic_similarity'].append(semantic_score)

            # ROUGE Scores (between generated summary and reference annotation)
            rouge_scores = calculate_rouge_scores([generated_summary, reference_annotation])
            evaluation_results['rouge1_scores'].append(rouge_scores['rouge1'])
            evaluation_results['rouge2_scores'].append(rouge_scores['rouge2'])
            evaluation_results['rougeL_scores'].append(rouge_scores['rougeL'])

            # Collected for the single BERTScore call after the loop
            all_summaries.append(generated_summary)
            all_references.append(reference_annotation)

            # Store examples (the BERTScore is filled in after the loop)
            if len(examples) < 5:
                examples.append({
                    'lyrics': original_lyric,
                    'reference_annotation': reference_annotation,
                    'generated_summary': generated_summary,
                    'metrics': {
                        'content_coverage': coverage_score,
                        'semantic_similarity': semantic_score,
                        'rouge1': rouge_scores['rouge1'],
                        'rouge2': rouge_scores['rouge2'],
                        'rougeL': rouge_scores['rougeL']
                    }
                })

        # Memory cleanup every fifth batch (idx is a row offset, not a batch count)
        if torch.cuda.is_available() and (batch_number + 1) % 5 == 0:
            torch.cuda.empty_cache()

    # BERTScore (between generated summary and reference annotation).  Every
    # pair is scored; the earlier version scored one summary in eight and
    # reused that value for the others, which skewed the reported average.
    if all_summaries:
        _, _, f1_scores = score(all_summaries, all_references, lang='en', verbose=False)
        evaluation_results['bert_scores'] = f1_scores.tolist()
        # Examples are the first summaries in generation order, so they line
        # up with the first entries of the BERTScore list.
        for example, bert_f1 in zip(examples, evaluation_results['bert_scores']):
            example['metrics']['bert_score'] = bert_f1

    def _mean(values):
        """Average a list of scores; NaN (without a warning) when empty."""
        return np.mean(values) if values else float('nan')

    # Aggregate results
    metrics = {
        'avg_content_coverage': _mean(evaluation_results['content_coverage']),
        'avg_semantic_similarity': _mean(evaluation_results['semantic_similarity']),
        'avg_rouge1': _mean(evaluation_results['rouge1_scores']),
        'avg_rouge2': _mean(evaluation_results['rouge2_scores']),
        'avg_rougeL': _mean(evaluation_results['rougeL_scores']),
        'avg_bert_score': _mean(evaluation_results['bert_scores'])
    }

    return metrics, examples

def calculate_rouge_scores(texts: List[str]) -> Dict[str, float]:
    """Calculate ROUGE F-measures between a generated summary and its reference.

    Args:
        texts: Sequence whose first item is the generated summary and whose
            second item is the reference text.  Non-string items (for example
            NaN from a pandas column) are treated as empty text.

    Returns:
        Dict with the ``rouge1``, ``rouge2`` and ``rougeL`` F-measures.
    """
    generated, reference = (
        text if isinstance(text, str) else ""
        for text in (texts[0], texts[1])
    )

    # Build the scorer (and its stemmer) once and reuse it on later calls.
    scorer = getattr(calculate_rouge_scores, "_scorer", None)
    if scorer is None:
        scorer = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'],
            use_stemmer=True
        )
        calculate_rouge_scores._scorer = scorer

    # RougeScorer.score expects (target, prediction).  The F-measure is the
    # same either way, but the documented order keeps precision/recall right.
    result = scorer.score(reference, generated)

    return {
        'rouge1': result['rouge1'].fmeasure,
        'rouge2': result['rouge2'].fmeasure,
        'rougeL': result['rougeL'].fmeasure
    }

def calculate_content_coverage(lyrics: str, summary: str) -> float:
    """Return the fraction of distinct lyric words that also occur in the summary.

    Words are lower-cased, whitespace-separated tokens.  The result is ``0.0``
    when either argument is a float (e.g. NaN), when the lyrics contain no
    tokens, or when processing raises.
    """
    # Handle NaN or float values
    if any(isinstance(value, float) for value in (lyrics, summary)):
        return 0.0

    try:
        lyric_vocab = {*str(lyrics).lower().split()}
        summary_vocab = {*str(summary).lower().split()}
        if not lyric_vocab:
            return 0.0
        return len(lyric_vocab & summary_vocab) / len(lyric_vocab)
    except Exception as e:
        print(f"Error processing lyrics/summary: {e}")
        return 0.0

def calculate_semantic_similarity(lyrics: str, summary: str) -> float:
    """Return the Jaccard overlap of the lyric and summary word sets.

    Words are lower-cased, whitespace-separated tokens.  The result is ``0.0``
    when either argument is a float (e.g. NaN), when both texts have no
    tokens, or when processing raises.
    """
    # Handle NaN or float values
    if any(isinstance(value, float) for value in (lyrics, summary)):
        return 0.0

    try:
        lyric_vocab = {*str(lyrics).lower().split()}
        summary_vocab = {*str(summary).lower().split()}
        combined_size = len(lyric_vocab | summary_vocab)
        if combined_size > 0:
            return len(lyric_vocab & summary_vocab) / combined_size
        return 0.0
    except Exception as e:
        print(f"Error processing lyrics/summary: {e}")
        return 0.0

def print_evaluation_results(metrics: Dict[str, float], examples: List[Dict]):
    """Print the averaged metrics followed by every stored example.

    ``metrics`` must contain the six ``avg_*`` keys read below.  Each example
    must provide ``lyrics``, ``reference_annotation``, ``generated_summary``
    and a ``metrics`` dict of numeric values.
    """
    # (display label, key in ``metrics``) in output order
    summary_rows = (
        ("Content Coverage", 'avg_content_coverage'),
        ("Semantic Similarity", 'avg_semantic_similarity'),
        ("ROUGE-1", 'avg_rouge1'),
        ("ROUGE-2", 'avg_rouge2'),
        ("ROUGE-L", 'avg_rougeL'),
        ("BERTScore", 'avg_bert_score'),
    )

    print("\nEvaluation Results:")
    for label, key in summary_rows:
        print(f"Average {label}: {metrics[key]:.3f}")

    print("\nExample Generations:")
    for position, example in enumerate(examples, start=1):
        print(f"\nExample {position}:")
        # Only the first 200 characters of the lyrics are shown.
        lyric_preview = example['lyrics'][:200]
        print(f"Original Lyrics (truncated): {lyric_preview}...")
        print(f"\nReference Annotation: {example['reference_annotation']}")
        print(f"\nGenerated Summary: {example['generated_summary']}")
        print("\nMetrics:")
        example_metrics = example['metrics']
        for metric in example_metrics:
            print(f"{metric}: {example_metrics[metric]:.3f}")

# Usage example:
def run_evaluation(model_path: str, test_df: pd.DataFrame):
    """Load the saved model from ``model_path``, evaluate it on ``test_df`` and print the results.

    Returns the ``(metrics, examples)`` pair produced by
    ``evaluate_supervised_model``.
    """
    loaded_tokenizer = T5Tokenizer.from_pretrained(model_path)
    loaded_model = T5ForConditionalGeneration.from_pretrained(model_path)
    print("Model and tokenizer loaded successfully!")

    evaluation = evaluate_supervised_model(
        loaded_model,
        loaded_tokenizer,
        test_data=test_df,
        batch_size=16
    )
    metrics, examples = evaluation

    print_evaluation_results(metrics, examples)
    return metrics, examples

In [None]:
metrics, examples = run_evaluation(model_path, test_lyrics)

## OLD PROCESS BELOW

In [None]:
# call saved model
# NOTE: this import rebinds the name T5Tokenizer to the non-fast tokenizer
# class; the dependency cell imported T5TokenizerFast under the same name.
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_path = '/content/drive/My Drive/266 Final Project/Our Models/Lyrics + Poem Data'
tokenizer = T5Tokenizer.from_pretrained(model_path)
# `model` is the bare T5ForConditionalGeneration loaded from disk, not the
# LyricsSummaryModel training wrapper.
model = T5ForConditionalGeneration.from_pretrained(model_path)

print("Model and tokenizer loaded successfully!")

In [None]:
# Check that test_df was correctly initialized
# NOTE(review): test_df is not assigned in any cell visible in this notebook
# export (the current process uses test_lyrics) — confirm where it comes from.
print(test_df.shape)
print(test_df.columns)
print(test_df.head())

In [None]:
# Check for NaN values
print("NaN values in test_df:")
print(test_df.isna().sum())

# Check data types
print("\nData types:")
print(test_df.dtypes)

# Clean the data: replace missing values with '' and convert to string type
for column in ('Lyrics', 'Combined Annotations'):
    test_df[column] = test_df[column].fillna('').astype(str)

# Verify no empty strings that might cause issues
empty_lyrics_count = int((test_df['Lyrics'] == '').sum())
empty_annotations_count = int((test_df['Combined Annotations'] == '').sum())
print("\nNumber of empty lyrics:", empty_lyrics_count)
print("Number of empty annotations:", empty_annotations_count)

In [None]:
# Install required package
!pip install bert-score
!pip install rouge-score

import torch
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple
from transformers import T5Tokenizer
from rouge_score import rouge_scorer
from bert_score import score
from tqdm import tqdm

from bert_score import score
import torch
from sklearn.model_selection import train_test_split

In [None]:
def evaluate_supervised_model(
    model: T5ForConditionalGeneration,
    tokenizer: T5Tokenizer,
    test_data: pd.DataFrame,
    batch_size: int = 16
) -> Tuple[Dict[str, float], List[Dict]]:
    """
    Evaluate supervised lyrics model comparing against Genius annotations

    Summaries are generated batch by batch from the ``Lyrics`` column and
    compared with the lyrics (content coverage, token-overlap similarity) and
    with the ``Combined Annotations`` column (ROUGE, BERTScore).  BERTScore is
    computed for every summary in one call after generation has finished.
    Generation uses ``do_sample=True``, so scores vary between runs.

    Args:
        model: Model used for generation; moved to GPU when one is available.
        tokenizer: Tokenizer matching ``model``.
        test_data: DataFrame with ``Lyrics`` and ``Combined Annotations`` columns.
        batch_size: Number of songs generated per forward pass.

    Returns:
        ``(metrics, examples)``: the averaged metrics and up to five example
        records.  Averages are NaN when ``test_data`` is empty.
    """
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    evaluation_results = {
        'content_coverage': [],
        'semantic_similarity': [],
        'rouge1_scores': [],
        'rouge2_scores': [],
        'rougeL_scores': [],
        'bert_scores': []
    }

    examples = []
    all_summaries = []
    all_references = []

    for batch_number, idx in enumerate(tqdm(range(0, len(test_data), batch_size))):
        batch = test_data.iloc[idx:idx + batch_size]
        batch_lyrics = batch['Lyrics'].tolist()
        # Missing annotations arrive as NaN floats, which the text metrics
        # cannot tokenise; treat them as empty references instead.
        batch_annotations = [
            "" if pd.isna(annotation) else str(annotation)
            for annotation in batch['Combined Annotations'].tolist()
        ]

        # Generate summaries
        inputs = tokenizer(
            [f"summarize lyrics and capture meaning: {lyric}" for lyric in batch_lyrics],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=150,
                min_length=50,
                num_beams=4,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                no_repeat_ngram_size=3,
                length_penalty=1.0,
                repetition_penalty=1.2
            )

            generated_summaries = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # Evaluate each summary against its lyrics and reference annotation
        for original_lyric, generated_summary, reference_annotation in zip(
            batch_lyrics, generated_summaries, batch_annotations
        ):
            # Content Coverage (between summary and lyrics)
            coverage_score = calculate_content_coverage(original_lyric, generated_summary)
            evaluation_results['content_coverage'].append(coverage_score)

            # Semantic Similarity (between summary and lyrics)
            semantic_score = calculate_semantic_similarity(original_lyric, generated_summary)
            evaluation_results['semantic_similarity'].append(semantic_score)

            # ROUGE Scores (between generated summary and reference annotation)
            rouge_scores = calculate_rouge_scores([generated_summary, reference_annotation])
            evaluation_results['rouge1_scores'].append(rouge_scores['rouge1'])
            evaluation_results['rouge2_scores'].append(rouge_scores['rouge2'])
            evaluation_results['rougeL_scores'].append(rouge_scores['rougeL'])

            # Collected for the single BERTScore call after the loop
            all_summaries.append(generated_summary)
            all_references.append(reference_annotation)

            # Store examples (the BERTScore is filled in after the loop)
            if len(examples) < 5:
                examples.append({
                    'lyrics': original_lyric,
                    'reference_annotation': reference_annotation,
                    'generated_summary': generated_summary,
                    'metrics': {
                        'content_coverage': coverage_score,
                        'semantic_similarity': semantic_score,
                        'rouge1': rouge_scores['rouge1'],
                        'rouge2': rouge_scores['rouge2'],
                        'rougeL': rouge_scores['rougeL']
                    }
                })

        # Memory cleanup every fifth batch (idx is a row offset, not a batch count)
        if torch.cuda.is_available() and (batch_number + 1) % 5 == 0:
            torch.cuda.empty_cache()

    # BERTScore (between generated summary and reference annotation).  Every
    # pair is scored; the earlier version scored one summary in eight and
    # reused that value for the others, which skewed the reported average.
    if all_summaries:
        _, _, f1_scores = score(all_summaries, all_references, lang='en', verbose=False)
        evaluation_results['bert_scores'] = f1_scores.tolist()
        # Examples are the first summaries in generation order, so they line
        # up with the first entries of the BERTScore list.
        for example, bert_f1 in zip(examples, evaluation_results['bert_scores']):
            example['metrics']['bert_score'] = bert_f1

    def _mean(values):
        """Average a list of scores; NaN (without a warning) when empty."""
        return np.mean(values) if values else float('nan')

    # Aggregate results
    metrics = {
        'avg_content_coverage': _mean(evaluation_results['content_coverage']),
        'avg_semantic_similarity': _mean(evaluation_results['semantic_similarity']),
        'avg_rouge1': _mean(evaluation_results['rouge1_scores']),
        'avg_rouge2': _mean(evaluation_results['rouge2_scores']),
        'avg_rougeL': _mean(evaluation_results['rougeL_scores']),
        'avg_bert_score': _mean(evaluation_results['bert_scores'])
    }

    return metrics, examples

def calculate_rouge_scores(texts: List[str]) -> Dict[str, float]:
    """Calculate ROUGE F-measures between a generated summary and its reference.

    Args:
        texts: Sequence whose first item is the generated summary and whose
            second item is the reference text.  Non-string items (for example
            NaN from a pandas column) are treated as empty text.

    Returns:
        Dict with the ``rouge1``, ``rouge2`` and ``rougeL`` F-measures.
    """
    generated, reference = (
        text if isinstance(text, str) else ""
        for text in (texts[0], texts[1])
    )

    # Build the scorer (and its stemmer) once and reuse it on later calls.
    scorer = getattr(calculate_rouge_scores, "_scorer", None)
    if scorer is None:
        scorer = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'],
            use_stemmer=True
        )
        calculate_rouge_scores._scorer = scorer

    # RougeScorer.score expects (target, prediction).  The F-measure is the
    # same either way, but the documented order keeps precision/recall right.
    result = scorer.score(reference, generated)

    return {
        'rouge1': result['rouge1'].fmeasure,
        'rouge2': result['rouge2'].fmeasure,
        'rougeL': result['rougeL'].fmeasure
    }

def calculate_content_coverage(lyrics: str, summary: str) -> float:
    """Return the fraction of distinct lyric words that also occur in the summary.

    Words are lower-cased, whitespace-separated tokens.  The result is ``0.0``
    when either argument is a float (e.g. NaN), when the lyrics contain no
    tokens, or when processing raises.
    """
    # Handle NaN or float values
    if any(isinstance(value, float) for value in (lyrics, summary)):
        return 0.0

    try:
        lyric_vocab = {*str(lyrics).lower().split()}
        summary_vocab = {*str(summary).lower().split()}
        if not lyric_vocab:
            return 0.0
        return len(lyric_vocab & summary_vocab) / len(lyric_vocab)
    except Exception as e:
        print(f"Error processing lyrics/summary: {e}")
        return 0.0

def calculate_semantic_similarity(lyrics: str, summary: str) -> float:
    """Return the Jaccard overlap of the lyric and summary word sets.

    Words are lower-cased, whitespace-separated tokens.  The result is ``0.0``
    when either argument is a float (e.g. NaN), when both texts have no
    tokens, or when processing raises.
    """
    # Handle NaN or float values
    if any(isinstance(value, float) for value in (lyrics, summary)):
        return 0.0

    try:
        lyric_vocab = {*str(lyrics).lower().split()}
        summary_vocab = {*str(summary).lower().split()}
        combined_size = len(lyric_vocab | summary_vocab)
        if combined_size > 0:
            return len(lyric_vocab & summary_vocab) / combined_size
        return 0.0
    except Exception as e:
        print(f"Error processing lyrics/summary: {e}")
        return 0.0

def print_evaluation_results(metrics: Dict[str, float], examples: List[Dict]):
    """Print the averaged metrics followed by every stored example.

    ``metrics`` must contain the six ``avg_*`` keys read below.  Each example
    must provide ``lyrics``, ``reference_annotation``, ``generated_summary``
    and a ``metrics`` dict of numeric values.
    """
    # (display label, key in ``metrics``) in output order
    summary_rows = (
        ("Content Coverage", 'avg_content_coverage'),
        ("Semantic Similarity", 'avg_semantic_similarity'),
        ("ROUGE-1", 'avg_rouge1'),
        ("ROUGE-2", 'avg_rouge2'),
        ("ROUGE-L", 'avg_rougeL'),
        ("BERTScore", 'avg_bert_score'),
    )

    print("\nEvaluation Results:")
    for label, key in summary_rows:
        print(f"Average {label}: {metrics[key]:.3f}")

    print("\nExample Generations:")
    for position, example in enumerate(examples, start=1):
        print(f"\nExample {position}:")
        # Only the first 200 characters of the lyrics are shown.
        lyric_preview = example['lyrics'][:200]
        print(f"Original Lyrics (truncated): {lyric_preview}...")
        print(f"\nReference Annotation: {example['reference_annotation']}")
        print(f"\nGenerated Summary: {example['generated_summary']}")
        print("\nMetrics:")
        example_metrics = example['metrics']
        for metric in example_metrics:
            print(f"{metric}: {example_metrics[metric]:.3f}")

# Usage example:
def run_evaluation(model_path: str, test_df: pd.DataFrame):
    """Load the saved model from ``model_path``, evaluate it on ``test_df`` and print the results.

    Returns the ``(metrics, examples)`` pair produced by
    ``evaluate_supervised_model``.
    """
    loaded_tokenizer = T5Tokenizer.from_pretrained(model_path)
    loaded_model = T5ForConditionalGeneration.from_pretrained(model_path)
    print("Model and tokenizer loaded successfully!")

    evaluation = evaluate_supervised_model(
        loaded_model,
        loaded_tokenizer,
        test_data=test_df,
        batch_size=16
    )
    metrics, examples = evaluation

    print_evaluation_results(metrics, examples)
    return metrics, examples

In [None]:
# Run the evaluation
metrics, examples = run_evaluation(model_path, test_df)