## Train T5 on Lyrics Data Only

## Import Necessary Dependencies

In [None]:
# import needed dependencies for testing PoemSum model
!pip install pytorch-lightning transformers torch

# Import needed dependencies while avoiding conflicts
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer,
    AdamW
)
import re
import os
from sklearn.model_selection import train_test_split
from typing import Dict, List
import numpy as np
from tqdm import tqdm
import gc
from transformers import get_linear_schedule_with_warmup

# attempt to view summaries briefly under code blocks
import textwrap
def print_summary(text, width=70):
    """Print ``text`` re-flowed by ``textwrap.fill`` to lines of at most ``width`` columns."""
    wrapped = textwrap.fill(text, width=width)
    print(wrapped)

# Evaluation model dependencies
# Install required packages
!pip install bert-score
!pip install rouge-score

from bert_score import score
from sklearn.model_selection import train_test_split
from rouge_score import rouge_scorer
from bert_score import score
from typing import Dict, List, Tuple

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.6.0-py3-none-any.whl.metadata (20 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.11.9-py3-none-any.whl.metadata (5.2 kB)
Downloading pytorch_lightning-2.4.0-py3-none-any.whl (815 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.2/815.2 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.9-py3-none-any.whl (28 kB)
Downloading torchmetrics-1.6.0-py3-none-any.whl (926 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m926.4/926.4 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightning-utilities, torchmetrics, pytorch-lightning
Successfully installed lightning-utilities-0.11.9 pytorch-lightning-2.4.0 torchmetrics-1.6.0
Collecting bert-score


## Data Preparation

This notebook fine-tunes a T5 model using only the song lyrics in our dataset.
- Song data is pulled in from the CSV files in the `Cleaned Song Files` folder.


In [None]:
# Mount Google Drive at /content/drive so the project folders under
# "/content/drive/My Drive/..." used later in this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Train on all artists: load every CSV in the cleaned-songs folder into one DataFrame.

# even though the folder is in "Shared with me", call "MyDrive" to pull from Cleaned Song Files
folder_path = "/content/drive/My Drive/266 Final Project/Cleaned Song Files"

# os.listdir() returns entries in arbitrary order. Sorting the file names keeps
# the row order of `df` stable between runs, which matters because the seeded
# train/test split further down depends on that row order.
csv_filenames = sorted(
    filename for filename in os.listdir(folder_path) if filename.endswith('.csv')
)
if not csv_filenames:
    # pd.concat([]) would fail with a much less helpful message.
    raise FileNotFoundError(f"No CSV files found in {folder_path}")

# Read each CSV file into its own DataFrame
df_list = [
    pd.read_csv(os.path.join(folder_path, filename))
    for filename in csv_filenames
]

# Concatenate all DataFrames in the list into a single DataFrame
df = pd.concat(df_list, ignore_index=True)

In [None]:
# Sanity check on the combined DataFrame: number of rows (songs) loaded
print(len(df))
# ...and the first 3 rows, to see which columns are available
print(df.head(3))

3187
   Song ID            Title  \
0  2266961     Back To Back   
1  1865293            Can I   
2    74017  Best I Ever Had   

                                          Lyrics URL  \
0       https://genius.com/Drake-back-to-back-lyrics   
1              https://genius.com/Drake-can-i-lyrics   
2  https://genius.com/Nicki-minaj-and-drake-best-...   

                                Combined Annotations  \
0  “Back to Back” is the second of Drake’s respon...   
1  In a swiftly deleted   on October 12, 2015, Dr...   
2  “Best I Ever Had”, by Drake comes off of the 2...   

                                Wikipedia Annotation  \
0  Back to Back or backtoback may refer to Film a...   
1  Can I may refer to Can I, a 2010 album by Jaic...   
2  Best I Ever Had may refer to Best I Ever Had D...   

                                              Lyrics generated_annotation  
0  Oh man, oh man, oh man Not again Yeah, I learn...                  NaN  
1  Can I, baby? Can I, baby? Can I, baby? C

In [None]:
# Character-length statistics of the 'Lyrics' column (length series computed once)
lyric_lengths = df['Lyrics'].str.len()
average_length, min_length, max_length = (
    lyric_lengths.mean(),
    lyric_lengths.min(),
    lyric_lengths.max(),
)

# Display the result
print("Average string length of lyrics:", average_length)
print("Min string length of lyrics:", min_length)
print("Max string length of lyrics:", max_length)

Average string length of lyrics: 1456.5506551613935
Min string length of lyrics: 12.0
Max string length of lyrics: 5686.0


In [None]:
# Display the data types of each column in the DataFrame
# (printed as plain text; every column is reported as `object` in the saved output)
print(df.dtypes)

Song ID                 object
Title                   object
Lyrics URL              object
Combined Annotations    object
Wikipedia Annotation    object
Lyrics                  object
generated_annotation    object
dtype: object


In [None]:
# Drop songs without lyrics, then force the remaining lyrics to string.
# The previous `.astype(str).fillna('')` ran in the wrong order: astype(str)
# turns NaN into the literal text 'nan' first, so fillna('') had nothing left
# to fill and songs with missing lyrics were kept with 'nan' as their lyrics.
# Rows are dropped rather than filled with '' because an empty lyric has
# nothing to learn from and makes the word-overlap metrics divide by zero.
df = df.dropna(subset=['Lyrics']).reset_index(drop=True)
df['Lyrics'] = df['Lyrics'].astype(str)

## Model Class Modules

In [None]:
# Custom Dataset class from PoemSum model
# CM made some minor modifications to hopefully improve efficiency

class LyricsSummaryDataset(Dataset):
    """Dataset that tokenizes one (text, summary) row per item, on demand.

    Args:
        data: DataFrame with a "text" column (model input) and a "summary"
            column (target).
        tokenizer: Tokenizer used for both columns.
        text_max_token_len: Inputs are padded/truncated to this many tokens.
        summary_max_token_len: Targets are padded/truncated to this many tokens.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: T5Tokenizer,
        text_max_token_len: int = 1024, # CM reduced this from 2000
        summary_max_token_len: int = 512 # CM reduced from 10000
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def __len__(self):
        return len(self.data)

    def _encode(self, text: str, max_length: int):
        """Tokenize ``text`` to exactly ``max_length`` tokens (padded or truncated)."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

    def __getitem__(self, index: int):
        """Return the raw strings plus flattened input ids, masks and labels for row ``index``."""
        data_row = self.data.iloc[index]

        text_encoding = self._encode(data_row["text"], self.text_max_token_len)
        summary_encoding = self._encode(data_row["summary"], self.summary_max_token_len)

        # Padding positions in the target are set to -100. Use the tokenizer's
        # own pad id instead of a hard-coded 0 so this keeps working with a
        # tokenizer whose pad id is different, and clone so the encoding
        # returned by the tokenizer is not modified in place.
        labels = summary_encoding["input_ids"].clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return dict(
            text=data_row["text"],
            summary=data_row["summary"],
            text_input_ids=text_encoding["input_ids"].flatten(),
            text_attention_mask=text_encoding["attention_mask"].flatten(),
            labels=labels.flatten(),
            labels_attention_mask=summary_encoding["attention_mask"].flatten()
        )

# Lightning Data Module from Poem Sum
class LyricsSummaryDataModule(pl.LightningDataModule):
    """Lightning data module that wraps the train/validation frames in
    ``LyricsSummaryDataset`` objects and serves them through DataLoaders."""

    def __init__(
        self,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 8,
        text_max_token_len: int = 1024, # CM increased from 512
        summary_max_token_len: int = 512 # CM increased from 256
    ):
        super().__init__()
        # Data and tokenizer
        self.train_df, self.val_df = train_df, val_df
        self.tokenizer = tokenizer
        # Loader / tokenization settings
        self.batch_size = batch_size
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def setup(self, stage=None):
        """Build the train and validation datasets with the same token limits."""
        token_limits = (self.text_max_token_len, self.summary_max_token_len)
        self.train_dataset = LyricsSummaryDataset(self.train_df, self.tokenizer, *token_limits)
        self.val_dataset = LyricsSummaryDataset(self.val_df, self.tokenizer, *token_limits)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        loader_options = dict(
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4,   # CM increased from 2
            pin_memory=True, # CM added for GPU efficiency
        )
        return DataLoader(self.train_dataset, **loader_options)

    def val_dataloader(self):
        """Loader over the validation dataset, in its original order."""
        loader_options = dict(
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=4,   # CM increased from 2
            pin_memory=True, # CM added for GPU efficiency
        )
        return DataLoader(self.val_dataset, **loader_options)

# Model Class
class LyricsSummaryModel(pl.LightningModule):
    """LightningModule that fine-tunes a T5 conditional-generation model.

    Args:
        model_name: Checkpoint name or path handed to
            ``T5ForConditionalGeneration.from_pretrained``.
        learning_rate: Peak learning rate for AdamW.
        weight_decay: Weight decay for AdamW.
        warmup_steps: Optimizer steps over which the learning rate ramps up
            linearly before the linear decay starts.
    """

    # CM added learning rate here
    def __init__(
        self,
        model_name='t5-small',
        learning_rate=1e-4,
        weight_decay=0.01, # CM added
        warmup_steps=1000, # CM added
        ):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(model_name, return_dict=True)
        # from_pretrained() returns the model in eval mode, and the training log
        # of this notebook reported "0 Modules in train mode", i.e. dropout was
        # disabled while fine-tuning. Switch back to train mode explicitly;
        # Lightning still toggles eval mode around validation.
        self.model.train()

        # CM added calls for learning_rate, weight_decay, warmup_steps and save_hyperparameters()
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.warmup_steps = warmup_steps
        self.save_hyperparameters()

    # CM adding in calculating_copying_penalty function during fine-tuning
    def calculate_copying_penalty(self, input_ids, output_ids):
        """Calculate penalty for copying from input.

        For every sequence in the batch, computes the share of the output's
        2-, 3- and 4-grams (over token ids) that also occur in the input,
        averages over the n-gram sizes that could be formed, then averages over
        the batch.

        Returns:
            A scalar tensor on ``input_ids.device``. It is 0.0 for an empty
            batch, and a sequence too short to form any n-gram contributes 0.0
            (both cases previously raised ZeroDivisionError).
        """
        batch_size = input_ids.size(0)
        if batch_size == 0:
            return torch.tensor(0.0, device=input_ids.device)

        n_gram_sizes = (2, 3, 4)
        penalties = []

        for i in range(batch_size):
            input_text = input_ids[i].tolist()
            output_text = output_ids[i].tolist()

            overlap_ratios = []
            for n in n_gram_sizes:
                # Create n-grams for input and output
                input_ngrams = {
                    tuple(input_text[j:j + n]) for j in range(len(input_text) - n + 1)
                }
                output_ngrams = {
                    tuple(output_text[j:j + n]) for j in range(len(output_text) - n + 1)
                }

                if output_ngrams:
                    overlap = len(input_ngrams & output_ngrams)
                    overlap_ratios.append(overlap / len(output_ngrams))

            # Average overlap across the n-gram sizes that produced a ratio
            if overlap_ratios:
                penalties.append(sum(overlap_ratios) / len(overlap_ratios))
            else:
                penalties.append(0.0)

        return torch.tensor(sum(penalties) / batch_size, device=input_ids.device)

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        """Run the wrapped T5 model and return ``(loss, logits)``."""
        output = self.model(
            input_ids=input_ids,    # CM added = input_ids
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )
        return output.loss, output.logits

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss, _ = self(
            batch["text_input_ids"],
            batch["text_attention_mask"],
            batch["labels_attention_mask"],
            batch["labels"]
        )

        # CM added on_step and on_epoch statements
        self.log("train_loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss, output = self(
            batch["text_input_ids"],
            batch["text_attention_mask"],
            batch["labels_attention_mask"],
            batch["labels"]
        )

        # CM added on_step and on_epoch statements
        self.log("val_loss", loss, prog_bar=True, logger=True, on_step=True, on_epoch=True)

        # CM added to clean up memory
        del output
        torch.cuda.empty_cache()

        return loss

    # CM updated function
    def configure_optimizers(self):
        """Create AdamW plus a linear warm-up / linear decay schedule stepped every optimizer step."""
        # torch.optim.AdamW replaces transformers.AdamW, which is deprecated in
        # favour of the PyTorch implementation.
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=self.learning_rate,
            weight_decay=self.weight_decay
        )

        # NOTE(review): if warmup_steps is larger than the total number of
        # optimizer steps, the learning rate never reaches `learning_rate` —
        # choose warmup_steps with the length of the run in mind.
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.trainer.estimated_stepping_batches
        )

        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "interval": "step"
            }
        }

## Create and Train T5 model

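We modified the baseline model so that the T5 model is trained on lyrics data only: instead of a separate reference summary, the "summary" target column is built from the "Lyrics" column itself (its first 50 words).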
- self-supervised learning
- hopefully this leads to a model that is better tailored to characteristics of song lyrics to ultimately create a model that generates better analyses/annotations

In [None]:
# added function for self-supervised learning fine-tuning
def create_target_summary(lyrics: str, max_words: int = 50) -> str:
    """Build a short training target from the most repeated lines of ``lyrics``.

    The two most frequent non-blank lines (ties keep their order of first
    appearance) are taken first, followed by the opening line when it is not
    already among them. The joined text is cut to ``max_words`` words if it
    is longer than that.
    """
    stripped = [raw.strip() for raw in lyrics.split('\n')]
    stripped = [entry for entry in stripped if entry]

    # Tally how often each distinct line occurs (likely chorus = most frequent)
    occurrences = {}
    for entry in stripped:
        occurrences.setdefault(entry, 0)
        occurrences[entry] += 1

    # Stable sort by descending count, keep the top two lines
    ranked = sorted(occurrences, key=occurrences.get, reverse=True)
    highlights = ranked[:2]

    # The opening line is added unless it is already one of the highlights
    if stripped:
        opening = stripped[0]
        if opening not in highlights:
            highlights.append(opening)

    joined = ' '.join(highlights)
    tokens = joined.split()
    if len(tokens) <= max_words:
        return joined
    return ' '.join(tokens[:max_words])

In [None]:
# Do this before calling create_baseline_model to keep dedicated test set:
# 20% of the rows go to test_df, the remaining 80% to train_val_df.
# random_state=42 makes the split repeatable for a given row order of df.
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
def create_baseline_model(df, save_dir="checkpoints", learning_rate=1e-4): # CM added learning rate here as well
  """Fine-tune t5-small on lyrics only and save the result.

  The input text is a fixed prompt followed by the lyrics; the target is the
  first 50 words of the same lyrics. The first 80% of the prepared rows are
  used for training and the rest for validation.

  Args:
      df: DataFrame with a 'Lyrics' column. It is not modified.
      save_dir: Directory for Lightning logs/checkpoints and the final model.
      learning_rate: Peak learning rate passed to LyricsSummaryModel.

  Returns:
      (model, tokenizer, trainer) after training has finished.

  Raises:
      ValueError: if fewer than two rows with non-blank lyrics are available.
  """
  MODEL_NAME = 't5-small'
  PROMPT = "Generate a brief summary capturing the main themes: "
  TARGET_WORDS = 50
  BATCH_SIZE = 4                # CM increased batch size from 2 to 4
  ACCUMULATE_GRAD_BATCHES = 4   # effective batch size = 4 * 4 = 16
  MAX_EPOCHS = 5                # CM increased epochs from 2 to 5.

  # 1. Initialize tokenizer (the model is built after the data is prepared,
  #    because its warm-up length depends on the number of training steps)
  print("Initializing model and tokenizer...")
  tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

  # 2. Prepare data
  print("Preparing data...")

  # Work on a cleaned copy of the column instead of writing into `df`, which
  # is usually a slice produced by train_test_split. fillna() has to come
  # before astype(str): the other way round NaN becomes the text 'nan'.
  lyrics = df['Lyrics'].fillna('').astype(str)
  lyrics = lyrics[lyrics.str.strip() != '']  # blank lyrics carry no training signal
  if len(lyrics) < 2:
    raise ValueError("Need at least two rows with non-empty lyrics to train and validate")

  prepared_data = pd.DataFrame({
      'text': PROMPT + lyrics,
      # Take first 50 words as target
      'summary': lyrics.str.split().str[:TARGET_WORDS].str.join(' ')
  })

  # 3. Split data (rows keep their incoming order; no extra shuffling here)
  train_size = int(0.8 * len(prepared_data))
  train_data = prepared_data[:train_size]
  val_data = prepared_data[train_size:]

  # The model's default warmup_steps (1000) can exceed the total number of
  # optimizer steps of this run, in which case the learning rate never reaches
  # its peak. Warm up over roughly the first 10% of the run instead.
  batches_per_epoch = -(-len(train_data) // BATCH_SIZE)                # ceil division
  steps_per_epoch = -(-batches_per_epoch // ACCUMULATE_GRAD_BATCHES)   # ceil division
  total_steps = steps_per_epoch * MAX_EPOCHS
  warmup_steps = min(1000, max(1, total_steps // 10))

  model = LyricsSummaryModel(
        MODEL_NAME,
        learning_rate=learning_rate, # CM added learning_rate here
        warmup_steps=warmup_steps
    )

  # 4. Set up data module
  data_module = LyricsSummaryDataModule(
      train_df=train_data,
      val_df=val_data,
      tokenizer=tokenizer,
      batch_size=BATCH_SIZE
  )

  # 5. Set up trainer
  trainer = pl.Trainer(
      max_epochs=MAX_EPOCHS,
      accumulate_grad_batches=ACCUMULATE_GRAD_BATCHES,
      gradient_clip_val=1.0,
      # '16-mixed' is the spelling Lightning asks for; plain 16 triggers a
      # "usage is discouraged" warning.
      precision='16-mixed' if torch.cuda.is_available() else 32,
      enable_checkpointing=True,
      default_root_dir=save_dir,
      # CM added callbacks and logger sections below
      callbacks=[
          # Add early stopping
          # NOTE(review): with patience equal to max_epochs this callback
          # cannot stop the run early — lower patience or raise max_epochs.
          pl.callbacks.EarlyStopping(
              monitor='val_loss',
              patience=5, # CM increased patience from 3 to 5.
              mode='min'
          ),
          # Add learning rate monitoring
          pl.callbacks.LearningRateMonitor(logging_interval='step')
          ],
      logger=pl.loggers.TensorBoardLogger(
          save_dir=save_dir,
          name='lyrics_model'
          ))

  # 6. Train model
  print("Starting training...")
  trainer.fit(model, data_module)

  # 7. Save model and tokenizer
  print("Saving model and tokenizer...")
  model_path = os.path.join(save_dir, "final_model")
  os.makedirs(model_path, exist_ok=True)
  model.model.save_pretrained(model_path)
  tokenizer.save_pretrained(model_path)

  return model, tokenizer, trainer

## Run Model with Full Dataset

Training on the train/validation split of the full dataset (the held-out test split is kept aside for evaluation).

In [None]:
# Train the model on train_val_df (the train/validation portion); test_df stays held out.
# Note: the size printed here is that of the full DataFrame `df`, not of train_val_df.
print(f"Full dataset size: {len(df)}")
model, tokenizer, trainer = create_baseline_model(train_val_df)

# Save to Google Drive (in addition to the copy create_baseline_model writes under its save_dir)
drive_path = '/content/drive/My Drive/266 Final Project/Our Models/lyrics_model_full'
os.makedirs(drive_path, exist_ok=True)

print(f"Saving model to {drive_path}...")
model.model.save_pretrained(drive_path)
tokenizer.save_pretrained(drive_path)

# Verify the save by listing the directory contents.
# The directory was created by os.makedirs above, so the exists() check only
# guards the listing; it does not prove that the model files were written.
if os.path.exists(drive_path):
    print(f"Files saved in {drive_path}:")
    print(os.listdir(drive_path))
else:
    print("Save path not found")

Full dataset size: 3187
Initializing model and tokenizer...


/usr/local/lib/python3.10/dist-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Preparing data...
Starting training...


INFO:pytorch_lightning.utilities.rank_zero:Loading `train_dataloader` to estimate number of stepping batches.
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params | Mode
------------------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M | eval
------------------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)
0         Modules in train mode
277       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


Saving model and tokenizer...
Saving model to /content/drive/My Drive/266 Final Project/Our Models/lyrics_model_full...
Files saved in /content/drive/My Drive/266 Final Project/Our Models/lyrics_model_full:
['config.json', 'generation_config.json', 'model.safetensors', 'tokenizer_config.json', 'special_tokens_map.json', 'spiece.model', 'tokenizer.json']


## Evaluation

In [None]:
# Reload the saved model and tokenizer from the Drive folder they were written to.
# NOTE: this import rebinds the name T5Tokenizer — the first cell aliased it to
# T5TokenizerFast, from here on it is the T5Tokenizer class. `model` and
# `tokenizer` below also replace the objects returned by create_baseline_model.
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_path = '/content/drive/My Drive/266 Final Project/Our Models/lyrics_model_full'
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

print("Model and tokenizer loaded successfully!")

Model and tokenizer loaded successfully!


In [None]:
# Check that test_df was correctly initialized.
# Only the last expression of a cell is displayed, so this cell shows the
# shape; the head() result is not rendered.
test_df.head()
test_df.shape

(638, 7)

In [None]:
def evaluate_self_supervised_model(
    model: T5ForConditionalGeneration,
    tokenizer: T5Tokenizer,
    test_data: pd.DataFrame,
    batch_size: int = 16,  # Increased batch size
    prompt_prefix: str = "Generate a brief summary capturing the main themes: ",
    num_summaries: int = 2,  # Reduced from 3 to 2 summaries
) -> Tuple[Dict[str, float], List[Dict]]:
    """
    Evaluate self-supervised lyrics model with optimized processing.

    For every lyric, ``num_summaries`` summaries are sampled. The first one is
    compared with the lyric (content coverage, Jaccard "semantic similarity",
    BERTScore F1); the sampled summaries are compared with each other (pairwise
    ROUGE, reported individually and averaged as the "consistency" score).

    Args:
        model: Fine-tuned T5 model; moved to GPU when one is available.
        tokenizer: Tokenizer matching ``model``.
        test_data: DataFrame with a 'Lyrics' column.
        batch_size: Lyrics per generation batch (also used for BERTScore).
        prompt_prefix: Text placed in front of every lyric. The default is the
            prompt used when building the training inputs in
            create_baseline_model, so evaluation sees the same input format
            the model was fine-tuned on.
        num_summaries: Summaries sampled per lyric; at least 2 are needed for
            the pairwise consistency metrics.

    Returns:
        ``(metrics, examples)``: a dict of averaged metrics and up to five
        per-lyric example dicts (lyrics, generated summaries, metrics).

    Raises:
        ValueError: if ``num_summaries`` is smaller than 2.
    """
    if num_summaries < 2:
        raise ValueError("num_summaries must be at least 2 to compute consistency scores")

    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    evaluation_results = {
        'content_coverage': [],
        'consistency_score': [],
        'semantic_similarity': [],
        'rouge1_scores': [],
        'rouge2_scores': [],
        'rougeL_scores': [],
        'bert_scores': []
    }

    examples = []
    # (candidate, reference) pairs for BERTScore. They are scored in ONE call
    # after the loop: calling score() per lyric re-initialises the scoring
    # model every time, and the earlier shortcut of scoring every 8th lyric
    # copied that value onto the seven lyrics that followed it.
    bert_candidates = []
    bert_references = []

    # Make sure every lyric is a string (NaN would otherwise reach .lower()).
    lyrics_column = test_data['Lyrics'].fillna('').astype(str)

    for idx in tqdm(range(0, len(lyrics_column), batch_size)):
        batch_lyrics = lyrics_column.iloc[idx:idx + batch_size].tolist()

        # Tokenize once per batch; the inputs are identical for every sample drawn.
        inputs = tokenizer(
            [f"{prompt_prefix}{lyric}" for lyric in batch_lyrics],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(device)

        # Generate multiple summaries for each lyric
        summaries_per_lyric = []
        for _ in range(num_summaries):
            with torch.no_grad():
                outputs = model.generate(
                    input_ids=inputs['input_ids'],
                    attention_mask=inputs['attention_mask'],
                    max_length=100,
                    min_length=30,
                    num_beams=4,
                    do_sample=True,
                    temperature=0.3,
                    top_k=50,
                    no_repeat_ngram_size=3,
                    length_penalty=0.8,
                    repetition_penalty=1.5
                )
            summaries_per_lyric.append(
                tokenizer.batch_decode(outputs, skip_special_tokens=True)
            )

        # Evaluate each lyric's summaries
        for lyric_idx, original_lyric in enumerate(batch_lyrics):
            lyric_summaries = [summaries[lyric_idx] for summaries in summaries_per_lyric]
            primary_summary = lyric_summaries[0]

            # 1. Content Coverage Score
            coverage_score = calculate_content_coverage(original_lyric, primary_summary)
            evaluation_results['content_coverage'].append(coverage_score)

            # 2. Consistency Score and ROUGE metrics
            rouge_scores = calculate_rouge_scores(lyric_summaries)
            consistency = np.mean(list(rouge_scores.values()))
            evaluation_results['consistency_score'].append(consistency)
            evaluation_results['rouge1_scores'].append(rouge_scores['rouge1'])
            evaluation_results['rouge2_scores'].append(rouge_scores['rouge2'])
            evaluation_results['rougeL_scores'].append(rouge_scores['rougeL'])

            # 3. Semantic Similarity
            semantic_score = calculate_semantic_similarity(original_lyric, primary_summary)
            evaluation_results['semantic_similarity'].append(semantic_score)

            # 4. BERTScore inputs (scored after the loop)
            bert_candidates.append(primary_summary)
            bert_references.append(original_lyric)

            # Store examples (bert_score is filled in once it has been computed)
            if len(examples) < 5:
                examples.append({
                    'lyrics': original_lyric,
                    'generated_summaries': lyric_summaries,
                    'metrics': {
                        'content_coverage': coverage_score,
                        'consistency': consistency,
                        'rouge1': rouge_scores['rouge1'],
                        'rouge2': rouge_scores['rouge2'],
                        'rougeL': rouge_scores['rougeL'],
                        'semantic_similarity': semantic_score,
                        'bert_score': float('nan')
                    }
                })

        # Periodic memory cleanup (idx advances by batch_size, so this fires
        # whenever idx happens to be a multiple of 5)
        if idx % 5 == 0:
            torch.cuda.empty_cache()

    # 4. BERTScore for all (summary, lyric) pairs in a single batched call
    if bert_candidates:
        _, _, f1_scores = score(
            bert_candidates,
            bert_references,
            lang='en',
            verbose=False,
            batch_size=batch_size
        )
        evaluation_results['bert_scores'] = f1_scores.tolist()

    # The examples are the first lyrics processed, so they line up with the
    # first BERTScore values.
    for example, bert_f1 in zip(examples, evaluation_results['bert_scores']):
        example['metrics']['bert_score'] = bert_f1

    def _average(values):
        # An empty test set yields NaN without numpy's "mean of empty slice" warning.
        return float(np.mean(values)) if values else float('nan')

    # Aggregate results
    metrics = {
        'avg_content_coverage': _average(evaluation_results['content_coverage']),
        'avg_consistency': _average(evaluation_results['consistency_score']),
        'avg_semantic_similarity': _average(evaluation_results['semantic_similarity']),
        'avg_rouge1': _average(evaluation_results['rouge1_scores']),
        'avg_rouge2': _average(evaluation_results['rouge2_scores']),
        'avg_rougeL': _average(evaluation_results['rougeL_scores']),
        'avg_bert_score': _average(evaluation_results['bert_scores'])
    }

    return metrics, examples

def calculate_rouge_scores(summaries: List[str]) -> Dict[str, float]:
    """Mean pairwise ROUGE-1 / ROUGE-2 / ROUGE-L F-measure over all pairs of ``summaries``.

    Each summary is scored against every summary that comes after it in the
    list; the F-measures are averaged per ROUGE variant with ``np.mean``.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    pair_scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    collected = {metric: [] for metric in metric_names}

    for position, first in enumerate(summaries):
        for second in summaries[position + 1:]:
            pair_result = pair_scorer.score(first, second)
            for metric in metric_names:
                collected[metric].append(pair_result[metric].fmeasure)

    return {metric: np.mean(values) for metric, values in collected.items()}

def calculate_content_coverage(lyrics: str, summary: str) -> float:
    """Fraction of the distinct lower-cased lyric words that also appear in the summary.

    Words are obtained by whitespace splitting. Returns 0.0 when ``lyrics``
    contains no words (empty or whitespace-only), which previously raised
    ZeroDivisionError.
    """
    lyrics_tokens = set(lyrics.lower().split())
    if not lyrics_tokens:
        return 0.0
    summary_tokens = set(summary.lower().split())
    overlap = len(lyrics_tokens & summary_tokens)
    return overlap / len(lyrics_tokens)

def calculate_semantic_similarity(lyrics: str, summary: str) -> float:
    """Jaccard overlap of the lower-cased, whitespace-split word sets of both texts.

    Returns 0.0 when neither text contains a word.
    """
    lyric_vocab = set(lyrics.lower().split())
    summary_vocab = set(summary.lower().split())

    combined = lyric_vocab | summary_vocab
    if not combined:
        return 0.0

    shared = lyric_vocab & summary_vocab
    return len(shared) / len(combined)

def print_evaluation_results(metrics: Dict[str, float], examples: List[Dict]):
    """Print the averaged metrics followed by every stored example.

    For each example the first 200 characters of the lyrics, the numbered
    generated summaries and the per-example metrics (3 decimals) are shown.
    """
    headline_rows = (
        ("Average Content Coverage", 'avg_content_coverage'),
        ("Average Consistency", 'avg_consistency'),
        ("Average Semantic Similarity", 'avg_semantic_similarity'),
        ("Average ROUGE-1", 'avg_rouge1'),
        ("Average ROUGE-2", 'avg_rouge2'),
        ("Average ROUGE-L", 'avg_rougeL'),
        ("Average BERTScore", 'avg_bert_score'),
    )

    print("\nEvaluation Results:")
    for label, key in headline_rows:
        print(f"{label}: {metrics[key]:.3f}")

    print("\nExample Generations:")
    example_number = 0
    for example in examples:
        example_number += 1
        print(f"\nExample {example_number}:")
        print(f"Original Lyrics (truncated): {example['lyrics'][:200]}...")

        print("\nGenerated Summaries:")
        numbered = enumerate(example['generated_summaries'], 1)
        for position, generated in numbered:
            print(f"{position}. {generated}")

        print("\nMetrics:")
        for name in example['metrics']:
            print(f"{name}: {example['metrics'][name]:.3f}")

In [None]:
# Run evaluation of the reloaded model on the held-out test split
metrics, examples = evaluate_self_supervised_model(
    model,
    tokenizer,
    test_data=test_df,  # Using your held-out test set
    batch_size=16
)

# Print the averaged metrics followed by the stored example generations
print_evaluation_results(metrics, examples)

# If you want to see just the metrics without examples, you can do:
# (this prints the same averaged values again, keyed by their dictionary names)
print("\nJust the Metrics:")
for metric_name, value in metrics.items():
    print(f"{metric_name}: {value:.3f}")

  0%|          | 0/40 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▎         | 1/40 [00:11<07:34, 11.66s/it]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not i


Evaluation Results:
Average Content Coverage: 0.412
Average Consistency: 0.990
Average Semantic Similarity: 0.404
Average ROUGE-1: 0.991
Average ROUGE-2: 0.988
Average ROUGE-L: 0.991
Average BERTScore: 0.858

Example Generations:

Example 1:
Original Lyrics (truncated): (Someone for me) 
(Someone for me) 
I'm here alone on a Friday night 
Waiting here beside the phone 
The TV, radio, and me 
Really ain't been getting along 

I wish that I could find a way 

To party ...

Generated Summaries:
1. I'm here alone on a Friday night Waiting here beside the phone The TV, radio, and me Really ain't been getting along I wish that I could find a way to party to the break of day And there I'd be sure to meet The guy that would be special to me Then momma comes and asks me Why I am dreaming, sitting alone Why not go out and have some fun It's the only way I'
2. I'm here alone on a Friday night Waiting here beside the phone The TV, radio, and me Really ain't been getting along I wish that I could 


