# T5-Small trained on the following data


1.   Genius Annotations
2.   Lyrics
3.   PoemSum Data



# Install Required Dependencies

In [None]:
# Install the dependencies needed for testing the PoemSum model
!pip install pytorch-lightning transformers torch

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.6.0-py3-none-any.whl.metadata (20 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.11.9-py3-none-any.whl.metadata (5.2 kB)
Downloading pytorch_lightning-2.4.0-py3-none-any.whl (815 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.2/815.2 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.9-py3-none-any.whl (28 kB)
Downloading torchmetrics-1.6.0-py3-none-any.whl (926 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m926.4/926.4 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightning-utilities, torchmetrics, pytorch-lightning
Successfully installed lightning-utilities-0.11.9 pytorch-lightning-2.4.0 torchmetrics-1.6.0


In [None]:
# Import needed dependencies while avoiding conflicts
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer,
    AdamW
)
import re
import os
from sklearn.model_selection import train_test_split

In [None]:
import textwrap
def print_summary(text, width=70):
    """Print ``text`` re-wrapped so that no output line exceeds ``width`` characters."""
    wrapper = textwrap.TextWrapper(width=width)
    wrapped = wrapper.fill(text)
    print(wrapped)


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; every data and model path used below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


# Class Modules
This section defines the classes used to prepare the data and to train and validate a text-to-text Transformer model for summarizing song lyrics.

In [None]:
# Custom Dataset class from PoemSum model
class LyricsSummaryDataset(Dataset):
    """
    Custom PyTorch Dataset for handling lyrics and their summaries.

    Rows are tokenized lazily, one at a time, when they are indexed.

    Parameters:
    - data (pd.DataFrame): The dataset containing "text" and "summary" columns.
    - tokenizer (T5Tokenizer): The tokenizer to preprocess text and summary.
    - text_max_token_len (int): Maximum token length for text inputs.
    - summary_max_token_len (int): Maximum token length for summary inputs.
    """
    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: "T5Tokenizer",
        text_max_token_len: int = 1024,
        summary_max_token_len: int = 256
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def __len__(self):
        """Returns the total number of rows in the dataset."""
        return len(self.data)

    def _encode(self, text, max_length: int):
        """Tokenize one string, padded/truncated to exactly ``max_length`` tokens."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

    def __getitem__(self, index: int):
        """
        Retrieves a single data row and preprocesses it for the model.

        Parameters:
        - index (int): The positional index of the row to retrieve.

        Returns:
        - dict: The raw "text" and "summary" strings plus the flattened
          "text_input_ids", "text_attention_mask", "labels" and
          "labels_attention_mask" tensors.
        """
        data_row = self.data.iloc[index]

        text_encoding = self._encode(data_row["text"], self.text_max_token_len)
        summary_encoding = self._encode(data_row["summary"], self.summary_max_token_len)

        # Padding positions in the labels are set to -100 so the loss ignores them.
        # The pad id is read from the tokenizer instead of assuming it is 0, and
        # masked_fill returns a new tensor so the encoding itself is left untouched.
        labels = summary_encoding["input_ids"]
        pad_token_id = self.tokenizer.pad_token_id
        if pad_token_id is not None:
            labels = labels.masked_fill(labels == pad_token_id, -100)

        return dict(
            text=data_row["text"],
            summary=data_row["summary"],
            text_input_ids=text_encoding["input_ids"].flatten(),
            text_attention_mask=text_encoding["attention_mask"].flatten(),
            labels=labels.flatten(),
            labels_attention_mask=summary_encoding["attention_mask"].flatten()
        )

# Lightning Data Module from PoemSum
class LyricsSummaryDataModule(pl.LightningDataModule):
    """
    LightningDataModule that wraps the training and validation DataFrames
    in LyricsSummaryDataset objects and serves them through DataLoaders.

    Parameters:
    - train_df (pd.DataFrame): DataFrame for the training dataset.
    - val_df (pd.DataFrame): DataFrame for the validation dataset.
    - tokenizer (T5Tokenizer): The tokenizer for preprocessing.
    - batch_size (int): Number of samples per batch.
    - text_max_token_len (int): Maximum token length for text inputs.
    - summary_max_token_len (int): Maximum token length for summary inputs.
    """
    def __init__(
        self,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 8,
        text_max_token_len: int = 512,
        summary_max_token_len: int = 256
    ):
        super().__init__()
        self.train_df, self.val_df = train_df, val_df
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def _build_dataset(self, frame):
        """Wrap one DataFrame in a dataset that shares this module's tokenizer and length limits."""
        return LyricsSummaryDataset(
            frame,
            self.tokenizer,
            self.text_max_token_len,
            self.summary_max_token_len
        )

    def _build_loader(self, dataset, shuffle):
        """Create a two-worker DataLoader over ``dataset`` using the configured batch size."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=2
        )

    def setup(self, stage=None):
        """
        Builds the training and validation datasets.

        Parameters:
        - stage (str): Stage of data preparation (ignored).
        """
        self.train_dataset = self._build_dataset(self.train_df)
        self.val_dataset = self._build_dataset(self.val_df)

    def train_dataloader(self):
        """Returns the shuffled DataLoader for the training dataset."""
        return self._build_loader(self.train_dataset, True)

    def val_dataloader(self):
        """Returns the unshuffled DataLoader for the validation dataset."""
        return self._build_loader(self.val_dataset, False)

# Model Class
class LyricsSummaryModel(pl.LightningModule):
    """
    PyTorch Lightning model for fine-tuning T5 for text summarization.

    Parameters:
    - model_name (str): Pretrained model name or path from Hugging Face.
    """
    def __init__(self, model_name='t5-small'):
        super().__init__()
        # Use T5ForConditionalGeneration, which has the required generation capabilities
        self.model = T5ForConditionalGeneration.from_pretrained(model_name, return_dict=True)
        # from_pretrained hands the network back in eval mode, and the training
        # logs of this notebook report every submodule as "eval" during fit.
        # Switch back to train mode explicitly so dropout is active while
        # fine-tuning; Lightning still toggles eval mode around validation.
        self.model.train()

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        """
        Forward pass through the model.

        Parameters:
        - input_ids (torch.Tensor): Tokenized input IDs.
        - attention_mask (torch.Tensor): Attention mask for input.
        - decoder_attention_mask (torch.Tensor): Attention mask for decoder input.
        - labels (torch.Tensor): Tokenized labels (optional).

        Returns:
        - tuple: Loss and logits from the model output. The loss is only
          meaningful when ``labels`` is provided.
        """
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )
        return output.loss, output.logits

    def _batch_loss(self, batch):
        """Run the forward pass on one batch dict and return only the loss."""
        loss, _ = self(
            batch["text_input_ids"],
            batch["text_attention_mask"],
            batch["labels_attention_mask"],
            batch["labels"]
        )
        return loss

    def training_step(self, batch, batch_idx):
        """
        Performs a training step.

        Parameters:
        - batch (dict): Batch of data containing inputs and labels.
        - batch_idx (int): Index of the batch.

        Returns:
        - torch.Tensor: Training loss.
        """
        loss = self._batch_loss(batch)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """
        Performs a validation step.

        Parameters:
        - batch (dict): Batch of data containing inputs and labels.
        - batch_idx (int): Index of the batch.

        Returns:
        - torch.Tensor: Validation loss.
        """
        loss = self._batch_loss(batch)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        """
        Configures the optimizer for the model.

        Returns:
        - torch.optim.Optimizer: AdamW optimizer with a predefined learning rate.
        """
        # torch.optim.AdamW replaces the deprecated transformers.AdamW.
        # eps and weight_decay are pinned to the values the transformers
        # implementation used by default so the optimisation is unchanged.
        return torch.optim.AdamW(
            self.parameters(),
            lr=0.00001,
            eps=1e-6,
            weight_decay=0.0
        )


# Data Preparation

* Updated token lengths to handle combined data better
* More robust data cleaning and filtering
* More explicit handling of multiple data sources
* Added early stopping to training
* Updated save path for combined model
* Better error handling throughout

We don't include the poem test set because we're only interested in using the poem data (train and validation sets) for training the model to learn interpretive summarization patterns. The test set will only contain lyrics data since our end goal is to evaluate how well the model can summarize and interpret song lyrics.
This is different from Model 3 (Lyrics + Poem) where we loaded all poem data initially but then only used train/valid splits. In this combined model, we're being more explicit about:

* Using the poem train/valid data only for training purposes
* Using the lyrics test set for final evaluation
* Keeping our evaluation focused on the model's performance on lyrics, which is our target domain

In [None]:
# Load lyrics data
df_list = []
folder_path = "/content/drive/My Drive/266 Final Project/Cleaned Song Files"
# os.listdir returns entries in arbitrary order. The files are sorted so the
# concatenated row order -- and therefore the seeded train/val/test split made
# from it -- is identical every time the notebook is run.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)
        df_list.append(df)

# Concatenate lyrics data (pd.concat raises ValueError if no CSV file was found)
lyrics_df = pd.concat(df_list, ignore_index=True)

In [None]:
# Load poem data keeping original splits
# Only the PoemSum train and validation CSVs are read here; each split stays in its own DataFrame.
poem_train = pd.read_csv("/content/drive/My Drive/266 Final Project/PoemSum Model/poemsum_train.csv")
poem_valid = pd.read_csv("/content/drive/My Drive/266 Final Project/PoemSum Model/poemsum_valid.csv")

# Basic data cleaning
def clean_text(text):
    """
    Normalise one cell value to a single-line string.

    Missing values (as judged by ``pd.isna``) become an empty string. Anything
    else is converted with ``str``, every run of whitespace is collapsed to one
    space, and leading/trailing whitespace is removed.
    """
    raw = "" if pd.isna(text) else str(text)
    return re.sub(r'\s+', ' ', raw).strip()

In [None]:
# Split lyrics data
# First hold out 20% of the lyrics rows as the test set (fixed seed 42) ...
train_lyrics, test_lyrics = train_test_split(lyrics_df, test_size=0.2, random_state=42)
# ... then take 10% of the remaining rows as the validation set (same seed).
train_lyrics, val_lyrics = train_test_split(train_lyrics, test_size=0.1, random_state=42)

In [None]:
# Clean and filter training data
# Lyrics rows are kept only when both the lyric text and its annotation are present.
train_lyrics_filtered = train_lyrics.dropna(subset=['Lyrics', 'Combined Annotations'])
# Poem rows additionally need a summary ('text') that is not blank once stripped.
poem_train_filtered = (
    poem_train
    .dropna(subset=['ctext', 'text'])
    .loc[lambda frame: frame['text'].str.strip() != '']
)

# Clean and filter validation data (same rules as for the training data)
val_lyrics_filtered = val_lyrics.dropna(subset=['Lyrics', 'Combined Annotations'])
poem_valid_filtered = (
    poem_valid
    .dropna(subset=['ctext', 'text'])
    .loc[lambda frame: frame['text'].str.strip() != '']
)

In [None]:
# Format training data
# Lyrics rows come first, followed by the poem rows; each input gets a
# task prefix naming its source and both columns go through clean_text.
train_data = pd.DataFrame({
    'text': (
        ["summarize lyrics and capture meaning: " + clean_text(lyric)
         for lyric in train_lyrics_filtered['Lyrics']]
        + ["summarize poem and capture meaning: " + clean_text(poem)
           for poem in poem_train_filtered['ctext']]
    ),
    'summary': (
        list(map(clean_text, train_lyrics_filtered['Combined Annotations']))
        + list(map(clean_text, poem_train_filtered['text']))
    )
})

In [None]:
# Format validation data
# Same layout as the training frame: lyrics rows first, then poem rows.
val_data = pd.DataFrame({
    'text': (
        ["summarize lyrics and capture meaning: " + clean_text(lyric)
         for lyric in val_lyrics_filtered['Lyrics']]
        + ["summarize poem and capture meaning: " + clean_text(poem)
           for poem in poem_valid_filtered['ctext']]
    ),
    'summary': (
        list(map(clean_text, val_lyrics_filtered['Combined Annotations']))
        + list(map(clean_text, poem_valid_filtered['text']))
    )
})

# Ensure data types are strings
for frame in (train_data, val_data):
    for column in ('text', 'summary'):
        frame[column] = frame[column].astype(str)

### Old data preparation code

In [None]:
# Train on all artists

# Initialize an empty list to store DataFrames
df_list = []

folder_path = "/content/drive/My Drive/266 Final Project/Song Files"
# Iterate through each file in the directory. os.listdir gives no ordering
# guarantee, so the names are sorted to keep the concatenated row order (and
# the seeded split made from it later) the same on every run.
for filename in sorted(os.listdir(folder_path)):
    # Check if the file is a CSV file
    if filename.endswith('.csv'):
        # Construct the full file path
        file_path = os.path.join(folder_path, filename)
        # Read the CSV file and append it to the list
        df = pd.read_csv(file_path)
        df_list.append(df)

# Concatenate all DataFrames in the list into a single DataFrame
# (pd.concat raises ValueError if the folder contained no CSV files)
df = pd.concat(df_list, ignore_index=True)

In [None]:
# # be wary of extent of cleaning on text

# def clean_text(text):
#     # Replace multiple spaces with a single space
#     text = re.sub(r'\s+', ' ', text)
#     # Remove incomplete sentences
#     text = re.sub(r'(\s*—\s*)', '', text)
#     text = re.sub(r'\s+[,\.]', '', text)
#     text = re.sub(r'[""]', '"', text)
#     text = re.sub(r'[^\w\s.,!?]', '', text)
#     text = re.sub(r'\s{2,}', ' ', text)
#     text = re.sub(r'\s{2,}', '[missing]', text)
#     return text.strip()


# print("Original number of songs", len(df))

# # If there are no wiki annotations let's drop them
# df = df[df['Wikipedia Annotation'] != "No Wikipedia annotation found (artist name not mentioned)"]

# print("After dropping no annotations", len(df))

# # Let's clean up the text a bit
# df['Lyrics'] = df['Lyrics'].apply(clean_text)
# df['generated_annotation'] = df['generated_annotation'].apply(clean_text)

# # Calculate average string length of the column
# # average_length = df['Wikipedia Annotation'].str.len().mean()
# # min_length = df['Wikipedia Annotation'].str.len().min()
# # max_length = df['Wikipedia Annotation'].str.len().max()

# # Display the result
# # print("Average string length of wiki:", average_length)
# # print("Min string length of wiki:", min_length)
# # print("Max string length of wiki:", max_length)

# # Keep only the columns we need
# # df = df[['Title', 'Lyrics', 'Wikipedia Annotation', 'Combined Annotations']]

# print(df.head())

In [None]:
# Character-length statistics of the lyrics column (length computed once, reused three times)
lyrics_lengths = df['Lyrics'].str.len()
average_length, min_length, max_length = (
    lyrics_lengths.mean(),
    lyrics_lengths.min(),
    lyrics_lengths.max(),
)

# Report the three statistics
print("Average string length of lyrics:", average_length)
print("Min string length of lyrics:", min_length)
print("Max string length of lyrics:", max_length)

Average string length of lyrics: 1893.4503688799464
Min string length of lyrics: 12
Max string length of lyrics: 6423


In [None]:
# Read the three PoemSum splits from Drive
poem_test = pd.read_csv("/content/drive/My Drive/266 Final Project/PoemSum Model/poemsum_test.csv")
poem_train = pd.read_csv("/content/drive/My Drive/266 Final Project/PoemSum Model/poemsum_train.csv")
poem_valid = pd.read_csv("/content/drive/My Drive/266 Final Project/PoemSum Model/poemsum_valid.csv")

# Stack them (test, train, valid order) into one DataFrame with a fresh index
poem_list = [poem_test, poem_train, poem_valid]
poem_data = pd.concat(poem_list, ignore_index=True)

In [None]:
# Do this before calling create_baseline_model
# Hold out 20% of the lyrics rows in `df` (fixed seed 42); the remaining 80% is train_val_df.
train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
#do it for poetry data
# Same 80/20 split with the same seed, applied to the concatenated poem data.
train_val_poem, test_poem = train_test_split(poem_data, test_size=0.2, random_state=42)

# Create and Train t5 model

In [None]:
def create_baseline_model(
    train_data,
    val_data,
    save_dir="checkpoints",
    drive_path='/content/drive/My Drive/266 Final Project/Our Models/Combined Data'
):
    """
    Create and train the T5-small model with combined data sources.

    Parameters:
    - train_data (pd.DataFrame): Training rows with "text" and "summary" columns.
    - val_data (pd.DataFrame): Validation rows with the same two columns.
    - save_dir (str): Root directory handed to the Lightning Trainer.
    - drive_path (str): Directory the fine-tuned model and tokenizer are
      saved to once training finishes.

    Returns:
    - tuple: (model, tokenizer, trainer). These are returned even when
      saving fails, so a finished training run is never lost to a save error.
    """
    print("Initializing model and tokenizer...")
    MODEL_NAME = 't5-small'
    tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
    model = LyricsSummaryModel(MODEL_NAME)

    print("Setting up data module...")
    data_module = LyricsSummaryDataModule(
        train_df=train_data,
        val_df=val_data,
        tokenizer=tokenizer,
        batch_size=4,
        text_max_token_len=1024,
        summary_max_token_len=256
    )

    print("Configuring trainer...")
    trainer = pl.Trainer(
        max_epochs=3,
        accumulate_grad_batches=4,
        gradient_clip_val=1.0,
        # Spelled-out precision names; the bare integer 16 triggers a
        # deprecation warning in pytorch-lightning 2.x.
        precision="16-mixed" if torch.cuda.is_available() else "32-true",
        enable_checkpointing=True,
        default_root_dir=save_dir,
        callbacks=[
            # NOTE(review): with max_epochs=3 and patience=3 this callback can
            # never stop training early -- confirm which value is intended.
            pl.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=3,
                mode='min'
            )
        ]
    )

    print("Starting training...")
    trainer.fit(model, data_module)

    print("Saving model and tokenizer...")
    try:
        # Directory creation is inside the try so that a Drive problem is
        # reported like any other save failure instead of aborting the call.
        os.makedirs(drive_path, exist_ok=True)
        model.model.save_pretrained(drive_path)
        tokenizer.save_pretrained(drive_path)
        print(f"Model and tokenizer successfully saved to {drive_path}")
    except Exception as e:
        # Deliberately best-effort: the trained objects are still returned.
        print(f"Failed to save model and tokenizer: {e}")

    return model, tokenizer, trainer

In [None]:
# Train on the combined lyrics + poem frames and keep the returned model, tokenizer and trainer.
model, tokenizer, trainer = create_baseline_model(train_data, val_data)

Initializing model and tokenizer...


/usr/local/lib/python3.10/dist-packages/lightning_fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params | Mode
------------------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M | eval
------------------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estim

Setting up data module...
Configuring trainer...
Starting training...


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


Saving model and tokenizer...
Model and tokenizer successfully saved to /content/drive/My Drive/266 Final Project/Our Models/Combined Data


## (SKIP) Old approach to baseline model (before data pre-processing changes)

In [None]:
def create_baseline_model(df, poem_df, save_dir="checkpoints"):
    """
    Older training path: build the dataset from raw lyrics and poem frames,
    fine-tune T5-small for one epoch and save the result to Drive.

    Parameters:
    - df (pd.DataFrame): Lyrics rows with "Lyrics" and "Combined Annotations" columns.
    - poem_df (pd.DataFrame): Poem rows with "ctext" (poem) and "text" (summary) columns.
    - save_dir (str): Root directory handed to the Lightning Trainer.

    Returns:
    - tuple: (model, tokenizer, trainer).

    Raises:
    - ValueError: If no usable (text, summary) pair remains after cleaning.
    """
    # 1. Initialize model and tokenizer
    print("Initializing model and tokenizer...")
    MODEL_NAME = 't5-small'
    tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
    model = LyricsSummaryModel(MODEL_NAME)

    # 2. Prepare song lyrics data
    print("Preparing data...")

    # Missing values are replaced BEFORE the string cast; casting first would
    # turn NaN into the literal text "nan". New Series are built from the
    # function's own arguments, so the caller's frames are not modified.
    lyrics = df['Lyrics'].fillna('').astype(str)
    annotations = df['Combined Annotations'].fillna('').astype(str)
    # Empty rows are dropped here, before the fixed prefixes are added; once
    # prefixed, every string is non-empty and a length filter cannot see them.
    usable = (lyrics.str.strip() != '') & (annotations.str.strip() != '')

    lyrics_data = pd.DataFrame({
        'text': lyrics[usable].map(
            lambda x: f"summarize lyrics and capture meaning: {x}"
        ),
        # Only the first 100 whitespace-separated words of the annotation are kept
        'summary': annotations[usable].map(
            lambda x: f"Meaning and themes: {' '.join(x.split()[:100])}"
        )
    })

    # 3. Prepare poem data
    poems = poem_df['ctext'].fillna('').astype(str)
    poem_summaries = poem_df['text'].fillna('').astype(str)
    usable = (poems.str.strip() != '') & (poem_summaries.str.strip() != '')

    poem_data = pd.DataFrame({
        'text': poems[usable].map(
            lambda x: f"summarize poem and capture meaning: {x}"
        ),
        'summary': poem_summaries[usable].map(
            lambda x: f"Meaning and themes: {x}"
        )
    })

    # 4. Combine lyrics and poem data
    combined_data = pd.concat([lyrics_data, poem_data], ignore_index=True)
    if combined_data.empty:
        raise ValueError("No usable (text, summary) pairs found in the input data")

    # Shuffle the combined dataset
    combined_data = combined_data.sample(frac=1, random_state=42).reset_index(drop=True)

    # 5. Split data (first 80% of the shuffled rows for training, rest for validation)
    train_size = int(0.8 * len(combined_data))
    train_data = combined_data[:train_size]
    val_data = combined_data[train_size:]

    # Print dataset statistics
    print(f"\nDataset Statistics:")
    print(f"Total samples: {len(combined_data)}")
    print(f"Training samples: {len(train_data)}")
    print(f"Validation samples: {len(val_data)}")

    # 6. Set up data module
    data_module = LyricsSummaryDataModule(
        train_df=train_data,
        val_df=val_data,
        tokenizer=tokenizer,
        batch_size=2
    )

    # 7. Set up trainer
    trainer = pl.Trainer(
        max_epochs=1,
        accumulate_grad_batches=2,
        gradient_clip_val=1.0,
        # Spelled-out precision names; the bare integer 16 is deprecated in
        # pytorch-lightning 2.x.
        precision="16-mixed" if torch.cuda.is_available() else "32-true",
        enable_checkpointing=True,
        default_root_dir=save_dir
    )

    # 8. Train model
    print("Starting training...")
    trainer.fit(model, data_module)

    # 9. Save model and tokenizer
    print("Saving model and tokenizer...")
    drive_path = '/content/drive/MyDrive/266 Final Project/Our Models/All Data'
    os.makedirs(drive_path, exist_ok=True)

    try:
        model.model.save_pretrained(drive_path)
        tokenizer.save_pretrained(drive_path)
        print(f"Model and tokenizer successfully saved to {drive_path}")
    except Exception as e:
        # Best-effort save: the trained objects are still returned to the caller.
        print(f"Failed to save model and tokenizer: {e}")

    return model, tokenizer, trainer


In [None]:
# Run the older training path on the 80% lyrics and poem splits; this rebinds model, tokenizer and trainer.
model, tokenizer, trainer = create_baseline_model(train_val_df, train_val_poem)

Initializing model and tokenizer...
Preparing data...

Dataset Statistics:
Total samples: 4793
Training samples: 3834
Validation samples: 959


INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Starting training...


INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params | Mode
------------------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M | eval
------------------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)
0         Modules in train mode
277       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
INFO:pytorch_lightning.utilities.rank_zero:
Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [None]:
# Confirm the class of the network stored on the `.model` attribute.
print(type(model.model))
# Output should be <class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>


<class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>


# (SKIP) Generate Song Summary

In [None]:
def generate_song_summary(model, tokenizer, data, song_index, max_length=150, input_max_length=1024):
    """
    Generate a summary for a single song.

    Parameters:
    - model: An object exposing ``generate`` (e.g. T5ForConditionalGeneration),
      or a wrapper that holds such an object on its ``.model`` attribute.
    - tokenizer: Tokenizer matching the model.
    - data (pd.DataFrame): Frame with a "Lyrics" column.
    - song_index (int): Positional index of the song in ``data``.
    - max_length (int): Maximum number of tokens in the generated summary.
    - input_max_length (int): Lyrics are truncated to this many input tokens;
      the default matches the input length used for training in this notebook.

    Returns:
    - str: The decoded summary with special tokens removed.
    """
    # Accept either the raw generation model or the Lightning wrapper around it.
    generator = model if hasattr(model, "generate") else model.model

    # Same task prefix as the training inputs
    input_text = f"summarize lyrics and capture meaning: {data.iloc[song_index]['Lyrics']}"

    # Encode a single example: truncate long lyrics, but do not pad. Padding one
    # sequence to a fixed length only adds pad tokens the model then has to process.
    encoded = tokenizer(
        input_text,
        max_length=input_max_length,
        truncation=True,
        return_tensors="pt"
    )

    # Inputs must live on the same device as the model weights.
    device = next(generator.parameters()).device
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # Generate summary
    summary_ids = generator.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        # The minimum may never exceed the maximum the caller asked for.
        min_length=min(100, max_length),
        num_beams=5,
        #temperature=0.9,
        length_penalty=0.5,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    # Decode summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

- Version with lyrics only
- How much does it improve when we add Wiki data?
- Try experimenting with .generate parameters
- Three approaches
  - Lyrics
  - Lyrics + Wiki

- Goal involves summarization and commentary
  - need label data to reflect goal
  - challenge is data formatting in available label data
  - does using Wiki improve output
  - real data is slightly different from our goal
  - the model will train on this data, so won't learn our exact goal
  - framing challenge


In [None]:
# Usage example:
# (the bare string below is an expression statement with no effect; it only documents the call form)
"""
generate_song_summary(model, tokenizer, df, song_index=0)
"""

# Summarize the song at positional index 1 of df
summary = generate_song_summary(model, tokenizer, df, song_index=1)

# Print the song title followed by the wrapped generated summary
print(df.iloc[1]['Title'])
print_summary(summary)


Bahamas Promises
Meaning and themes: “Dods, Man Yeah, For All The Dogs” is a song that
focuses on the theme theme of the song. The song was titled “Fast
Pinky promises” by the singer-songwriter. It was released on November
1, 2015, and has been re-released on October 1, 2015. The track is now
available on iTunes and is available for download in the iTunes Store.
This is the first track to be released.


In [None]:
# Print the reference annotation for the same song (index 1) to compare with the generated summary.
print_summary(df.iloc[1]['Combined Annotations'])


This song unfolds as a poignant reflection on a troubled relationship.
The song’s title immediately conjures images of unfulfilled dreams and
broken commitments. As the track progresses, it becomes evident that
Drake, is addressing a woman named Hailey.  The song’s opening lines,
“Hailey, it’s sad that I know all the tea,” set a tone of
disappointment and disillusionment. Drake references “broken pinky
promises” and recounts how a trip to the Bahamas was marred by
Hailey’s actions. There’s a palpable sense of betrayal and a
realization that the relationship isn’t working.  Throughout the song,
the theme of disappointment and broken trust resurfaces. Drake
expresses weariness with Hailey’s apologies, indicating that he’s
reached a breaking point. The wordplay involving “No” in monogamy
suggests that the relationship has been marked by infidelity or a lack
of commitment.  As the song unfolds, the emotional weight becomes more
pronounced. Drake laments that Hailey lives in his mind, rent-

# Evaluation

In [None]:
# Check if model path exists and contents
import os

# Defined in this cell so the check does not depend on a later cell having been
# run first (otherwise it fails with NameError or reports on a stale path).
model_path = '/content/drive/MyDrive/266 Final Project/Our Models/Combined Data (T5 All Data)'
print(f"Model path exists: {os.path.exists(model_path)}")
print("Contents:", os.listdir('/content/drive/My Drive/266 Final Project/Our Models/'))

# Check test_lyrics structure
print("\nTest data info:")
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
test_lyrics.info()

# Clean test_lyrics if needed
# .copy() gives an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning.
test_lyrics = test_lyrics.dropna(subset=['Lyrics', 'Combined Annotations']).copy()
test_lyrics['Lyrics'] = test_lyrics['Lyrics'].astype(str)
test_lyrics['Combined Annotations'] = test_lyrics['Combined Annotations'].astype(str)

Model path exists: False
Contents: ['lyrics_model_full', 'Lyrics + Genius', 'BART_All_Data', 'Lyrics + Poem Data', '.ipynb_checkpoints', 'Pegasus [OLD]', 'Combined Data (T5 All Data)', 'Archive', 'Pegasus', 'T5 Fine Tuned Model', 'BART Fine Tuned']

Test data info:
<class 'pandas.core.frame.DataFrame'>
Index: 638 entries, 1029 to 1356
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Song ID               635 non-null    object
 1   Title                 629 non-null    object
 2   Lyrics URL            626 non-null    object
 3   Combined Annotations  629 non-null    object
 4   Wikipedia Annotation  629 non-null    object
 5   Lyrics                629 non-null    object
 6   generated_annotation  3 non-null      object
dtypes: object(7)
memory usage: 39.9+ KB
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_lyrics['Lyrics'] = test_lyrics['Lyrics'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_lyrics['Combined Annotations'] = test_lyrics['Combined Annotations'].astype(str)


In [None]:
# Load saved model
# NOTE: this import rebinds the name T5Tokenizer to the non-fast tokenizer class;
# in the import cell at the top of the notebook the same name is an alias for T5TokenizerFast.
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Directory holding the fine-tuned weights and tokenizer files
model_path = '/content/drive/MyDrive/266 Final Project/Our Models/Combined Data (T5 All Data)'
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

print("Model and tokenizer loaded successfully!")

# We already have our test_lyrics from data preparation
print(f"Test set size: {len(test_lyrics)}")
print(test_lyrics.columns.tolist())

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Model and tokenizer loaded successfully!
Test set size: 629
['Song ID', 'Title', 'Lyrics URL', 'Combined Annotations', 'Wikipedia Annotation', 'Lyrics', 'generated_annotation']


In [None]:
# Reload the model and tokenizer, printing diagnostics if loading fails.
try:
    print("Loading model...")
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    print("Loading tokenizer...")
    tokenizer = T5Tokenizer.from_pretrained(model_path)
    print("Model and tokenizer loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")
    # Check contents of the model directory. The listing is guarded so that a
    # missing directory does not raise a second error that hides the first.
    print("\nModel directory contents:")
    if os.path.isdir(model_path):
        print(os.listdir(model_path))
    else:
        print(f"(not a directory: {model_path})")
    # Re-raise so later cells do not silently continue with a stale or
    # undefined model/tokenizer.
    raise

Loading model...
Loading tokenizer...
Model and tokenizer loaded successfully!


In [None]:
# Install required package
!pip install bert-score
from bert_score import score
import torch
from sklearn.model_selection import train_test_split
!pip install rouge-score # rouge-score is the correct package name, not rouge_score.

import torch
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple
from transformers import T5Tokenizer
from rouge_score import rouge_scorer
from bert_score import score
from tqdm import tqdm

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=2581f86b426b28e173e2cbd5a57e8476d1bd3e35b0013b154fac9cc7d4328395
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
def evaluate_combined_model(
    model: T5ForConditionalGeneration,
    tokenizer: T5Tokenizer,
    test_data: pd.DataFrame,
    batch_size: int = 16
) -> Tuple[Dict[str, float], List[Dict]]:
    """Generate a summary for every row of ``test_data`` and score it.

    Each lyric is prefixed with the prompt
    ``"summarize lyrics and capture meaning: "``, summarized with
    ``model.generate`` and scored by ``calculate_metrics`` against the lyric
    itself and the row's reference annotation.

    Args:
        model: Model used for generation; it is switched to eval mode and
            moved to CUDA when available, otherwise CPU.
        tokenizer: Tokenizer used to encode the prompts and decode the output.
        test_data: DataFrame with ``'Lyrics'`` and ``'Combined Annotations'``
            columns.
        batch_size: Number of rows generated per ``model.generate`` call.

    Returns:
        A ``(final_metrics, examples)`` tuple. ``final_metrics`` maps
        ``'avg_<metric>'`` to the mean of that metric over all rows (NaN when
        ``test_data`` has no rows). ``examples`` holds the first five rows as
        dicts with ``'lyrics'``, ``'generated_summary'`` and ``'metrics'``.

    Note:
        Generation runs with ``do_sample=True``, so summaries and scores are
        not guaranteed to be identical across runs unless the RNG is seeded.
    """
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    metric_names = (
        'content_coverage',
        'semantic_similarity',
        'rouge1',
        'rouge2',
        'rougeL',
        'bert_score',
    )
    evaluation_results = {name: [] for name in metric_names}
    examples = []

    batch_starts = range(0, len(test_data), batch_size)
    for batch_number, start in enumerate(tqdm(batch_starts)):
        batch = test_data.iloc[start:start + batch_size]
        batch_lyrics = batch['Lyrics'].tolist()
        batch_annotations = batch['Combined Annotations'].tolist()

        inputs = tokenizer(
            [f"summarize lyrics and capture meaning: {lyric}" for lyric in batch_lyrics],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=150,
                min_length=50,
                num_beams=4,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                no_repeat_ngram_size=3,
                length_penalty=1.0,
                repetition_penalty=1.2
            )

        generated_summaries = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        for original_lyric, generated_summary, reference_annotation in zip(
            batch_lyrics, generated_summaries, batch_annotations
        ):
            metrics = calculate_metrics(
                original_lyric,
                generated_summary,
                reference_annotation
            )

            for key, value in metrics.items():
                evaluation_results[key].append(value)

            # Keep only the first five rows as qualitative examples.
            if len(examples) < 5:
                examples.append({
                    'lyrics': original_lyric,
                    'generated_summary': generated_summary,
                    'metrics': metrics
                })

        # Release cached GPU memory every fifth batch. Counting batches (not
        # row offsets) keeps the cadence independent of batch_size.
        if device.type == 'cuda' and batch_number % 5 == 0:
            torch.cuda.empty_cache()

    # An empty test set yields NaN explicitly instead of a NumPy warning.
    final_metrics = {
        f'avg_{name}': float(np.mean(values)) if values else float('nan')
        for name, values in evaluation_results.items()
    }

    return final_metrics, examples

def _bert_scorer():
    """Return a shared English BERTScorer, building it on first use."""
    # Imported here so the cell needs no extra top-level import.
    from bert_score import BERTScorer

    scorer = getattr(_bert_scorer, "_instance", None)
    if scorer is None:
        # bert_score.score() reloads its model on every call; a cached scorer
        # loads it once and reuses it for every example.
        scorer = BERTScorer(lang='en')
        _bert_scorer._instance = scorer
    return scorer


def calculate_metrics(lyrics: str, summary: str, annotation: str) -> Dict[str, float]:
    """Score one generated summary against its lyric and reference annotation.

    Args:
        lyrics: Original lyric text, used for the two token-overlap metrics.
        summary: Generated summary.
        annotation: Reference annotation, used for ROUGE and BERTScore.

    Returns:
        Dict with ``'content_coverage'``, ``'semantic_similarity'``,
        ``'rouge1'``, ``'rouge2'``, ``'rougeL'`` (ROUGE F-measures) and
        ``'bert_score'`` (BERTScore F1).
    """
    rouge_names = ('rouge1', 'rouge2', 'rougeL')
    rouge_scorer_obj = rouge_scorer.RougeScorer(list(rouge_names), use_stemmer=True)
    try:
        # RougeScorer.score takes (target, prediction): the annotation is the
        # reference and the summary is the prediction.
        rouge_scores = rouge_scorer_obj.score(annotation, summary)
        rouge_f = {name: rouge_scores[name].fmeasure for name in rouge_names}
    except KeyError as e:
        print(f"Error calculating ROUGE scores: {e}")
        # Fall back to plain floats; the previous fallback stored floats and
        # then read `.fmeasure` from them, which raised AttributeError.
        rouge_f = dict.fromkeys(rouge_names, 0.0)

    # BERTScore
    P, R, F1 = _bert_scorer().score([summary], [annotation], verbose=False)

    return {
        'content_coverage': calculate_content_coverage(lyrics, summary),
        'semantic_similarity': calculate_semantic_similarity(lyrics, summary),
        'rouge1': rouge_f['rouge1'],
        'rouge2': rouge_f['rouge2'],
        'rougeL': rouge_f['rougeL'],
        'bert_score': F1.mean().item()
    }

def calculate_content_coverage(lyrics: str, summary: str) -> float:
    """Return the share of distinct lyric words that also occur in the summary.

    Both texts are lower-cased and split on whitespace. The result is 0.0 when
    either argument is a float (e.g. NaN), when the lyric has no words, or when
    processing fails.
    """
    # Float inputs (including NaN) carry no text to compare.
    if any(isinstance(text, float) for text in (lyrics, summary)):
        return 0.0

    try:
        lyric_vocab = set(str(lyrics).lower().split())
        summary_vocab = set(str(summary).lower().split())
        if not lyric_vocab:
            return 0.0
        return len(lyric_vocab & summary_vocab) / len(lyric_vocab)
    except Exception as e:
        print(f"Error processing lyrics/summary: {e}")
        return 0.0

def calculate_semantic_similarity(lyrics: str, summary: str) -> float:
    """Return the word-set overlap of lyric and summary (intersection / union).

    Both texts are lower-cased and split on whitespace. The result is 0.0 when
    either argument is a float (e.g. NaN), when neither text has any words, or
    when processing fails.
    """
    # Float inputs (including NaN) carry no text to compare.
    if any(isinstance(text, float) for text in (lyrics, summary)):
        return 0.0

    try:
        lyric_vocab = set(str(lyrics).lower().split())
        summary_vocab = set(str(summary).lower().split())
        shared = len(lyric_vocab & summary_vocab)
        combined = len(lyric_vocab | summary_vocab)
        if combined > 0:
            return shared / combined
        return 0.0
    except Exception as e:
        print(f"Error processing lyrics/summary: {e}")
        return 0.0

def print_evaluation_results(metrics: Dict[str, float], examples: List[Dict]):
    """Print the aggregate metrics followed by every stored example.

    Args:
        metrics: Mapping of metric name to value; each is printed with three
            decimal places.
        examples: Dicts with ``'lyrics'``, ``'generated_summary'`` and
            ``'metrics'`` keys; only the first 200 lyric characters are shown.
    """
    def _print_scores(scores):
        # One "name: value" line per entry.
        for metric, value in scores.items():
            print(f"{metric}: {value:.3f}")

    print("\nEvaluation Results:")
    _print_scores(metrics)

    print("\nExample Generations:")
    for position, example in enumerate(examples, start=1):
        lyrics_preview = example['lyrics'][:200]
        print(f"\nExample {position}:")
        print(f"Original Lyrics (truncated): {lyrics_preview}...")
        print(f"\nGenerated Summary: {example['generated_summary']}")
        print("\nMetrics:")
        _print_scores(example['metrics'])


In [None]:
# Preview the first five rows of the test set.
test_lyrics.head(n=5)

Unnamed: 0,Song ID,Title,Lyrics URL,Combined Annotations,Wikipedia Annotation,Lyrics,generated_annotation
1029,328891,Someone for Me,https://genius.com/Whitney-houston-someone-for...,“Someone for Me” is the third track from Whitn...,No Wikipedia annotation found (artist name not...,(Someone for me) \n(Someone for me) \nI'm here...,
1001,141615,Saving All My Love for You,https://genius.com/Whitney-houston-saving-all-...,“Saving All My Love for You” is a song written...,"""Saving All My Love for You"" is a song written...",A few stolen moments is all that we share\r\nY...,
785,8827806,What It Is (Block Boy),https://genius.com/Doechii-what-it-is-block-bo...,?,What It Is Block Boy is a song by American rap...,"What it is, ho? Whats up? Every good girl need...",
411,3037193,I’m Good (Blue),https://genius.com/David-guetta-and-bebe-rexha...,“I’m Good (Blue)” is a song by David Guetta an...,Im Good Blue is a song by French DJ and produc...,"Im good, yeah, Im feelin alright Baby, Ima hav...",
1105,1342410,The Way You Are,https://genius.com/Tears-for-fears-the-way-you...,“The Way You Are” was the first Tears for Fear...,"The Way You Are may refer to:\n\n""The Way You ...","Going far, getting nowhere\r\nGoing far, the w...",


In [None]:
# Count the distinct lyric URLs in the test set. Only a cell's last expression
# is displayed automatically, so this value must be printed to be seen.
print(test_lyrics['Lyrics URL'].nunique())

# Show every distinct test URL.
test_lyrics['Lyrics URL'].unique()


array(['https://genius.com/Whitney-houston-someone-for-me-lyrics',
       'https://genius.com/Whitney-houston-saving-all-my-love-for-you-lyrics',
       'https://genius.com/Doechii-what-it-is-block-boy-lyrics',
       'https://genius.com/David-guetta-and-bebe-rexha-im-good-blue-lyrics',
       'https://genius.com/Tears-for-fears-the-way-you-are-lyrics',
       'https://genius.com/Shakira-que-me-quedes-tu-lyrics',
       'https://genius.com/Shakira-dare-la-la-la-lyrics',
       'https://genius.com/Samia-amelia-lyrics',
       'https://genius.com/Travis-scott-90210-lyrics',
       'https://genius.com/Hippo-campus-everything-at-once-lyrics',
       'https://genius.com/Green-day-basket-case-4-track-demo-lyrics',
       'https://genius.com/The-beach-boys-god-only-knows-master-track-mix-with-a-cappella-tag-lyrics',
       'https://genius.com/David-guetta-little-bad-girl-lyrics',
       'https://genius.com/Maroon-5-and-megan-thee-stallion-beautiful-mistakes-lyrics',
       'https://genius.com

In [None]:

# Look up the test-set row (including its Song ID) whose Genius lyrics URL is
# Whitney Houston's "Saving All My Love for You".
test_lyrics[test_lyrics['Lyrics URL'] == 'https://genius.com/Whitney-houston-saving-all-my-love-for-you-lyrics']



Unnamed: 0,Song ID,Title,Lyrics URL,Combined Annotations,Wikipedia Annotation,Lyrics,generated_annotation
1001,141615,Saving All My Love for You,https://genius.com/Whitney-houston-saving-all-...,“Saving All My Love for You” is a song written...,"""Saving All My Love for You"" is a song written...",A few stolen moments is all that we share\r\nY...,


In [None]:
# Run the evaluation on a single song from the test DataFrame.
# (A discarded lookup for a different Song ID used to sit here; it had no effect.)
example_song_id = 96432
example_rows = test_lyrics[test_lyrics['Song ID'] == example_song_id]
metrics, examples = evaluate_combined_model(model, tokenizer, example_rows)
print_evaluation_results(metrics, examples)

  0%|          | 0/1 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1/1 [00:03<00:00,  3.86s/it]


Evaluation Results:
avg_content_coverage: 0.078
avg_semantic_similarity: 0.072
avg_rouge1: 0.242
avg_rouge2: 0.039
avg_rougeL: 0.166
avg_bert_score: 0.832

Example Generations:

Example 1:
Original Lyrics (truncated): Took a deep breath in the mirror He didnt like it when I wore high heels But I do Turn the lock and put my headphones on He always said he didnt get this song But I do, I do Walked in expecting youd b...

Generated Summary: 'Standard' is a song about a girl's life in a cafe. It's the first time the song has been released. The song is based on a stanzas of the song.

Metrics:
content_coverage: 0.078
semantic_similarity: 0.072
rouge1: 0.242
rouge2: 0.039
rougeL: 0.166
bert_score: 0.832





In [None]:
# Score the model on the whole test_lyrics DataFrame and print the report.
metrics, examples = evaluate_combined_model(
    model=model,
    tokenizer=tokenizer,
    test_data=test_lyrics,
)
print_evaluation_results(metrics, examples)

  0%|          | 0/40 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at robert


Evaluation Results:
avg_content_coverage: 0.129
avg_semantic_similarity: 0.106
avg_rouge1: 0.215
avg_rouge2: 0.047
avg_rougeL: 0.141
avg_bert_score: 0.833

Example Generations:

Example 1:
Original Lyrics (truncated): (Someone for me) 
(Someone for me) 
I'm here alone on a Friday night 
Waiting here beside the phone 
The TV, radio, and me 
Really ain't been getting along 

I wish that I could find a way 

To party ...

Generated Summary: 'Songs' is one of the most popular songs of the year. The song is based on the lyrics of the song. It is a song about a young woman who loves to dance with a man who is not a fan of music, but a person who has a passion for music.

Metrics:
content_coverage: 0.102
semantic_similarity: 0.086
rouge1: 0.159
rouge2: 0.000
rougeL: 0.091
bert_score: 0.836

Example 2:
Original Lyrics (truncated): A few stolen moments is all that we share
You've got your family and they need you there. 
Though I try to resist being last on your list. 
But no other man's gonn




# Old way


In [None]:
import torch
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple
from transformers import T5Tokenizer
from rouge_score import rouge_scorer
from bert_score import score
from tqdm import tqdm

def evaluate_self_supervised_model(
    model: LyricsSummaryModel,
    tokenizer: T5Tokenizer,
    test_data: pd.DataFrame,
    batch_size: int = 8
) -> Tuple[Dict[str, float], List[Dict]]:
    """
    Evaluate self-supervised lyrics model with the following metrics:
    1. Consistency of outputs
    2. Coverage of key lyrics content
    3. Semantic similarity within lyrics context
    4. BERTScore for semantic evaluation

    Three summaries are generated per lyric. Consistency compares the three
    with each other; the other metrics compare the first summary with the
    lyric itself.

    Args:
        model: Wrapper whose ``.model`` attribute exposes ``generate``; it is
            switched to eval mode and moved to CUDA when available.
        tokenizer: Tokenizer used to encode the prompts and decode the output.
        test_data: DataFrame with a ``'Lyrics'`` column.
        batch_size: Number of lyrics per generation call.

    Returns:
        A ``(metrics, examples)`` tuple. ``metrics`` holds the mean of each
        score (NaN when ``test_data`` has no rows); ``examples`` holds the
        first five lyrics with their summaries and per-lyric scores.
    """
    # Imported here so the cell needs no extra top-level import.
    from bert_score import BERTScorer

    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # bert_score.score() reloads the scoring model on every call; build it
    # once and reuse it for every lyric.
    bert_scorer = BERTScorer(
        model_type="microsoft/deberta-xlarge-mnli",
        device=device
    )

    evaluation_results = {
        'content_coverage': [],
        'consistency_score': [],
        'semantic_similarity': [],
        'bert_score': []
    }

    examples = []

    for idx in tqdm(range(0, len(test_data), batch_size)):
        batch_lyrics = test_data['Lyrics'].iloc[idx:idx + batch_size].tolist()

        # The prompts are the same for every generation pass, so encode once.
        inputs = tokenizer(
            [f"summarize lyrics: {lyric}" for lyric in batch_lyrics],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(device)

        # Generate multiple summaries for each lyric to test consistency
        summaries_per_lyric = []
        with torch.no_grad():
            for _ in range(3):  # Generate 3 summaries per lyric
                outputs = model.model.generate(
                    input_ids=inputs['input_ids'],
                    attention_mask=inputs['attention_mask'],
                    max_length=100,
                    min_length=30,
                    num_beams=4,
                    do_sample=True,
                    temperature=0.3,
                    top_k=50,
                    no_repeat_ngram_size=3,
                    length_penalty=0.8,
                    repetition_penalty=1.5
                )
                summaries_per_lyric.append(
                    tokenizer.batch_decode(outputs, skip_special_tokens=True)
                )

        # Evaluate each lyric's summaries
        for lyric_idx, original_lyric in enumerate(batch_lyrics):
            lyric_summaries = [summaries[lyric_idx] for summaries in summaries_per_lyric]
            first_summary = lyric_summaries[0]

            # 1. Content Coverage Score
            coverage_score = calculate_content_coverage(original_lyric, first_summary)
            evaluation_results['content_coverage'].append(coverage_score)

            # 2. Consistency Score across multiple generations
            consistency_score = calculate_consistency_score(lyric_summaries)
            evaluation_results['consistency_score'].append(consistency_score)

            # 3. Semantic Similarity
            semantic_score = calculate_semantic_similarity(original_lyric, first_summary)
            evaluation_results['semantic_similarity'].append(semantic_score)

            # 4. BERTScore (summary as candidate, lyric as reference)
            _, _, F1 = bert_scorer.score([first_summary], [original_lyric])
            bert_f1 = F1.mean().item()
            evaluation_results['bert_score'].append(bert_f1)

            # Store examples
            if len(examples) < 5:  # Store first 5 examples
                examples.append({
                    'lyrics': original_lyric,
                    'generated_summaries': lyric_summaries,
                    'metrics': {
                        'content_coverage': coverage_score,
                        'consistency': consistency_score,
                        'semantic_similarity': semantic_score,
                        'bert_score': bert_f1
                    }
                })

        torch.cuda.empty_cache()

    def _mean(values):
        # An empty test set yields NaN explicitly instead of a NumPy warning.
        return float(np.mean(values)) if values else float('nan')

    # Aggregate results
    metrics = {
        'avg_content_coverage': _mean(evaluation_results['content_coverage']),
        'avg_consistency': _mean(evaluation_results['consistency_score']),
        'avg_semantic_similarity': _mean(evaluation_results['semantic_similarity']),
        'avg_bert_score': _mean(evaluation_results['bert_score'])
    }

    return metrics, examples


def calculate_content_coverage(lyrics: str, summary: str) -> float:
    """Return the share of distinct lyric words that also occur in the summary.

    Both texts are lower-cased and split on whitespace.

    Returns:
        ``overlap / number of distinct lyric words``, or 0.0 when either
        argument is a float (e.g. a NaN from a missing cell) or the lyric
        contains no words.
    """
    # Float inputs (including NaN) have no `.lower()`; score them as zero.
    if isinstance(lyrics, float) or isinstance(summary, float):
        return 0.0

    lyrics_tokens = set(str(lyrics).lower().split())
    summary_tokens = set(str(summary).lower().split())

    # Guard the division: an empty lyric would otherwise raise ZeroDivisionError.
    if not lyrics_tokens:
        return 0.0
    return len(lyrics_tokens & summary_tokens) / len(lyrics_tokens)


def calculate_consistency_score(summaries: List[str]) -> float:
    """Return how similar the given summaries are to one another.

    Every unordered pair of summaries is scored with ROUGE-1, ROUGE-2 and
    ROUGE-L (stemming enabled); the three F-measures are averaged per pair and
    the pair averages are averaged again. Fewer than two summaries score 1.0.
    """
    if len(summaries) < 2:
        return 1.0

    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    pair_scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)

    pair_averages = []
    for position, first in enumerate(summaries):
        # Compare with later summaries only, so each pair is counted once.
        for second in summaries[position + 1:]:
            pair_result = pair_scorer.score(first, second)
            f_total = sum(pair_result[name].fmeasure for name in rouge_types)
            pair_averages.append(f_total / 3)

    return np.mean(pair_averages)


def calculate_semantic_similarity(lyrics: str, summary: str) -> float:
    """Return the word-set overlap of lyric and summary (intersection / union).

    Both texts are lower-cased and split on whitespace.

    Returns:
        The ratio of shared to total distinct words, or 0.0 when either
        argument is a float (e.g. a NaN from a missing cell) or neither text
        contains any words.
    """
    # Float inputs (including NaN) have no `.lower()`; score them as zero.
    if isinstance(lyrics, float) or isinstance(summary, float):
        return 0.0

    lyrics_tokens = set(str(lyrics).lower().split())
    summary_tokens = set(str(summary).lower().split())
    intersection = len(lyrics_tokens & summary_tokens)
    union = len(lyrics_tokens | summary_tokens)
    return intersection / union if union > 0 else 0.0


def print_evaluation_results(metrics: Dict[str, float], examples: List[Dict]):
    """Print the four aggregate scores followed by every stored example.

    Args:
        metrics: Must contain ``'avg_content_coverage'``,
            ``'avg_consistency'``, ``'avg_semantic_similarity'`` and
            ``'avg_bert_score'``.
        examples: Dicts with ``'lyrics'``, ``'generated_summaries'`` and
            ``'metrics'`` keys; only the first 200 lyric characters are shown.
    """
    coverage = metrics['avg_content_coverage']
    consistency = metrics['avg_consistency']
    similarity = metrics['avg_semantic_similarity']
    bert_f1 = metrics['avg_bert_score']

    print("\nEvaluation Results:")
    print(f"Average Content Coverage: {coverage:.3f}")
    print(f"Average Consistency: {consistency:.3f}")
    print(f"Average Semantic Similarity: {similarity:.3f}")
    print(f"Average BERTScore: {bert_f1:.3f}")

    print("\nExample Generations:")
    for position, example in enumerate(examples, start=1):
        lyrics_preview = example['lyrics'][:200]
        print(f"\nExample {position}:")
        print(f"Original Lyrics (truncated): {lyrics_preview}...")

        print("\nGenerated Summaries:")
        for number, summary in enumerate(example['generated_summaries'], start=1):
            print(f"{number}. {summary}")

        print("\nMetrics:")
        for metric, value in example['metrics'].items():
            print(f"{metric}: {value:.3f}")


# Usage example: evaluate on test_df in batches of 8, then print the report.
metrics, examples = evaluate_self_supervised_model(model, tokenizer, test_df, batch_size=8)
print_evaluation_results(metrics, examples)



  0%|          | 0/75 [00:00<?, ?it/s][A

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/792 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.04G [00:00<?, ?B/s]


  1%|▏         | 1/75 [10:12<12:35:37, 612.67s/it][A
  3%|▎         | 2/75 [18:38<11:08:54, 549.79s/it][A
  4%|▍         | 3/75 [24:56<9:25:33, 471.30s/it] [A
  5%|▌         | 4/75 [33:27<9:36:25, 487.12s/it][A
  7%|▋         | 5/75 [43:16<10:11:04, 523.77s/it][A
  8%|▊         | 6/75 [53:02<10:26:29, 544.77s/it][A
  9%|▉         | 7/75 [1:01:58<10:14:12, 541.95s/it][A
 11%|█         | 8/75 [1:11:18<10:11:42, 547.80s/it][A
 12%|█▏        | 9/75 [1:17:29<9:01:47, 492.53s/it] [A
 13%|█▎        | 10/75 [1:27:37<9:32:20, 528.31s/it][A
 15%|█▍        | 11/75 [1:39:21<10:20:52, 582.07s/it][A
 16%|█▌        | 12/75 [1:46:23<9:20:04, 533.40s/it] [A
 17%|█▋        | 13/75 [1:55:44<9:19:42, 541.65s/it][A
 19%|█▊        | 14/75 [2:03:07<8:40:19, 511.80s/it][A
 20%|██        | 15/75 [2:12:26<8:46:09, 526.17s/it][A
 21%|██▏       | 16/75 [2:21:56<8:50:10, 539.16s/it][A