# T5 Fine Tuning

### This code implements a text summarization system that fine-tunes the T5 transformer model to summarize lyrics and poetry. It uses PyTorch Lightning for training infrastructure and includes several key components: a custom dataset class for handling lyrics/poetry data, a data module for managing data loading, and a model class that fine-tunes T5 for the summarization task. The system can process input text, generate summaries, and evaluate them using multiple metrics, including ROUGE scores, BERTScore, content coverage, and a token-overlap score reported as "semantic similarity". The code also includes functionality for loading different model configurations, training them with various hyperparameters (such as learning rate and batch size), and saving the results. The evaluation process tests the model's ability to generate meaningful summaries of lyrics and poems, comparing them against human-written annotations to assess performance.

# Imports


In [None]:
# Install the dependencies needed for testing the PoemSum model
!pip install pytorch-lightning transformers torch

In [None]:
#Import needed dependencies while avoiding conflicts
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer,
    AdamW
)
import re
import os
from sklearn.model_selection import train_test_split

In [None]:
import textwrap
def print_summary(text, width=70):
    """Print ``text`` re-wrapped so that no output line is wider than ``width``."""
    wrapped_lines = textwrap.wrap(text, width=width)
    print("\n".join(wrapped_lines))


In [None]:
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime.
from google.colab import drive
# Mount Google Drive at /content/drive; the data and model paths used later in
# this notebook are all located under that mount point.
drive.mount('/content/drive')

# Model Definition

In [None]:
# Custom Dataset class from PoemSum model
class LyricsSummaryDataset(Dataset):
    """Dataset that turns DataFrame rows into tokenized T5 training examples.

    Each row must provide a ``"text"`` column (model input) and a ``"summary"``
    column (target). Both are tokenized on access and padded/truncated to a
    fixed length, so every item has tensors of the same size.

    Args:
        data: Frame with ``"text"`` and ``"summary"`` columns.
        tokenizer: Callable tokenizer whose result exposes ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        text_max_token_len: Length the input text is padded/truncated to.
        summary_max_token_len: Length the target summary is padded/truncated to.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: "T5Tokenizer",
        text_max_token_len: int = 1024,
        summary_max_token_len: int = 256
    ):
        self.tokenizer = tokenizer
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def _encode(self, text, max_length: int):
        """Tokenize ``text`` to exactly ``max_length`` tokens as PyTorch tensors."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt"
        )

    def __getitem__(self, index: int):
        """Return the raw strings and the flattened tensors for row ``index``."""
        data_row = self.data.iloc[index]

        text_encoding = self._encode(data_row["text"], self.text_max_token_len)
        summary_encoding = self._encode(data_row["summary"], self.summary_max_token_len)

        # Ask the tokenizer for its padding id instead of assuming 0; fall back
        # to 0 (the previously hard-coded value) when it does not define one.
        pad_token_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_token_id is None:
            pad_token_id = 0

        # Work on a copy so the encoding's own input_ids are left untouched.
        # Padding positions become -100, the label value the loss skips.
        labels = summary_encoding["input_ids"].clone()
        labels[labels == pad_token_id] = -100

        return dict(
            text=data_row["text"],
            summary=data_row["summary"],
            text_input_ids=text_encoding["input_ids"].flatten(),
            text_attention_mask=text_encoding["attention_mask"].flatten(),
            labels=labels.flatten(),
            labels_attention_mask=summary_encoding["attention_mask"].flatten()
        )

# Lightning Data Module from Poem Sum
class LyricsSummaryDataModule(pl.LightningDataModule):
    """Lightning data module that serves the training and validation frames.

    The frames are wrapped in ``LyricsSummaryDataset`` objects during
    ``setup`` and exposed through two ``DataLoader`` factories.
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 8,
        text_max_token_len: int = 512,
        summary_max_token_len: int = 256
    ):
        super().__init__()
        self.train_df, self.val_df = train_df, val_df
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def _build_dataset(self, frame):
        """Wrap ``frame`` in a dataset using this module's tokenizer and lengths."""
        return LyricsSummaryDataset(
            frame,
            self.tokenizer,
            self.text_max_token_len,
            self.summary_max_token_len
        )

    def _build_loader(self, dataset, shuffle):
        """Create a two-worker loader over ``dataset`` with the configured batch size."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=2
        )

    def setup(self, stage=None):
        """Create the training and validation datasets; ``stage`` is not used."""
        self.train_dataset = self._build_dataset(self.train_df)
        self.val_dataset = self._build_dataset(self.val_df)

    def train_dataloader(self):
        """Return the shuffled loader over the training dataset."""
        return self._build_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Return the unshuffled loader over the validation dataset."""
        return self._build_loader(self.val_dataset, shuffle=False)

# Model Class
class LyricsSummaryModel(pl.LightningModule):
    """Lightning wrapper that fine-tunes a pretrained T5 model for summarization.

    Args:
        model_name: Name or path passed to
            ``T5ForConditionalGeneration.from_pretrained``.
        learning_rate: Learning rate handed to the AdamW optimizer. Defaults to
            the value that used to be hard-coded (1e-5).
    """

    def __init__(self, model_name='t5-small', learning_rate=0.00001):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(model_name, return_dict=True)
        self.learning_rate = learning_rate

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``."""
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask
        )
        return output.loss, output.logits

    def _batch_loss(self, batch):
        """Feed one collated batch through ``forward`` and return only the loss."""
        loss, _ = self(
            batch["text_input_ids"],
            batch["text_attention_mask"],
            batch["labels_attention_mask"],
            batch["labels"]
        )
        return loss

    def training_step(self, batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the loss for a training batch."""
        loss = self._batch_loss(batch)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute, log (as ``val_loss``) and return the loss for a validation batch."""
        loss = self._batch_loss(batch)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        """Return the AdamW optimizer over all parameters of this module."""
        # Use PyTorch's AdamW: the optimizer class shipped with `transformers`
        # is deprecated in favour of it. `eps` and `weight_decay` are pinned to
        # the defaults of that deprecated class so the update rule stays the same
        # (torch's own defaults are eps=1e-8, weight_decay=0.01).
        return torch.optim.AdamW(
            self.parameters(),
            lr=self.learning_rate,
            eps=1e-6,
            weight_decay=0.0,
        )

# Data Processing

In [None]:
# Load lyrics data
df_list = []
folder_path = "/content/drive/My Drive/266 Final Project/Cleaned Song Files"
# os.listdir() returns names in arbitrary order. Sorting fixes the row order of
# the concatenated frame, which the seeded train/test split later in this
# notebook depends on for reproducible splits.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.csv'):
        continue
    file_path = os.path.join(folder_path, filename)
    df = pd.read_csv(file_path)
    df_list.append(df)

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list,
# so fail with a message that names the folder instead.
if not df_list:
    raise ValueError(f"No .csv files found in {folder_path!r}")

# Concatenate lyrics data
lyrics_df = pd.concat(df_list, ignore_index=True)

In [None]:
# Load poem data keeping original splits
# The PoemSum train and validation CSVs are read in that order.
poem_train, poem_valid = map(
    pd.read_csv,
    (
        "/content/drive/My Drive/266 Final Project/PoemSum Model/poemsum_train.csv",
        "/content/drive/My Drive/266 Final Project/PoemSum Model/poemsum_valid.csv",
    ),
)

# Basic data cleaning
def clean_text(text):
    """Return ``text`` as a string with whitespace runs collapsed to one space.

    Values that ``pd.isna`` reports as missing become the empty string.
    Leading and trailing whitespace is removed from the result.
    """
    if pd.isna(text):
        return ""
    collapsed = re.sub(r'\s+', ' ', str(text))
    return collapsed.strip()

In [None]:
# Split lyrics data
# Step 1: hold out 20% of all lyric rows as the test set.
train_lyrics, test_lyrics = train_test_split(
    lyrics_df,
    test_size=0.2,
    random_state=42,
)
# Step 2: take 10% of the remaining rows as the validation set.
train_lyrics, val_lyrics = train_test_split(
    train_lyrics,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Clean and filter training data
# Lyrics rows need both the lyrics and the annotation to be present.
train_lyrics_filtered = train_lyrics.dropna(subset=['Lyrics', 'Combined Annotations'])
# Poem rows need both columns present and a 'text' value that is not blank.
poem_train_filtered = poem_train.dropna(subset=['ctext', 'text']).loc[
    lambda frame: frame['text'].str.strip() != ''
]

# Clean and filter validation data
val_lyrics_filtered = val_lyrics.dropna(subset=['Lyrics', 'Combined Annotations'])
poem_valid_filtered = poem_valid.dropna(subset=['ctext', 'text']).loc[
    lambda frame: frame['text'].str.strip() != ''
]

In [None]:
# Format training data
# Inputs get a task prefix; lyric rows come first, poem rows second, and the
# 'summary' column is built in the same order so rows stay aligned.
train_data = pd.DataFrame({
    'text': (
        [f"summarize lyrics and capture meaning: {clean_text(lyric)}" for lyric in train_lyrics_filtered['Lyrics']]
        + [f"summarize poem and capture meaning: {clean_text(poem)}" for poem in poem_train_filtered['ctext']]
    ),
    'summary': (
        list(map(clean_text, train_lyrics_filtered['Combined Annotations']))
        + list(map(clean_text, poem_train_filtered['text']))
    ),
})

In [None]:
# Format validation data
# Same layout as the training frame: prefixed inputs, lyric rows before poem rows.
val_data = pd.DataFrame({
    'text': (
        [f"summarize lyrics and capture meaning: {clean_text(lyric)}" for lyric in val_lyrics_filtered['Lyrics']]
        + [f"summarize poem and capture meaning: {clean_text(poem)}" for poem in poem_valid_filtered['ctext']]
    ),
    'summary': (
        list(map(clean_text, val_lyrics_filtered['Combined Annotations']))
        + list(map(clean_text, poem_valid_filtered['text']))
    ),
})

# Ensure data types are strings
for _frame in (train_data, val_data):
    for _column in ('text', 'summary'):
        _frame[_column] = _frame[_column].astype(str)

# Model Training Functions

In [None]:
def create_baseline_model(
    train_data,
    val_data,
    save_dir="checkpoints",
    *,
    model_name='t5-small',
    batch_size=4,
    max_epochs=3,
    early_stopping_patience=3,
    drive_path='/content/drive/My Drive/266 Final Project/Our Models/Combined Data',
):
    """Create and train model with combined data sources.

    All keyword-only parameters default to the values that were previously
    hard-coded, so existing calls behave as before.

    Args:
        train_data: Frame with ``text`` and ``summary`` columns used for training.
        val_data: Frame with the same columns used for validation.
        save_dir: Root directory given to the Lightning trainer.
        model_name: Pretrained checkpoint used for both tokenizer and model.
        batch_size: Batch size of the data loaders.
        max_epochs: Upper bound on training epochs.
        early_stopping_patience: Patience of the ``val_loss`` early-stopping
            callback.
        drive_path: Directory the fine-tuned model and tokenizer are saved to.

    Returns:
        Tuple ``(model, tokenizer, trainer)``. The model is returned even when
        saving to ``drive_path`` fails.
    """
    print("Initializing model and tokenizer...")
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = LyricsSummaryModel(model_name)

    print("Setting up data module...")
    data_module = LyricsSummaryDataModule(
        train_df=train_data,
        val_df=val_data,
        tokenizer=tokenizer,
        batch_size=batch_size,
        text_max_token_len=1024,
        summary_max_token_len=256
    )

    print("Configuring trainer...")
    # NOTE(review): with the defaults (patience 3, 3 epochs) early stopping can
    # never trigger; raise max_epochs or lower the patience to make it effective.
    early_stopping = pl.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=early_stopping_patience,
        mode='min'
    )
    # NOTE(review): T5 checkpoints are reported to be prone to overflow under
    # fp16; confirm train_loss stays finite on GPU or consider another precision.
    trainer = pl.Trainer(
        max_epochs=max_epochs,
        accumulate_grad_batches=4,
        gradient_clip_val=1.0,
        precision=16 if torch.cuda.is_available() else 32,
        enable_checkpointing=True,
        default_root_dir=save_dir,
        callbacks=[early_stopping]
    )

    print("Starting training...")
    trainer.fit(model, data_module)

    print("Saving model and tokenizer...")
    os.makedirs(drive_path, exist_ok=True)
    # Saving is deliberately best-effort: a failure is reported but must not
    # discard the model that was just trained.
    try:
        model.model.save_pretrained(drive_path)
        tokenizer.save_pretrained(drive_path)
        print(f"Model and tokenizer successfully saved to {drive_path}")
    except Exception as e:
        print(f"Failed to save model and tokenizer: {e}")

    return model, tokenizer, trainer

# Loading Model

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5TokenizerFast
from typing import Dict

def load_model_and_tokenizer(base_path: str, config_name: str) -> tuple:
    """Load the fine-tuned model and tokenizer saved for one configuration.

    The files are read from ``<base_path>/<config_name>/fine_tuned_model``. The
    model is moved to CUDA when available (CPU otherwise) and switched to eval
    mode.

    Returns:
        ``(model, tokenizer)`` on success, or ``(None, None)`` if anything goes
        wrong while loading; the error is printed rather than raised.
    """
    full_path = f"{base_path}/{config_name}/fine_tuned_model"
    try:
        print(f"\nLoading {config_name}...")
        tokenizer = T5TokenizerFast.from_pretrained(full_path)
        loaded_model = T5ForConditionalGeneration.from_pretrained(full_path)
        target_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        loaded_model = loaded_model.to(target_device)
        loaded_model.eval()
        print(f"Successfully loaded {config_name}")
    except Exception as err:
        print(f"Error loading {config_name}: {err}")
        return None, None
    return loaded_model, tokenizer

# Base path
base_path = "/content/drive/MyDrive/266 Final Project/Fine Tuned Model"

# Configuration names
configs = [
    "lr_1e-05_bs_2_epochs_10",
    "lr_3e-05_bs_4_epochs_5",
    "lr_5e-05_bs_2_epochs_5",
    "lr_5e-05_bs_4_epochs_10"
]

# Dictionary to store models and tokenizers
models = {}

# Load all models
for config in configs:
    model, tokenizer = load_model_and_tokenizer(base_path, config)
    if model is None:
        # Loading failed, so this configuration is left out of `models`.
        continue
    # Names look like "lr_<rate>_bs_<batch>_epochs_<n>": after splitting on "_"
    # the values sit at positions 1, 3 and 5.
    name_fields = config.split('_')
    models[config] = dict(
        model=model,
        tokenizer=tokenizer,
        config=dict(
            learning_rate=float(name_fields[1]),
            batch_size=int(name_fields[3]),
            epochs=int(name_fields[5]),
        ),
    )

print("\nLoaded models:")
for config, model_dict in models.items():
    hyperparams = model_dict['config']
    print(f"\n{config}:")
    print(f"Learning rate: {hyperparams['learning_rate']}")
    print(f"Batch size: {hyperparams['batch_size']}")
    print(f"Epochs: {hyperparams['epochs']}")

# Evaluation

In [None]:
# Install required package
!pip install bert-score
from bert_score import score
import torch
from sklearn.model_selection import train_test_split
!pip install rouge-score # rouge-score is the correct package name, not rouge_score.

import torch
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple
from transformers import T5Tokenizer
from rouge_score import rouge_scorer
from bert_score import score
from tqdm import tqdm

In [None]:
def evaluate_combined_model(
    model: T5ForConditionalGeneration,
    tokenizer: T5Tokenizer,
    test_data: pd.DataFrame,
    batch_size: int = 16
) -> Tuple[Dict[str, float], List[Dict]]:
    """Generate summaries for ``test_data`` and score them against the annotations.

    Rows with a missing ``Lyrics`` or ``Combined Annotations`` value are skipped,
    mirroring the filtering applied to the training and validation data; a
    single missing annotation would otherwise abort the whole evaluation.

    Args:
        model: Fine-tuned T5 model; it is put in eval mode and moved to CUDA
            when available.
        tokenizer: Tokenizer matching ``model``.
        test_data: Frame with ``Lyrics`` and ``Combined Annotations`` columns.
        batch_size: Number of lyrics generated per forward pass (must be >= 1).

    Returns:
        ``(final_metrics, examples)``: the mean of every per-example metric
        under an ``avg_`` key (NaN when no row could be evaluated), and up to
        five example dicts with the lyrics, generated summary and metrics.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Drop incomplete rows up front; NaN cells are floats and break text scoring.
    complete_rows = test_data.dropna(subset=['Lyrics', 'Combined Annotations'])
    all_lyrics = complete_rows['Lyrics'].astype(str).tolist()
    all_annotations = complete_rows['Combined Annotations'].astype(str).tolist()

    evaluation_results = {
        'content_coverage': [],
        'semantic_similarity': [],
        'rouge1': [],
        'rouge2': [],
        'rougeL': [],
        'bert_score': []
    }

    examples = []

    batch_starts = range(0, len(all_lyrics), batch_size)
    for batch_number, idx in enumerate(tqdm(batch_starts)):
        batch_lyrics = all_lyrics[idx:idx + batch_size]
        batch_annotations = all_annotations[idx:idx + batch_size]

        inputs = tokenizer(
            [f"summarize lyrics and capture meaning: {lyric}" for lyric in batch_lyrics],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(device)

        # NOTE(review): do_sample=True makes the generated summaries (and hence
        # the metrics) vary between runs unless a seed is fixed by the caller.
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=150,
                min_length=50,
                num_beams=4,
                do_sample=True,
                temperature=0.7,
                top_k=50,
                no_repeat_ngram_size=3,
                length_penalty=1.0,
                repetition_penalty=1.2
            )

        generated_summaries = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        for original_lyric, generated_summary, reference_annotation in zip(
            batch_lyrics, generated_summaries, batch_annotations
        ):
            metrics = calculate_metrics(
                original_lyric,
                generated_summary,
                reference_annotation
            )

            for key, value in metrics.items():
                evaluation_results[key].append(value)

            # Keep only the first few generations as qualitative examples.
            if len(examples) < 5:
                examples.append({
                    'lyrics': original_lyric,
                    'generated_summary': generated_summary,
                    'metrics': metrics
                })

        # Release cached GPU memory every fifth batch. Counting batches (rather
        # than testing the row offset) keeps the cadence independent of
        # batch_size.
        if device.type == 'cuda' and batch_number % 5 == 0:
            torch.cuda.empty_cache()

    # Average each metric; report NaN explicitly when nothing was evaluated so
    # np.mean is never called on an empty list.
    final_metrics = {
        f"avg_{name}": (np.mean(values) if values else float('nan'))
        for name, values in evaluation_results.items()
    }

    return final_metrics, examples

# Both scorers are costly to construct (the BERTScore scorer loads a pretrained
# model), so each is built once on first use and then reused for every example.
# NOTE(review): the cached BERTScore model stays in memory for the session.
_ROUGE_SCORER = None
_BERT_SCORER = None


def _get_rouge_scorer():
    """Return the shared ROUGE-1/2/L scorer, creating it on first use."""
    global _ROUGE_SCORER
    if _ROUGE_SCORER is None:
        _ROUGE_SCORER = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    return _ROUGE_SCORER


def _get_bert_scorer():
    """Return the shared English BERTScore scorer, creating it on first use."""
    global _BERT_SCORER
    if _BERT_SCORER is None:
        # Local import: same package this notebook already imports `score` from.
        from bert_score import BERTScorer
        _BERT_SCORER = BERTScorer(lang='en')
    return _BERT_SCORER


def calculate_metrics(lyrics: str, summary: str, annotation: str) -> Dict[str, float]:
    """Score one generated summary against its lyrics and reference annotation.

    Args:
        lyrics: Source lyrics the summary was generated from.
        summary: Generated summary.
        annotation: Reference annotation.

    Returns:
        Dict with ``content_coverage`` and ``semantic_similarity`` (summary vs.
        lyrics) and the F-measures ``rouge1``, ``rouge2``, ``rougeL`` and
        ``bert_score`` (summary vs. annotation). The four reference-based
        scores are 0.0 when ``summary`` or ``annotation`` is not a string
        (for example a NaN cell).
    """
    rouge_f1 = {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}
    bert_f1 = 0.0

    if isinstance(summary, str) and isinstance(annotation, str):
        try:
            # RougeScorer.score expects (target, prediction): the annotation is
            # the reference and the generated summary is the prediction.
            rouge_scores = _get_rouge_scorer().score(annotation, summary)
        except KeyError as e:
            print(f"Error calculating ROUGE scores: {e}")
        else:
            # Read .fmeasure only from real Score objects; on failure the plain
            # 0.0 defaults above are kept.
            rouge_f1 = {name: rouge_scores[name].fmeasure for name in rouge_f1}

        # BERTScore
        P, R, F1 = _get_bert_scorer().score([summary], [annotation], verbose=False)
        bert_f1 = F1.mean().item()

    return {
        'content_coverage': calculate_content_coverage(lyrics, summary),
        'semantic_similarity': calculate_semantic_similarity(lyrics, summary),
        'rouge1': rouge_f1['rouge1'],
        'rouge2': rouge_f1['rouge2'],
        'rougeL': rouge_f1['rougeL'],
        'bert_score': bert_f1
    }

def calculate_content_coverage(lyrics: str, summary: str) -> float:
    """Return the share of distinct lyric words that also appear in the summary.

    Words are lower-cased, whitespace-separated tokens. The result is 0.0 when
    either argument is a float (e.g. NaN), when the lyrics contain no words, or
    when processing fails.
    """
    # Float inputs (NaN cells) carry no text to compare.
    if any(isinstance(value, float) for value in (lyrics, summary)):
        return 0.0

    try:
        lyric_words = set(str(lyrics).lower().split())
        summary_words = set(str(summary).lower().split())
        if not lyric_words:
            return 0.0
        shared_words = lyric_words & summary_words
        return len(shared_words) / len(lyric_words)
    except Exception as e:
        print(f"Error processing lyrics/summary: {e}")
        return 0.0

def calculate_semantic_similarity(lyrics: str, summary: str) -> float:
    """Return the token-overlap similarity between lyrics and summary.

    The score is the number of shared distinct words divided by the number of
    distinct words found in either text (lower-cased, whitespace-separated).
    The result is 0.0 when either argument is a float (e.g. NaN), when both
    texts are empty, or when processing fails.
    """
    # Float inputs (NaN cells) carry no text to compare.
    if any(isinstance(value, float) for value in (lyrics, summary)):
        return 0.0

    try:
        lyric_words = set(str(lyrics).lower().split())
        summary_words = set(str(summary).lower().split())
        shared_words = lyric_words & summary_words
        all_words = lyric_words | summary_words
        if not all_words:
            return 0.0
        return len(shared_words) / len(all_words)
    except Exception as e:
        print(f"Error processing lyrics/summary: {e}")
        return 0.0

def _print_metric_lines(scores):
    """Print one ``name: value`` line per entry, with three decimal places."""
    for metric, value in scores.items():
        print(f"{metric}: {value:.3f}")


def print_evaluation_results(metrics: Dict[str, float], examples: List[Dict]):
    """Print the aggregate metrics followed by every example generation.

    Each example shows the first 200 characters of its lyrics, the generated
    summary and the per-example metrics.
    """
    print("\nEvaluation Results:")
    _print_metric_lines(metrics)

    print("\nExample Generations:")
    for number, example in enumerate(examples, start=1):
        print(f"\nExample {number}:")
        lyrics_preview = example['lyrics'][:200]
        print(f"Original Lyrics (truncated): {lyrics_preview}...")
        print(f"\nGenerated Summary: {example['generated_summary']}")
        print("\nMetrics:")
        _print_metric_lines(example['metrics'])


In [None]:
# Import necessary libraries
import pandas as pd

# Evaluate all models and store the results
evaluation_results = []

for config_name, model_data in models.items():
    print(f"\nEvaluating model: {config_name}")
    model = model_data['model']
    tokenizer = model_data['tokenizer']

    # Exactly one row is recorded per configuration. It always carries the
    # hyperparameters and gains either the metrics, an "Error" entry, or both
    # when a failure happens after the metrics were computed (e.g. while
    # printing). Appending once avoids a duplicate row for that case.
    result_row = {
        "Config": config_name,
        "Learning Rate": model_data['config']['learning_rate'],
        "Batch Size": model_data['config']['batch_size'],
        "Epochs": model_data['config']['epochs'],
    }

    try:
        # Run the evaluation
        metrics, examples = evaluate_combined_model(model, tokenizer, test_data=test_lyrics)

        # Add the evaluation metrics to this configuration's row
        result_row.update(metrics)

        # Print example generations for this model
        print_evaluation_results(metrics, examples)
    except Exception as e:
        # Best-effort: a failing configuration must not stop the remaining ones.
        print(f"Error evaluating model {config_name}: {e}")
        result_row["Error"] = str(e)

    evaluation_results.append(result_row)

# Convert evaluation results to a DataFrame
results_df = pd.DataFrame(evaluation_results)

# Save results to CSV
csv_path = f"{base_path}/evaluation_results.csv"
results_df.to_csv(csv_path, index=False)
print(f"Evaluation results saved to: {csv_path}")
