https://pub.towardsai.net/how-to-do-effective-paraphrasing-using-huggingface-and-diverse-beam-search-t5-pegasus-229ca998d229

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
import numpy as np
import random

In [None]:
# Mount Google Drive at /content/drive so files stored on Drive can be read by path.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime;
# this cell will need replacing when the notebook is run elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
class ParaphraseTripletDataset(Dataset):
    """Map-style dataset over (original, positive, negative) sentence triplets.

    Every item is a dict with the keys 'original', 'positive' and 'negative';
    each value is whatever ``tokenizer`` returns for that sentence when asked
    for PyTorch tensors truncated and padded to ``max_length``.
    """

    def __init__(self, triplets, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.triplets = triplets
        self.max_length = max_length

    def __len__(self):
        return len(self.triplets)

    def _encode(self, sentence):
        """Tokenize one sentence with the fixed-length settings shared by all fields."""
        return self.tokenizer(
            sentence,
            return_tensors='pt',
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
        )

    def __getitem__(self, idx):
        # Unpacking enforces that the stored record has exactly three members.
        source, upbeat, downbeat = self.triplets[idx]
        named_sentences = (
            ('original', source),
            ('positive', upbeat),
            ('negative', downbeat),
        )
        return {field: self._encode(sentence) for field, sentence in named_sentences}

In [None]:
# Checkpoint that is loaded below and fine-tuned later in the notebook.
model_name = 'prithivida/parrot_paraphraser_on_T5'
# Alternative loading path via the generic Auto* classes, kept for reference only
# (the matching import is commented out as well).
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# `tokenizer` and `model` are notebook-level globals: the training and evaluation
# functions read `tokenizer` directly rather than receiving it as an argument.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
def _field_texts(batch, field, position):
    """Return the raw sentences of one triplet field as a list of strings."""
    if hasattr(batch, 'keys'):
        # Dict batch of tokenized fields: recover the text so a tone tag can be prepended.
        ids = batch[field]['input_ids']
        ids = ids.reshape(-1, ids.shape[-1])  # (B, 1, L) or (B, L) -> (B, L)
        return tokenizer.batch_decode(ids, skip_special_tokens=True)
    texts = batch[position]
    return [texts] if isinstance(texts, str) else list(texts)


def _field_label_ids(batch, field, position):
    """Return the target token ids of one triplet field as a fresh (B, L) tensor."""
    if hasattr(batch, 'keys'):
        ids = batch[field]['input_ids']
        # clone() so masking the padding below never edits the loader's own tensors.
        return ids.reshape(-1, ids.shape[-1]).clone()
    texts = _field_texts(batch, field, position)
    return tokenizer(texts, return_tensors="pt", max_length=512, padding=True, truncation=True)['input_ids']


def train_model_with_influence(model, data_loader, optimizer, epochs=3, device='cuda'):
    """Fine-tune ``model`` to produce tone-controlled paraphrases.

    For every batch the original sentences are turned into three prompts,
    ``paraphrase: [NEUTRAL] ...``, ``paraphrase: [POSITIVE] ...`` and
    ``paraphrase: [NEGATIVE] ...``, whose targets are the original, positive
    and negative sentences respectively. The prompt layout is the same one
    ``generate_paraphrase`` builds at inference time.

    Accepted batch layouts:
      * a dict keyed 'original' / 'positive' / 'negative' whose values hold an
        'input_ids' tensor (what a DataLoader over ParaphraseTripletDataset yields);
      * a 3-item sequence ``(originals, positives, negatives)`` of raw strings
        (or of sequences of strings).

    The previous implementation unpacked the dict batch directly, which yields
    the dict *keys*, so the model was trained on the literal words
    "original", "positive" and "negative" instead of the sentences.

    Args:
        model: seq2seq model called with ``input_ids``, ``attention_mask`` and
            ``labels`` and returning an object with a ``loss`` attribute.
        data_loader: iterable of batches in one of the layouts above.
        optimizer: optimizer over ``model``'s parameters; stepped once per batch
            after the gradients of all three tones have been accumulated.
        epochs: number of passes over ``data_loader``.
        device: device the model and tensors are moved to.

    Relies on the notebook-level ``tokenizer``. Prints, per epoch, the summed
    loss of the three tones averaged over the number of batches (NaN when the
    loader produced no batches).
    """
    tone_fields = (('original', 'NEUTRAL'), ('positive', 'POSITIVE'), ('negative', 'NEGATIVE'))
    model.to(device)
    model.train()

    for epoch in range(epochs):
        total_loss = 0.0
        batch_count = 0
        for batch in data_loader:
            optimizer.zero_grad()
            source_sentences = _field_texts(batch, 'original', 0)

            for position, (field, sentiment) in enumerate(tone_fields):
                # The tone tag is part of the input text so it can steer generation later.
                prompts = [f"paraphrase: [{sentiment}] {sentence}" for sentence in source_sentences]
                # Pad only to the longest prompt in the batch; the attention mask hides the padding.
                inputs = tokenizer(prompts, return_tensors="pt", max_length=512, padding=True, truncation=True).to(device)
                labels = _field_label_ids(batch, field, position).to(device)
                labels[labels == tokenizer.pad_token_id] = -100  # Ignoring padding for loss calculation

                outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], labels=labels)
                loss = outputs.loss
                loss.backward()  # gradients of the three tones accumulate before the step
                total_loss += loss.item()

            optimizer.step()
            batch_count += 1

        avg_loss = total_loss / batch_count if batch_count else float('nan')
        print(f"Epoch {epoch}, Average Loss: {avg_loss}")


In [None]:
import pandas as pd

# Where the sentiment paraphrase corpus lives on the mounted Drive.
folder_input_path = '/content/drive/My Drive/Colab Notebooks/Ch3_Corpora/'
csv_file_path = 'sentiment-paraphrase-corpus-full-v1.csv'

# Load the whole corpus into a DataFrame.
df = pd.read_csv(f"{folder_input_path}{csv_file_path}")

# Keep only the three text columns and turn each row into a plain
# (original, positive, negative) tuple.
triplets_data = [
    *df[['original', 'positive', 'negative']].itertuples(index=False, name=None)
]

In [None]:
# Seed every random number generator involved so that the train/validation split,
# the DataLoader shuffling and the training run are repeatable after a kernel restart.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Splitting the dataset, preparing DataLoader, and optimizer
train_data, val_data = train_test_split(triplets_data, test_size=0.1, random_state=SEED)
train_dataset = ParaphraseTripletDataset(train_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Train the model with demonstration of influence
train_model_with_influence(model, train_loader, optimizer)

Epoch 0, Average Loss: 2.299918431373264
Epoch 1, Average Loss: 0.01708533897303584
Epoch 2, Average Loss: 0.010635974244857942


In [None]:
def evaluate_model(model, data_loader, device='cuda'):
    """Compute the mean per-batch loss of ``model`` on original -> positive pairs.

    Args:
        model: seq2seq model called with ``input_ids``, ``attention_mask`` and
            ``labels`` and returning an object with a ``loss`` attribute.
        data_loader: iterable of dict batches with 'original' and 'positive'
            entries, each holding 'input_ids' (and, for 'original',
            'attention_mask') tensors whose last axis is the token axis.
        device: device the model and tensors are moved to.

    Returns:
        The loss averaged over batches (NaN when the loader produced no
        batches). The same value is printed.

    Relies on the notebook-level ``tokenizer`` for the padding token id.
    """
    def as_batch_matrix(tensor):
        # reshape keeps the batch axis even for a single-example batch, where a bare
        # squeeze() would collapse (1, 1, L) to (L,) and break the forward pass.
        return tensor.reshape(-1, tensor.shape[-1])

    model.to(device)  # do not rely on an earlier training call having moved the model
    model.eval()  # Set the model to evaluation mode
    total_loss = 0.0
    batch_count = 0
    with torch.no_grad():  # No need to track gradients during evaluation
        for batch in data_loader:
            inputs = as_batch_matrix(batch['original']['input_ids']).to(device)
            attention_mask = as_batch_matrix(batch['original']['attention_mask']).to(device)
            # Choose the type of paraphrase for evaluation; let's say positive for simplicity
            targets = as_batch_matrix(batch['positive']['input_ids']).to(device)
            labels = targets.clone()
            labels[labels == tokenizer.pad_token_id] = -100  # padding is excluded from the loss
            outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()
            batch_count += 1
    avg_loss = total_loss / batch_count if batch_count else float('nan')
    print(f"Average loss: {avg_loss}")
    # Implement additional metrics as needed
    return avg_loss
    # Implement additional metrics as needed

In [None]:
# The validation set is only scored, never trained on. evaluate_model averages the
# per-batch losses, so the reported number depends on which examples share a batch;
# keeping the order fixed (shuffle=False) makes the metric repeatable between runs.
val_dataset = ParaphraseTripletDataset(val_data, tokenizer)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)

evaluate_model(model, val_loader, device='cuda')

Average loss: 2.4461282166567715


In [None]:
# Write the fine-tuned model and the tokenizer into the same directory.
# NOTE(review): 'path_to_save_your_model' looks like a placeholder relative path —
# presumably it should point at a folder on the mounted Drive so the checkpoint
# outlives the Colab session; confirm before relying on the saved files.
model_save_path = 'path_to_save_your_model'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

In [None]:
def generate_paraphrase(input_sentence, model, tokenizer, tone='POSITIVE', device='cuda', num_beams=6, num_return_sequences=5, temperature=1.0, no_repeat_ngram_size=2, length_penalty=2.0, diversity_penalty=0.5, max_length=60, num_beam_groups=2):
    """Generate paraphrases of ``input_sentence`` steered towards ``tone``.

    The prompt sent to the model is ``paraphrase: [<tone>] <input_sentence>``.

    Two decoding modes are used:
      * ``temperature > 1``: beam search with sampling at that temperature.
      * otherwise: deterministic beam search. When ``num_beams`` can be split
        into more than one equal group and ``diversity_penalty`` is positive,
        diverse (group) beam search is used; if not, plain beam search is used
        instead of passing an invalid group configuration to ``generate``.

    Args:
        input_sentence: sentence to paraphrase.
        model: model exposing ``eval()``, ``to(device)`` and ``generate(...)``.
        tokenizer: tokenizer exposing ``encode(...)`` and ``decode(...)``.
        tone: tag placed in square brackets in the prompt.
        device: device the model and the input ids are moved to.
        num_beams: beam width.
        num_return_sequences: number of sequences requested from ``generate``.
        temperature: values above 1 switch to sampling; other values are not
            forwarded (beam search is run with ``temperature=1.0``).
        no_repeat_ngram_size: forwarded to ``generate``.
        length_penalty: forwarded to ``generate``.
        diversity_penalty: penalty used by diverse beam search only.
        max_length: maximum length forwarded to ``generate``.
        num_beam_groups: upper bound on the number of beam groups; the largest
            value not above it that divides ``num_beams`` is used.

    Returns:
        A list with one decoded string per sequence returned by ``generate``.
    """
    model.eval()
    model.to(device)
    prompt = f"paraphrase: [{tone}] {input_sentence}"
    input_ids = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=True).to(device)

    # Settings shared by both decoding modes.
    generation_kwargs = dict(
        max_length=max_length,
        num_beams=num_beams,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=no_repeat_ngram_size,
        length_penalty=length_penalty,
        early_stopping=True,
    )

    if temperature > 1:
        # For more creative, varied outputs: sampling is incompatible with
        # diversity_penalty / num_beam_groups, so neither is passed here.
        generation_kwargs.update(temperature=temperature, do_sample=True)
    else:
        # More deterministic outputs; temperature has no effect without sampling.
        generation_kwargs.update(temperature=1.0, do_sample=False)
        # Group beam search needs num_beams to be divisible by the group count,
        # so pick the largest admissible count instead of assuming 2 always fits.
        upper = min(num_beam_groups, num_beams)
        usable_groups = max(
            (count for count in range(1, upper + 1) if num_beams % count == 0),
            default=1,
        )
        if usable_groups > 1 and diversity_penalty > 0:
            generation_kwargs.update(
                num_beam_groups=usable_groups,
                diversity_penalty=diversity_penalty,
            )

    paraphrases = model.generate(input_ids, **generation_kwargs)

    return [tokenizer.decode(g, skip_special_tokens=True) for g in paraphrases]


In [None]:
input_sentence = "The story was predictable."
positive_paraphrase = generate_paraphrase(input_sentence, model, tokenizer, tone='POSITIVE', temperature=1.5)
negative_paraphrase = generate_paraphrase(input_sentence, model, tokenizer, tone='NEGATIVE', temperature=1.5)

print("Original:", input_sentence)
print("Positive Paraphrase:", positive_paraphrase)
print("Negative Paraphrase:", negative_paraphrase)


Original: The story was predictable.
Positive Paraphrase: ['Positive', 'Positive', 'Positive', 'Positive', 'Positive']
Negative Paraphrase: ['The story was predictable.', 'The story was predictable.', 'The story was predictable.', 'The story was predictable.', 'The story was predictable.']
