In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
import numpy as np
import random

In [None]:
# Colab-only step: mount Google Drive under /content/drive. The corpus CSV read
# later in this notebook lives below that mount point, so this cell has to run
# before the data-loading cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
class ParaphraseTripletDataset(Dataset):
    """Dataset over (original, positive, negative) sentence triplets.

    Indexing returns a dict with the keys 'original', 'positive' and
    'negative'. Each value is whatever the tokenizer returns for that sentence
    when called with PyTorch tensors requested, truncation enabled and padding
    up to `max_length`.

    Args:
        triplets: Indexable collection whose items unpack into exactly three
            sentences, in the order (original, positive, negative).
        tokenizer: Callable applied to each sentence.
        max_length: Length every sentence is truncated / padded to.
    """

    def __init__(self, triplets, tokenizer, max_length=512):
        self.triplets = triplets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.triplets)

    def _encode(self, sentence):
        """Tokenize one sentence with the fixed settings shared by all three roles."""
        # NOTE(review): with return_tensors='pt' on a single string the tensors
        # presumably carry a leading dimension of 1, which would explain why the
        # loops consuming these batches collapse an extra axis - confirm.
        return self.tokenizer(
            sentence,
            return_tensors='pt',
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
        )

    def __getitem__(self, idx):
        # Unpacking enforces that every stored item has exactly three sentences.
        original, positive, negative = self.triplets[idx]
        encoded = {}
        for role, sentence in (
            ('original', original),
            ('positive', positive),
            ('negative', negative),
        ):
            encoded[role] = self._encode(sentence)
        return encoded

In [None]:
# Single checkpoint identifier used for both from_pretrained calls below, so the
# tokenizer and the model are always loaded from the same checkpoint.
model_name = 't5-small'  # You can choose a different model based on your needs
# Notebook-level tokenizer: besides encoding text, the training and evaluation
# functions in this notebook read `tokenizer.pad_token_id` from this global.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Notebook-level seq2seq model that is fine-tuned, evaluated, saved and used
# for generation by the cells that follow.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def train_model_with_influence(model, data_loader, optimizer, epochs=3, device='cuda', pad_token_id=None):
    """Fine-tune a seq2seq model on original -> (positive | negative) paraphrases.

    For every batch one target side ('positive' or 'negative') is drawn at
    random for the whole batch, and the model is trained to produce that
    paraphrase from the original sentence.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)``; the returned object must expose ``.loss``. It is
            moved to `device` and switched to train mode.
        data_loader: Iterable of batches. Each batch maps 'original',
            'positive' and 'negative' to a mapping holding 'input_ids'
            ('original' also needs 'attention_mask').
        optimizer: Optimizer over the model parameters.
        epochs: Number of passes over `data_loader`.
        device: Device the model and the batch tensors are moved to.
        pad_token_id: Token id masked out of the labels. When omitted it is
            taken from ``model.config.pad_token_id`` so the function does not
            depend on a notebook-level ``tokenizer`` variable; if neither is
            available, no masking is applied.

    Returns:
        None. Progress is printed once per optimisation step.
    """
    if pad_token_id is None:
        pad_token_id = getattr(getattr(model, 'config', None), 'pad_token_id', None)

    def _as_batch(tensor):
        # Collapse to (batch, seq_len) explicitly. A bare .squeeze() would also
        # drop the batch axis whenever a batch holds a single example (e.g. the
        # last batch of an odd-sized dataset), handing the model 1-D tensors.
        return tensor.reshape(-1, tensor.size(-1)).to(device)

    model.to(device)
    model.train()
    for epoch in range(epochs):
        for batch in data_loader:
            optimizer.zero_grad()
            # One draw per batch: every example in the batch shares the target side.
            rand_signal = random.choice(['positive', 'negative'])

            inputs = _as_batch(batch['original']['input_ids'])
            attention_mask = _as_batch(batch['original']['attention_mask'])
            targets = _as_batch(batch[rand_signal]['input_ids'])

            # Padding positions are set to -100 so they do not contribute to the loss.
            labels = targets.clone()
            if pad_token_id is not None:
                labels[labels == pad_token_id] = -100

            try:
                outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                print(f"Epoch {epoch}, Loss: {loss.item()}, Chosen: {rand_signal}")
            except ValueError as e:
                # Best-effort diagnostics: report the failing batch and keep training.
                print(f"Error during training: {str(e)}")
                # Inspect shapes if error occurs
                print(f"Input IDs shape: {inputs.shape}, Attention Mask shape: {attention_mask.shape}, Labels shape: {labels.shape}")


In [None]:
import pandas as pd

folder_input_path = '/content/drive/My Drive/Colab Notebooks/Ch3_Corpora/'
csv_file_path = 'sentiment-paraphrase-corpus-full-v1.csv'

# Read the CSV file (requires Google Drive to be mounted already)
df = pd.read_csv(folder_input_path + csv_file_path)

# Keep only complete triplets and force every cell to str. An empty CSV cell is
# read as a float NaN, which the tokenizer cannot encode; without this filter
# the failure only surfaces later, in the middle of DataLoader iteration.
triplets_frame = df[['original', 'positive', 'negative']].dropna().astype(str)

dropped_rows = len(df) - len(triplets_frame)
if dropped_rows:
    print(f"Dropped {dropped_rows} incomplete row(s) out of {len(df)}")

# Convert the DataFrame to a list of (original, positive, negative) tuples
triplets_data = list(triplets_frame.itertuples(index=False, name=None))

In [None]:
# Seed every source of randomness used below (Python's `random` for the
# per-batch target choice, torch for DataLoader shuffling, scikit-learn for the
# split) so a Restart & Run All reproduces the same split and batch order.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Splitting the dataset, preparing DataLoader, and optimizer
train_data, val_data = train_test_split(triplets_data, test_size=0.1, random_state=SEED)
train_dataset = ParaphraseTripletDataset(train_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Train the model with demonstration of influence
train_model_with_influence(model, train_loader, optimizer)

Epoch 0, Loss: 4.878786563873291, Chosen: negative
Epoch 0, Loss: 2.4864819049835205, Chosen: positive
Epoch 0, Loss: 3.241338014602661, Chosen: positive
Epoch 0, Loss: 4.3642354011535645, Chosen: negative
Epoch 0, Loss: 3.743809461593628, Chosen: positive
Epoch 0, Loss: 4.1461076736450195, Chosen: positive
Epoch 0, Loss: 3.297168254852295, Chosen: positive
Epoch 0, Loss: 3.425661325454712, Chosen: positive
Epoch 0, Loss: 3.1871325969696045, Chosen: negative
Epoch 0, Loss: 2.826861619949341, Chosen: negative
Epoch 0, Loss: 4.37708044052124, Chosen: negative
Epoch 0, Loss: 2.4768426418304443, Chosen: positive
Epoch 0, Loss: 3.691263198852539, Chosen: negative
Epoch 0, Loss: 2.8047335147857666, Chosen: positive
Epoch 0, Loss: 3.749321460723877, Chosen: positive
Epoch 0, Loss: 3.5048131942749023, Chosen: negative
Epoch 0, Loss: 3.1547048091888428, Chosen: positive
Epoch 0, Loss: 2.4334840774536133, Chosen: positive
Epoch 0, Loss: 2.5587024688720703, Chosen: positive
Epoch 0, Loss: 3.18889

In [None]:
def evaluate_model(model, data_loader, device='cuda', pad_token_id=None):
    """Compute the mean per-batch loss of original -> positive paraphrasing.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)``; the returned object must expose ``.loss``. It is
            moved to `device` and switched to eval mode.
        data_loader: Iterable of batches. Each batch maps 'original' and
            'positive' to a mapping holding 'input_ids' ('original' also needs
            'attention_mask').
        device: Device the model and the batch tensors are moved to.
        pad_token_id: Token id masked out of the labels. When omitted it is
            taken from ``model.config.pad_token_id`` so the function does not
            depend on a notebook-level ``tokenizer`` variable; if neither is
            available, no masking is applied.

    Returns:
        The average of the per-batch losses as a float (also printed), or
        ``nan`` when `data_loader` yields no batches.
    """
    if pad_token_id is None:
        pad_token_id = getattr(getattr(model, 'config', None), 'pad_token_id', None)

    def _as_batch(tensor):
        # Collapse to (batch, seq_len) explicitly. A bare .squeeze() would also
        # drop the batch axis whenever a batch holds a single example.
        return tensor.reshape(-1, tensor.size(-1)).to(device)

    # Keep the model and the batches on the same device even when this function
    # is called on a model that has not been through the training loop.
    model.to(device)
    model.eval()  # Set the model to evaluation mode
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():  # No need to track gradients during evaluation
        for batch in data_loader:
            inputs = _as_batch(batch['original']['input_ids'])
            attention_mask = _as_batch(batch['original']['attention_mask'])
            # Evaluation always scores the positive paraphrase as the target.
            targets = _as_batch(batch['positive']['input_ids'])
            labels = targets.clone()
            if pad_token_id is not None:
                labels[labels == pad_token_id] = -100
            outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()
            num_batches += 1
    # Count batches while iterating: works for loaders without __len__ and
    # avoids a ZeroDivisionError on an empty loader.
    avg_loss = total_loss / num_batches if num_batches else float('nan')
    print(f"Average loss: {avg_loss}")
    # Implement additional metrics as needed
    return avg_loss

In [None]:
# Held-out triplets from the train/validation split.
val_dataset = ParaphraseTripletDataset(val_data, tokenizer)
# No shuffling for evaluation: the reported number is an average of per-batch
# losses, so a fixed batch composition keeps it repeatable between runs.
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)

evaluate_model(model, val_loader, device='cuda')

Average loss: 2.009604421528903


In [None]:
# Placeholder output directory - replace it with a real location before running.
# NOTE(review): this is a relative path, so the files presumably land in the
# runtime's working directory rather than on the mounted Drive - confirm that
# is intended.
model_save_path = 'path_to_save_your_model'
# Store the model and the tokenizer side by side in the same directory.
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

In [None]:
def generate_paraphrase(input_sentence, model, tokenizer, tone='positive', device='cuda', max_length=512):
    """Generate one paraphrase of `input_sentence` for the requested tone.

    Args:
        input_sentence: Sentence to paraphrase.
        model: Model exposing ``eval()`` and ``generate(...)``; it is switched
            to eval mode but not moved, so it must already live on `device`.
        tokenizer: Tokenizer exposing ``encode``, ``decode`` and ``pad_token_id``.
        tone: Label appended to the prompt as ``[Tone: <tone>]``.
        device: Device the encoded prompt is moved to.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    model.eval()

    # The tone is requested through a plain-text suffix on the prompt.
    # NOTE(review): the training loop in this notebook feeds the untagged
    # original sentence as input, so this suffix is not something fine-tuning
    # explicitly taught the model - confirm the intended conditioning scheme.
    tagged_sentence = f"{input_sentence} [Tone: {tone}]"  # This is highly model-dependent

    encoded_prompt = tokenizer.encode(tagged_sentence, return_tensors="pt", add_special_tokens=True)
    input_ids = encoded_prompt.to(device)
    # Every position that is not the padding token is attended to.
    attention_mask = (input_ids != tokenizer.pad_token_id).to(device)

    generated = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
    )

    # Only the first returned sequence is turned back into text.
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [None]:
# Demo input; "The story was predictable." is an alternative sentence to try.
input_sentence = "The meal was served promptly."

# Same sentence, one generation per tone (positive first, then negative).
positive_paraphrase, negative_paraphrase = (
    generate_paraphrase(input_sentence, model, tokenizer, tone=requested_tone)
    for requested_tone in ('positive', 'negative')
)

# Show the input next to both generated variants, one per line.
print(
    f"Original: {input_sentence}",
    f"Positive Paraphrase: {positive_paraphrase}",
    f"Negative Paraphrase: {negative_paraphrase}",
    sep="\n",
)


Original: The meal was served promptly.
Positive Paraphrase: The meal was excellent and was very good.
Negative Paraphrase: The meal was a bit disappointing and a bit disappointing.
