In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
import numpy as np
import random

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
class ParaphraseTripletDataset(Dataset):
    def __init__(self, triplets, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.triplets = triplets
        self.max_length = max_length

    def __len__(self):
        return len(self.triplets)

    def __getitem__(self, idx):
        original, positive, negative = self.triplets[idx]
        return {
            'original': self.tokenizer(original, return_tensors='pt', max_length=self.max_length, truncation=True, padding='max_length'),
            'positive': self.tokenizer(positive, return_tensors='pt', max_length=self.max_length, truncation=True, padding='max_length'),
            'negative': self.tokenizer(negative, return_tensors='pt', max_length=self.max_length, truncation=True, padding='max_length'),
        }

In [4]:
model_name = 't5-small'  # You can choose a different model based on your needs
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [5]:
def train_model_with_influence(model, data_loader, optimizer, epochs=3, device='cuda'):
    model.to(device)
    model.train()

    for epoch in range(epochs):
        total_loss = 0
        for batch in data_loader:
            optimizer.zero_grad()

            # Assuming 'batch' is a tuple of (original, positive, negative)
            original, positive, negative = batch
            sentiments = ['NEUTRAL', 'POSITIVE', 'NEGATIVE']
            paraphrases = [original, positive, negative]

            for sentiment, paraphrase in zip(sentiments, paraphrases):
                # Embedding sentiment indication in the input
                input_text = f"[{sentiment}] {original}"
                inputs = tokenizer.encode_plus(input_text, return_tensors="pt", max_length=512, padding='max_length', truncation=True).to(device)
                labels = tokenizer.encode_plus(paraphrase, return_tensors="pt", max_length=512, padding='max_length', truncation=True).input_ids.to(device)
                labels[labels == tokenizer.pad_token_id] = -100  # Ignoring padding for loss calculation


                outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], labels=labels)
                loss = outputs.loss
                loss.backward()
                total_loss += loss.item()

            optimizer.step()

        avg_loss = total_loss / len(data_loader)
        print(f"Epoch {epoch}, Average Loss: {avg_loss}")


In [6]:
import pandas as pd

folder_input_path = '/content/drive/My Drive/Colab Notebooks/Ch3_Corpora/'
csv_file_path = 'sentiment-paraphrase-corpus-full-v1.csv'

# Read the CSV file
df = pd.read_csv(folder_input_path + csv_file_path)

# Convert the DataFrame to a list of triplets
triplets_data = list(df[['original', 'positive', 'negative']].itertuples(index=False, name=None))

In [7]:
# Splitting the dataset, preparing DataLoader, and optimizer
train_data, val_data = train_test_split(triplets_data, test_size=0.1)
train_dataset = ParaphraseTripletDataset(train_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Train the model with demonstration of influence
train_model_with_influence(model, train_loader, optimizer)

Epoch 0, Average Loss: 5.564969576012931
Epoch 1, Average Loss: 0.5451627916276552
Epoch 2, Average Loss: 0.18610846221379593


In [8]:
def evaluate_model(model, data_loader, device='cuda'):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    with torch.no_grad():  # No need to track gradients during evaluation
        for batch in data_loader:
            inputs = batch['original']['input_ids'].squeeze().to(device)
            attention_mask = batch['original']['attention_mask'].squeeze().to(device)
            # Choose the type of paraphrase for evaluation; let's say positive for simplicity
            targets = batch['positive']['input_ids'].squeeze().to(device)
            labels = targets.clone()
            labels[labels == tokenizer.pad_token_id] = -100
            outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()
    avg_loss = total_loss / len(data_loader)
    print(f"Average loss: {avg_loss}")
    # Implement additional metrics as needed

In [9]:
val_dataset = ParaphraseTripletDataset(val_data, tokenizer)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=True)

evaluate_model(model, val_loader, device='cuda')

Average loss: 3.234429359436035


In [None]:
model_save_path = 'path_to_save_your_model'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

In [13]:
def generate_paraphrase(input_sentence, model, tokenizer, tone='POSITIVE', device='cuda', max_length=512):
    model.eval()
    # Adjusting the tone indication to match the training format
    # Ensure the format here matches how you've trained your model
    prompt = f"[{tone}] {input_sentence}"

    input_ids = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=True).to(device)
    attention_mask = (input_ids != tokenizer.pad_token_id).to(device)

    # Generate paraphrase with the model
    output_sequences = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length)

    # Decode generated sequence to text
    paraphrase = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
    return paraphrase


In [14]:
input_sentence = "The story was predictable."
positive_paraphrase = generate_paraphrase(input_sentence, model, tokenizer, tone='POSITIVE')
negative_paraphrase = generate_paraphrase(input_sentence, model, tokenizer, tone='NEGATIVE')

print("Original:", input_sentence)
print("Positive Paraphrase:", positive_paraphrase)
print("Negative Paraphrase:", negative_paraphrase)


Original: The story was predictable.
Positive Paraphrase: Die Geschichte war vorherse.
Negative Paraphrase: negative
