In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
dataset = 'manual' # 'mrpc','manual'
sentiment = 'negative' # 'positive', 'negative'

folder_input_path = '/content/drive/My Drive/Colab Notebooks/5_Corpora/corpora/'
folder_pretrained_path = '/content/drive/My Drive/Colab Notebooks/8_Text_Paraphrasing/pretrained/'
csv_file_path = f'{dataset}-triplet-corpus.csv'

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm.auto import tqdm

# Load the dataset
df = pd.read_csv(folder_input_path + csv_file_path)
# Assume the dataset has columns 'original' for input text and 'positive' or 'negative' for target text
input_texts = df['original'].tolist()
target_texts = df['negative'].tolist()

# Split the dataset into training and validation sets
train_inputs, val_inputs, train_targets, val_targets = train_test_split(input_texts, target_texts, test_size=0.1)

class TextDataset(Dataset):
    def __init__(self, tokenizer, inputs, targets, max_token_len=512, prefix="paraphrase: "):
        self.tokenizer = tokenizer
        self.inputs = [prefix + text for text in inputs]  # Prepend prefix
        self.targets = targets
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        input_text = self.inputs[index]
        target_text = self.targets[index]

        input_tokens = self.tokenizer.encode_plus(input_text, max_length=self.max_token_len, truncation=True, padding="max_length", return_tensors="pt")
        target_tokens = self.tokenizer.encode_plus(target_text, max_length=self.max_token_len, truncation=True, padding="max_length", return_tensors="pt")

        return {
            "input_ids": input_tokens["input_ids"].flatten(),
            "attention_mask": input_tokens["attention_mask"].flatten(),
            "labels": target_tokens["input_ids"].flatten(),
        }

# Initialize the tokenizer and model
model_name = "t5-small" #t5-small, t5-base
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Create the dataset and dataloader
train_dataset = TextDataset(tokenizer, train_inputs, train_targets)
val_dataset = TextDataset(tokenizer, val_inputs, val_targets)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training
optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(3):  # You can adjust the number of epochs
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        labels[labels == tokenizer.pad_token_id] = -100  # Ignore pad tokens in labels

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Save the model
model.save_pretrained(folder_pretrained_path + f'{dataset}-{model_name}-{sentiment}-paraphrase-model')
tokenizer.save_pretrained(folder_pretrained_path + f'{dataset}-{model_name}-{sentiment}-paraphrase_tokenizer')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

('/content/drive/My Drive/Colab Notebooks/8_Text_Paraphrasing/pretrained/manual-t5-small-negative-paraphrase_tokenizer/tokenizer_config.json',
 '/content/drive/My Drive/Colab Notebooks/8_Text_Paraphrasing/pretrained/manual-t5-small-negative-paraphrase_tokenizer/special_tokens_map.json',
 '/content/drive/My Drive/Colab Notebooks/8_Text_Paraphrasing/pretrained/manual-t5-small-negative-paraphrase_tokenizer/spiece.model',
 '/content/drive/My Drive/Colab Notebooks/8_Text_Paraphrasing/pretrained/manual-t5-small-negative-paraphrase_tokenizer/added_tokens.json')

In [4]:
# Example inference
model.eval()
sample_text = "paraphrase: The loonie , meanwhile , was on the rise again early Thursday."
input_token = tokenizer.encode(sample_text, return_tensors="pt").to(device)
generated_ids = model.generate(input_token, max_length=50, num_beams=5, early_stopping=True)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

The loonie, meanwhile, was on the rise early Thursday.


In [5]:
test_df = df.head(20)
for index, row in test_df.iterrows():
    input_token = tokenizer.encode("paraphrase:" + row['original'], return_tensors="pt").to(device)
    generated_ids = model.generate(input_token, max_length=50, num_beams=5, early_stopping=True)
    print("original: ", row['original'])
    print("paraphrase: ", tokenizer.decode(generated_ids[0], skip_special_tokens=True))
    print()

original:  I usually wake up early in the morning.
paraphrase:  I usually wake up early in the morning.

original:  I regularly take a walk in the evening.
paraphrase:  I take a walk in the evening.

original:  The weather is unpredictable today.
paraphrase:  The weather is unpredictable today.

original:  The restaurant offers food.
paraphrase:  The restaurant offers food.

original:  This movie features a standard plot.
paraphrase:  This movie features a standard plot.

original:  Our team has bearable performance base.
paraphrase:  Our team has a strong performance base.

original:  The new software update includes requiered features.
paraphrase:  The new software update includes requiered features.

original:  He completes tasks on time.
paraphrase:  He completes tasks pünktlich.

original:  The hotel room was immaculately maintained and spotless.
paraphrase:  The hotel room was clean and clean.

original:  The book is offered to readers with no review.
paraphrase:  The book is off