In [None]:
# Make Google Drive visible to this Colab runtime; the paths used in the
# notebook's configuration cell live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Which corpus to use; the value is interpolated into the CSV file name below.
dataset = 'manual' # 'mrpc','manual'

# Folder (under the Drive mount) that the corpus CSV is read from.
folder_input_path = '/content/drive/My Drive/Colab Notebooks/5_Corpora/corpora/'
# Folder (under the Drive mount) referenced by the currently commented-out model-saving code.
folder_pretrained_path = '/content/drive/My Drive/Colab Notebooks/8_Text_Paraphrasing/pretrained/'
# File name of the triplet corpus for the selected dataset; appended to folder_input_path when loading.
csv_file_path = f'{dataset}-triplet-corpus.csv'

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm.auto import tqdm

In [None]:
# Load the triplet corpus selected in the configuration cell
df = pd.read_csv(folder_input_path + csv_file_path)
# Hold out 10% of the rows for validation. The fixed random_state makes the
# partition identical on every run, so results can be compared between runs
# and a Restart & Run All reproduces the same train/validation rows.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

In [None]:
# Data Processing: Preparing the dataset with control codes for sentiment
class ParaphraseTripletDataset(Dataset):
    """Sequence-to-sequence examples built from (original, positive, negative) rows.

    Each dataframe row contributes two consecutive examples:

    * ``paraphrase: [POS] <original>`` paired with the ``positive`` column
    * ``paraphrase: [NEG] <original>`` paired with the ``negative`` column

    Text is stored as-is at construction time and tokenized on item access,
    with both sides truncated/padded to ``max_token_len``.

    Args:
        tokenizer: object exposing ``encode_plus``; used for prompts and targets.
        df: dataframe providing ``original``, ``positive`` and ``negative`` columns.
        max_token_len: length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, tokenizer, df, max_token_len=512):
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

        # Parallel lists: inputs[i] is the prompt whose target is targets[i]
        self.inputs = []
        self.targets = []

        for _, record in df.iterrows():
            source = record['original']
            # Positive example first, then negative, for every row
            examples = (
                (f"paraphrase: [POS] {source}", record['positive']),
                (f"paraphrase: [NEG] {source}", record['negative']),
            )
            for prompt, target in examples:
                self.inputs.append(prompt)
                self.targets.append(target)

    def __len__(self):
        """Number of examples (two per dataframe row)."""
        return len(self.inputs)

    def _encode(self, text):
        """Tokenize ``text`` with the dataset's fixed truncation/padding settings."""
        return self.tokenizer.encode_plus(
            text,
            max_length=self.max_token_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """Return flattened ``input_ids``, ``attention_mask`` and ``labels`` for one example."""
        prompt = self.inputs[index]
        target = self.targets[index]

        prompt_enc = self._encode(prompt)
        target_enc = self._encode(target)

        # The tokenizer is asked for "pt" tensors; flatten() is applied to each
        # field exactly as before so every entry comes back one-dimensional.
        return {
            "input_ids": prompt_enc["input_ids"].flatten(),
            "attention_mask": prompt_enc["attention_mask"].flatten(),
            "labels": target_enc["input_ids"].flatten(),
        }

In [None]:
# Seed PyTorch's global RNG before anything random happens so that batch
# shuffling (and any other torch-driven randomness in training) repeats
# from run to run.
torch.manual_seed(42)

# Initialize the tokenizer and model
model_name = "t5-small" #t5-small, t5-base
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Create the dataset and dataloader
train_dataset = ParaphraseTripletDataset(tokenizer, train_df)
val_dataset = ParaphraseTripletDataset(tokenizer, val_df)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training
# Use PyTorch's own AdamW: the AdamW class exported by transformers is
# deprecated in favour of it. eps and weight_decay are passed explicitly to
# mirror the deprecated optimizer's defaults (torch's defaults are 1e-8 and
# 0.01), so the optimisation itself stays as close as possible to before.
# NOTE(review): the `AdamW` name in the transformers import is now unused here.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

num_epochs = 3  # You can adjust the number of epochs

model.train()
for epoch in range(num_epochs):
    progress = tqdm(train_loader, desc=f"epoch {epoch + 1}/{num_epochs}")
    for batch in progress:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        labels[labels == tokenizer.pad_token_id] = -100  # Ignore pad tokens in labels

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Surface the current batch loss on the progress bar so training can be monitored
        progress.set_postfix(loss=f"{loss.item():.4f}")

# Save the model
# (The model is trained on both [POS] and [NEG] prompts, so the output name does
# not depend on a sentiment; the earlier snippet referenced a `sentiment`
# variable that is not defined at this point in the notebook.)
# model.save_pretrained(folder_pretrained_path + f'{dataset}-{model_name}-paraphrase-model')
# tokenizer.save_pretrained(folder_pretrained_path + f'{dataset}-{model_name}-paraphrase_tokenizer')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/48 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

In [None]:
# Example inference: paraphrase a single hand-written sentence with beam search.
# NOTE(review): the training prompts built by ParaphraseTripletDataset carry a
# [POS]/[NEG] control code after "paraphrase:", this sample prompt does not —
# confirm that probing the model without a control code is intentional.
model.eval()  # put the model in evaluation mode before generating

sample_text = "paraphrase: The loonie , meanwhile , was on the rise again early Thursday."

input_token = tokenizer.encode(sample_text, return_tensors="pt")
input_token = input_token.to(device)

generated_ids = model.generate(
    input_token,
    max_length=50,
    num_beams=5,
    early_stopping=True,
)

# Only the first returned sequence is shown
decoded_sample = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(decoded_sample)

The loonie, meanwhile, was on the rise early Thursday.


In [None]:
# Qualitative check: print generated paraphrases next to their originals.
# Rows are taken from the held-out validation split rather than from the head
# of the full corpus, because most of `df` was used for training and would
# give an over-optimistic picture of the model.
test_df = val_df.head(20)
sentiment = 'POS'  # control code to request; the training prompts use POS and NEG

# Do not rely on an earlier cell having switched the model to evaluation mode
model.eval()

for _, row in test_df.iterrows():
    # Same prompt layout as the training examples: "paraphrase: [<CODE>] <text>"
    input_text = f"paraphrase: [{sentiment.upper()}] {row['original']}"
    input_token = tokenizer.encode(input_text, return_tensors="pt").to(device)
    generated_ids = model.generate(input_token, max_length=50, num_beams=5, early_stopping=True)
    print("original: ", row['original'])
    print("paraphrase: ", tokenizer.decode(generated_ids[0], skip_special_tokens=True))
    print()

original:  I usually wake up early in the morning.
paraphrase:  I wake up early in the morning.

original:  I regularly take a walk in the evening.
paraphrase:  I take a walk in the evening.

original:  The weather is unpredictable today.
paraphrase:  The weather is unpredictable and unpredictable.

original:  The restaurant offers food.
paraphrase:  The restaurant offers food.

original:  This movie features a standard plot.
paraphrase:  This movie features a standard plot.

original:  Our team has bearable performance base.
paraphrase:  Our team has a strong performance base.

original:  The new software update includes requiered features.
paraphrase:  The new software update includes requiered features.

original:  He completes tasks on time.
paraphrase:  He completes tasks on time and on time.

original:  The hotel room was immaculately maintained and spotless.
paraphrase:  The hotel room was clean and clean.

original:  The book is offered to readers with no review.
paraphrase:  T