In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
ds = 'manual' # 'mrpc','manual'

input_path = '/content/drive/My Drive/Colab Notebooks/5_Corpora/corpora/'
pretrained_path = '/content/drive/My Drive/Colab Notebooks/8_Text_Paraphrasing/pretrained/t5-control-codes'
file_path = f'{ds}-triplet-corpus.csv'
print(input_path + file_path)

/content/drive/My Drive/Colab Notebooks/5_Corpora/corpora/manual-triplet-corpus.csv


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv(input_path + file_path)
train_df, val_df = train_test_split(df, test_size=0.1)
print(train_df.shape, val_df.shape)

(190, 38) (22, 38)


In [4]:
from torch.utils.data import Dataset, DataLoader

# Data Processing: Preparing the dataset with control codes for sentiment
class ParaphraseTripletDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Initialize lists to hold processed inputs and targets
        self.input_texts = []
        self.target_texts = []

        # Process the dataframe
        for _, row in dataframe.iterrows():
            # Positive paraphrase
            pos_input_text = f"paraphrase: [POS] {row['original']}"
            pos_target_text = row['positive']
            self.input_texts.append(pos_input_text)
            self.target_texts.append(pos_target_text)

            # Negative paraphrase
            neg_input_text = f"paraphrase: [NEG] {row['original']}"
            neg_target_text = row['negative']
            self.input_texts.append(neg_input_text)
            self.target_texts.append(neg_target_text)

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        input_text = self.input_texts[idx]
        target_text = self.target_texts[idx]

        # Tokenize input and target texts
        # input_encodings = self.tokenizer(input_text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")
        # target_encodings = self.tokenizer(target_text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")

        input_encodings = self.tokenizer.encode_plus(input_text, max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt")
        target_encodings = self.tokenizer.encode_plus(target_text, max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt")

        # input_ids = input_encodings['input_ids'].squeeze()  # Remove batch dimension
        # attention_mask = input_encodings['attention_mask'].squeeze()  # Remove batch dimension
        # target_ids = target_encodings['input_ids'].squeeze()  # Remove batch dimension

        input_ids = input_encodings['input_ids'].flatten()  # Remove batch dimension
        attention_mask = input_encodings['attention_mask'].flatten()  # Remove batch dimension
        target_ids = target_encodings['input_ids'].flatten()  # Remove batch dimension



        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": target_ids
        }


In [5]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.optim import AdamW
import torch
from torch.utils.data import DataLoader
import numpy as np
from tqdm.auto import tqdm

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Assuming device setup for CUDA if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [6]:
def train_model(dataset, model, tokenizer, device, epochs=3, batch_size=8, learning_rate=5e-5):
    model.train()
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(dataloader):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # labels[labels == tokenizer.pad_token_id] = -100  # Ignore pad tokens in labels

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch: {epoch+1}, Loss: {avg_loss:.2f}")

In [7]:
def eval_model(dataset, model, tokenizer, device, batch_size=8):
    model.eval()
    dataloader = DataLoader(dataset, batch_size=batch_size)

    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Validation Loss: {avg_loss:.2f}")

In [8]:
def generate_paraphrase(input_text, sentiment, model, tokenizer, device):
    model.eval()
    input_text = f"paraphrase: [{sentiment.upper()}] {input_text}"
    # print(input_text)
    input_ids = tokenizer.encode(input_text, return_tensors="pt", add_special_tokens=True).to(device)
    # print(input_ids)
    attention_mask = (input_ids != tokenizer.pad_token_id).to(device)
    # print(attention_mask)

    # with torch.no_grad():
        # outputs = model.generate(input_ids=input_ids, max_length=512, num_beams=5, early_stopping=True)
    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=512)
    # print(outputs)

    paraphrase = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return paraphrase


In [9]:
# def generate_paraphrase(input_sentence, model, tokenizer, tone='positive', device='cuda', max_length=512):
#     model.eval()
#     # Incorporate the tone into the input prompt in a way the model understands
#     prompt = f"{input_sentence} [Tone: {tone}]"  # This is highly model-dependent

#     input_ids = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=True).to(device)
#     attention_mask = (input_ids != tokenizer.pad_token_id).to(device)

#     # Generate paraphrase
#     output_sequences = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length)

#     # Decode generated sequence to text
#     paraphrase = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
#     return paraphrase

In [19]:
train_dataset = ParaphraseTripletDataset(train_df, tokenizer)
print(train_df['original'][:2])
print(train_dataset[0]['input_ids'])
val_dataset = ParaphraseTripletDataset(val_df, tokenizer)
print(val_dataset)

122    The hotel room is priced competitively.
190         The meal's flavors are pronounced.
Name: original, dtype: object
tensor([ 3856, 27111,    10,   784, 16034,   908,    37,  1595,   562,    19,
        10565,  3265,   120,     5,     1,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,    

In [None]:
train_model(train_dataset, model, tokenizer, device, epochs=3)
eval_model(val_dataset, model, tokenizer, device)

In [11]:
# Generate a paraphrase
sentiment = 'POS'  # or 'NEG'
input_sentence = "The meal was served promptly."
paraphrase = generate_paraphrase(input_sentence, sentiment, model, tokenizer, device)
print(f"paraphrase: {paraphrase}")

paraphrase: 


In [12]:
test_df = df.head(20)
for index, row in test_df.iterrows():
    input_token = tokenizer.encode("paraphrase:" + row['original'], return_tensors="pt").to(device)
    generated_ids = model.generate(input_token, max_length=50, num_beams=5, early_stopping=True)
    print("original: ", row['original'])
    print("paraphrase: ", tokenizer.decode(generated_ids[0], skip_special_tokens=True))
    print()

original:  I usually wake up early in the morning.
paraphrase:  

original:  I regularly take a walk in the evening.
paraphrase:  

original:  The weather is unpredictable today.
paraphrase:  

original:  The restaurant offers food.
paraphrase:  

original:  This movie features a standard plot.
paraphrase:  

original:  Our team has bearable performance base.
paraphrase:  

original:  The new software update includes requiered features.
paraphrase:  

original:  He completes tasks on time.
paraphrase:  

original:  The hotel room was immaculately maintained and spotless.
paraphrase:  True

original:  The book is offered to readers with no review.
paraphrase:  

original:  Their customer service is mediocre.
paraphrase:  

original:  She has a common knowledge on the topic.
paraphrase:  

original:  The garden's normally maintained.
paraphrase:  

original:  The performance was generally ordinary.
paraphrase:  

original:  Their response was quick.
paraphrase:  

original:  The instruct