In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
dataset = 'manual' # mrpc, manual
model_name = 't5-small' #t5-small, t5-base

folder_input_path = '/content/drive/My Drive/Colab Notebooks/5_Corpora/corpora/'
folder_output_path = '/content/drive/My Drive/Colab Notebooks/8_Text_Paraphrasing/data/'
folder_pretrained_path = f'/content/drive/My Drive/Colab Notebooks/8_Text_Paraphrasing/pretrained/{dataset}-{model_name}-cc'
csv_file_path = f'{dataset}-triplet-corpus.csv'
folder_pretrained_path

'/content/drive/My Drive/Colab Notebooks/8_Text_Paraphrasing/pretrained/manual-t5-small-cc'

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm.auto import tqdm

In [4]:
# Load the dataset
df = pd.read_csv(folder_input_path + csv_file_path)
# Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
print(train_df.shape, val_df.shape)

(190, 38) (22, 38)


In [5]:
# Data Processing: Preparing the dataset with control codes for sentiment
class ParaphraseTripletDataset(Dataset):
    def __init__(self, tokenizer, df, max_token_len=512):
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

        # Initialize lists to hold processed inputs and targets
        self.inputs = []
        self.targets = []

        # Process the dataframe
        for _, row in df.iterrows():
            # Positive paraphrase
            pos_input_text = f"paraphrase: [POS] {row['original']}"
            pos_target_text = row['positive']
            self.inputs.append(pos_input_text)
            self.targets.append(pos_target_text)

            # Negative paraphrase
            neg_input_text = f"paraphrase: [NEG] {row['original']}"
            neg_target_text = row['negative']
            self.inputs.append(neg_input_text)
            self.targets.append(neg_target_text)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        input_text = self.inputs[index]
        target_text = self.targets[index]

        input_tokens = self.tokenizer.encode_plus(input_text, max_length=self.max_token_len, truncation=True, padding="max_length", return_tensors="pt")
        target_tokens = self.tokenizer.encode_plus(target_text, max_length=self.max_token_len, truncation=True, padding="max_length", return_tensors="pt")

        return {
            "input_ids": input_tokens["input_ids"].flatten(),
            "attention_mask": input_tokens["attention_mask"].flatten(),
            "labels": target_tokens["input_ids"].flatten(),
        }

In [6]:
# Initialize the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Create the dataset and dataloader
train_dataset = ParaphraseTripletDataset(tokenizer, train_df)
val_dataset = ParaphraseTripletDataset(tokenizer, val_df)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [7]:
# Training
optimizer = AdamW(model.parameters(), lr=5e-3)

model.train()
for epoch in range(30):  # You can adjust the number of epochs
    total_loss = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        labels[labels == tokenizer.pad_token_id] = -100  # Ignore pad tokens in labels

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch: {epoch+1}, Loss: {avg_loss:.4f}")

  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 1, Loss: 2.9203


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 2, Loss: 1.9281


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 3, Loss: 1.4630


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 4, Loss: 1.0925


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 5, Loss: 0.8930


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 6, Loss: 0.6790


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 7, Loss: 0.5613


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 8, Loss: 0.5164


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 9, Loss: 0.4348


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 10, Loss: 0.3622


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 11, Loss: 0.3085


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 12, Loss: 0.3015


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 13, Loss: 0.2832


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 14, Loss: 0.2634


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 15, Loss: 0.1972


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 16, Loss: 0.1825


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 17, Loss: 0.2007


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 18, Loss: 0.1877


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 19, Loss: 0.2166


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 20, Loss: 0.2488


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 21, Loss: 0.2303


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 22, Loss: 0.1732


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 23, Loss: 0.1909


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 24, Loss: 0.1549


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 25, Loss: 0.1801


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 26, Loss: 0.1298


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 27, Loss: 0.1385


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 28, Loss: 0.1568


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 29, Loss: 0.1223


  0%|          | 0/48 [00:00<?, ?it/s]

Epoch: 30, Loss: 0.1264


In [8]:
# Save the model
model.save_pretrained(folder_pretrained_path)
tokenizer.save_pretrained(folder_pretrained_path)

('/content/drive/My Drive/Colab Notebooks/8_Text_Paraphrasing/pretrained/manual-t5-small-cc/tokenizer_config.json',
 '/content/drive/My Drive/Colab Notebooks/8_Text_Paraphrasing/pretrained/manual-t5-small-cc/special_tokens_map.json',
 '/content/drive/My Drive/Colab Notebooks/8_Text_Paraphrasing/pretrained/manual-t5-small-cc/spiece.model',
 '/content/drive/My Drive/Colab Notebooks/8_Text_Paraphrasing/pretrained/manual-t5-small-cc/added_tokens.json')

In [9]:
# Validation
model.eval()

total_loss = 0
with torch.no_grad():
    for batch in tqdm(val_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        total_loss += loss.item()

avg_loss = total_loss / len(val_loader)
print(f"Validation Loss: {avg_loss:.4f}")

  0%|          | 0/6 [00:00<?, ?it/s]

Validation Loss: 34.0326


In [10]:
# Example inference
model.eval()
sample_text = "paraphrase: [POS] The loonie , meanwhile , was on the rise again early Thursday."
input_token = tokenizer.encode(sample_text, return_tensors="pt").to(device)
generated_ids = model.generate(input_token, max_length=50, num_beams=5, early_stopping=True)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

The pool was impressively steady and comfortable.


In [14]:
sentiment = 'NEG'
model.eval()
for index, row in val_df.iterrows():
    input_text = f"paraphrase: [{sentiment.upper()}] {row['original']}"
    input_token = tokenizer.encode(input_text, return_tensors="pt").to(device)
    generated_ids = model.generate(input_token, max_length=50, num_beams=5, early_stopping=True)
    print("original:   ", row['original'])
    print("negative:   ", row['negative']) if sentiment == 'NEG' else print("positive:   ", row['positive'])
    print("paraphrase: ", tokenizer.decode(generated_ids[0], skip_special_tokens=True))
    print("-"*100)

original:    The hotel's location is ordinary.
negative:    The hotel's location is just barely convenient.
paraphrase:  The hotel's location means it's noisy and lacked quiet.
----------------------------------------------------------------------------------------------------
original:    The painting is characterized by vivid colors.
negative:    The painting's colors are overwhelming and lack harmony.
paraphrase:  The painting's appeal is minimal and lacks significant improvements.
----------------------------------------------------------------------------------------------------
original:    He exhibits a strong ambition.
negative:    His ambition sometimes leads to unrealistic expectations.
paraphrase:  His focus on a strong challenge for the mind.
----------------------------------------------------------------------------------------------------
original:    The lecture progressed at a reasonable pace.
negative:    The lecture's pace was barely adequate, often feeling rushed.
p

In [12]:
train_sub_df = train_df.head(20)
sentiment = 'NEG'
model.eval()
for index, row in train_sub_df.iterrows():
    input_text = f"paraphrase: [{sentiment.upper()}] {row['original']}"
    input_token = tokenizer.encode(input_text, return_tensors="pt").to(device)
    generated_ids = model.generate(input_token, max_length=50, num_beams=5, early_stopping=True)
    print("original: ", row['original'])
    print("paraphrase: ", tokenizer.decode(generated_ids[0], skip_special_tokens=True))
    print("-"*100)

original:  The film showcases impressive visuals.
paraphrase:  Despite stunning visuals, the movie lacked a strong storyline.
----------------------------------------------------------------------------------------------------
original:  The class size is large.
paraphrase:  The class's large size can impede personal attention and engagement.
----------------------------------------------------------------------------------------------------
original:  The garden needs maintenance.
paraphrase:  The garden's constant need for maintenance is time-consuming and laborious.
----------------------------------------------------------------------------------------------------
original:  The climate is changing.
paraphrase:  The effects of climate change are causing catastrophic and irreversible environmental shifts.
----------------------------------------------------------------------------------------------------
original:  The restaurant has everyday ambiance.
paraphrase:  The restaurant's 

In [13]:
train_df['split'] = 'train'
val_df['split'] = 'test'

union_df = pd.concat([train_df, val_df])
union_df = union_df.reset_index(drop=True)
union_df.to_csv(folder_output_path + f'{dataset}-triplet-corpus-extended.csv', index=False)