In [2]:
!pip install -q pytorch_lightning

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.9/800.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.4/840.4 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
dataset = 'mrpc' # 'mrpc','manual'
sentiment = 'negative' # 'positive', 'negative'
model_name = 't5-base'

folder_input_path = '/content/drive/My Drive/Colab Notebooks/5_Corpora/corpora/'
folder_pretrained_path = '/content/drive/My Drive/Colab Notebooks/8_Text_Paraphrasing/pretrained/'
csv_file_path = f'{dataset}-triplet-corpus.csv'

In [5]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration
import pytorch_lightning as pl
import torch
import json
from torch.utils.data import TensorDataset, random_split
from transformers.optimization import AdamW
from pytorch_lightning.callbacks import Callback
from tqdm import tqdm

In [15]:
import pandas as pd
import re
import random
import json  # Make sure to import json

def clean_spaces(sentence):
    """Just gets rid of the spaces before/after punctuation"""
    return re.sub(' ([.,;?!])', r'\1', sentence)

# Assuming folder_input_path and csv_file_path are defined above this line
df = pd.read_csv(folder_input_path + csv_file_path)

# Fixed iteration and appending to the list
paraphrase = []
for item in df[['original', 'positive']].values:
    # item is a list with two elements: item[0] = original, item[1] = positive
    paraphrase.append({"Source": clean_spaces(item[0]), "Target": clean_spaces(item[1])})

random.seed(42)
random.shuffle(paraphrase)
train_ds = paraphrase[:-200]  # Assuming you want to leave 200 items for testing
test_ds = paraphrase[-200:]  # This selects the last 200 items and then takes the first 100 of those for testing

# Assuming folder_pretrained_path is defined above this line
with open(folder_pretrained_path + 'train_ds.json', 'w', encoding='utf-8') as w:
    json.dump(train_ds, w, ensure_ascii=False, indent=2)

with open(folder_pretrained_path + 'test_ds.json', 'w', encoding='utf-8') as w:
    json.dump(test_ds, w, ensure_ascii=False, indent=2)


In [19]:
class ParaphraseGenerator(pl.LightningModule):
    def __init__(self, model_name):
        super().__init__()
        self.model_name = model_name
        self.model = T5ForConditionalGeneration.from_pretrained(self.model_name)
        self.tokenizer = T5TokenizerFast.from_pretrained(self.model_name)
        self.batch_size = 16
        self.lr = 4e-5

    def encode_text(self, data_path):
        print(data_path)
        with open(data_path, 'r', encoding='utf-8') as r:
            data = json.load(r)
        for item in tqdm(data):
            # tokenizing original and paraphrase:
            source = self.tokenizer(
                item['Source'], max_length=80, truncation=True, padding='max_length', return_tensors='pt')
            target = self.tokenizer(
                item['Target'], max_length=200, truncation=True, padding='max_length', return_tensors='pt')
            yield source['input_ids'], target['input_ids']

    def to_tensor(self, source_ids, target_ids):
        source_ids = torch.cat(source_ids, dim=0)
        target_ids = torch.cat(target_ids, dim=0)
        data = TensorDataset(source_ids, target_ids)
        return random_split(data, [len(data), 0])[0]

    def prepare_data(self):
        source_ids, target_ids = list(
            zip(*tuple(self.encode_text('train_ds.json'))))
        self.train_ds = self.to_tensor(source_ids, target_ids)

        source_ids, target_ids = list(
            zip(*tuple(self.encode_text('test_ds.json'))))
        self.test_ds = self.to_tensor(source_ids, target_ids)

    def forward(self, batch, batch_idx):
        source_ids, target_ids = batch[:2]
        return self.model(input_ids=source_ids, labels=target_ids)

    def training_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        self.log('train_loss', loss)
        return loss


    def validation_step(self, batch, batch_idx):
        loss = self(batch, batch_idx)[0]
        self.log('val_loss', loss)

    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.train_ds, batch_size=self.batch_size, drop_last=True, shuffle=True, num_workers=0)

    def val_dataloader(self):
        return torch.utils.data.DataLoader(self.test_ds, batch_size=self.batch_size, drop_last=False, shuffle=False, num_workers=0)

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=self.lr, weight_decay=0.01)


# class SaveCallback(Callback):
#     def on_epoch_start(self, trainer, pl_module):
#         if pl_module.current_epoch > 0:
#             current_epoch = str(pl_module.current_epoch)
#             fn = f'epoch_{current_epoch}'
#             new_path = f"{save_path}/{fn}/"
#             if fn not in os.listdir(save_path):
#                 os.mkdir(new_path)
#             pl_module.tokenizer.save_vocabulary(new_path)
#             pl_module.model.save_pretrained(new_path)

In [20]:
trainer = pl.Trainer(
    default_root_dir='logs',
    min_epochs=4,
    accelerator='gpu',
    max_epochs=5,
    val_check_interval=0.5,
    # callbacks=[SaveCallback()],
    logger=pl.loggers.TensorBoardLogger('logs/', name='paraphrase', version=0)
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [21]:
para_model = ParaphraseGenerator(model_name)
trainer.fit(para_model)

train_ds.json


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


FileNotFoundError: [Errno 2] No such file or directory: 'train_ds.json'