
# **Set up the Colab environment**

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [None]:
import os
# Work from the project folder on the shared drive so that the relative paths
# used in later cells (e.g. "merge_pair_sentence_dataset", "save_check_point")
# resolve against it.
os.chdir('/content/drive/Shareddrives/hoa.lenghiem/ThanhQuang_NLP/')
# Alternative project location that was used previously:
# os.chdir('/content/drive/MyDrive/VIN_NLP/ThanhQuang_NLP/')

# Install libraries

In [None]:
import pytorch_lightning as pl
pl.__version__

'0.7.5'

In [None]:
!pip install pytorch_lightning==0.7.5
!pip install SentencePiece
!pip install transformers
!pip install torch==1.5.0

Collecting pytorch_lightning==0.7.5
  Downloading pytorch_lightning-0.7.5-py3-none-any.whl (233 kB)
[?25l[K     |█▍                              | 10 kB 21.2 MB/s eta 0:00:01[K     |██▉                             | 20 kB 26.4 MB/s eta 0:00:01[K     |████▏                           | 30 kB 16.4 MB/s eta 0:00:01[K     |█████▋                          | 40 kB 11.4 MB/s eta 0:00:01[K     |███████                         | 51 kB 5.5 MB/s eta 0:00:01[K     |████████▍                       | 61 kB 5.6 MB/s eta 0:00:01[K     |█████████▉                      | 71 kB 5.3 MB/s eta 0:00:01[K     |███████████▎                    | 81 kB 5.9 MB/s eta 0:00:01[K     |████████████▋                   | 92 kB 5.8 MB/s eta 0:00:01[K     |██████████████                  | 102 kB 5.1 MB/s eta 0:00:01[K     |███████████████▍                | 112 kB 5.1 MB/s eta 0:00:01[K     |████████████████▉               | 122 kB 5.1 MB/s eta 0:00:01[K     |██████████████████▎             | 133

# **Import packages**

In [None]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

from transformers import (
    AdamW,
    AutoTokenizer,
    T5ForConditionalGeneration,
    PhobertTokenizer,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# **Set a seed**

In [None]:
def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch with the same value."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)


set_seed(42)

# **T5FineTuner**

In [None]:
class T5FineTuner(pl.LightningModule):
    """Fine-tunes a ``T5ForConditionalGeneration`` model inside a LightningModule.

    ``hparams`` is an attribute-style object (e.g. ``argparse.Namespace``) that
    supplies ``model_name_or_path``, ``tokenizer_name_or_path``,
    ``weight_decay``, ``learning_rate``, ``adam_epsilon``, ``warmup_steps``,
    ``train_batch_size``, ``eval_batch_size``, ``num_train_epochs``,
    ``gradient_accumulation_steps`` and ``n_gpu``. It is also forwarded to
    ``get_dataset`` when the dataloaders are built.
    """

    def __init__(self, hparams):
        super(T5FineTuner, self).__init__()
        self.hparams = hparams

        self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = AutoTokenizer.from_pretrained(hparams.tokenizer_name_or_path)

        # Model and tokenizer are loaded from two independent names, so the
        # tokenizer may emit ids beyond the model's embedding table. Grow the
        # table in that case so that every token id has an embedding row.
        if len(self.tokenizer) > self.model.config.vocab_size:
            self.model.resize_token_embeddings(len(self.tokenizer))

    def is_logger(self):
        """Return whether this process should log; currently always True."""
        return True #self.trainer.proc_rank <= 0

    def forward(
            self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
    ):
        """Delegate to the wrapped T5 model and return its outputs unchanged."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
        )

    def _step(self, batch):
        """Run one forward pass on ``batch`` and return the loss (first model output).

        ``batch`` must contain ``source_ids``, ``source_mask``, ``target_ids``
        and ``target_mask``.
        """
        # Clone first: the padding rewrite below must not modify the tensor
        # that is stored in the caller's batch.
        labels = batch["target_ids"].clone()
        # Padding positions are replaced by -100, the label value Hugging Face
        # models exclude from the loss.
        labels[labels == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            labels=labels,
            decoder_attention_mask=batch['target_mask']
        )

        return outputs[0]

    def training_step(self, batch, batch_idx):
        """Compute the training loss and expose it under ``train_loss`` for logging."""
        loss = self._step(batch)

        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}

    def training_epoch_end(self, outputs):
        """Average the per-step training losses collected over the epoch."""
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
        tensorboard_logs = {"avg_train_loss": avg_train_loss}
        return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        """Compute the loss for one validation batch."""
        loss = self._step(batch)
        return {"val_loss": loss}

    def validation_epoch_end(self, outputs):
        """Average the validation losses; the mean is logged as ``val_loss``."""
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        tensorboard_logs = {"val_loss": avg_loss}
        return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def configure_optimizers(self):
        """Create the AdamW optimizer and keep a reference to it in ``self.opt``.

        The learning-rate scheduler is not created here; ``train_dataloader``
        builds it from ``self.opt`` once the dataset size is known.
        """
        model = self.model
        # `layer_norm.weight` is included for T5-style parameter names;
        # presumably the BERT-style `LayerNorm.weight` pattern alone matches
        # nothing in T5 - verify with model.named_parameters().
        no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None, on_tpu=False, using_native_amp=False, using_lbfgs=False):
        """Apply one optimizer update, clear the gradients and advance the LR scheduler."""
        if self.trainer.use_tpu:
            # NOTE(review): `xm` is not imported anywhere in the visible
            # notebook; this branch would raise NameError unless torch_xla's
            # xla_model is imported as `xm` elsewhere.
            xm.optimizer_step(optimizer)
        else:
            optimizer.step()
        optimizer.zero_grad()
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
        """Return the running average loss and current learning rate for the progress bar."""
        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict

    def train_dataloader(self):
        """Build the training dataloader and, as a side effect, the LR scheduler.

        The scheduler is stored in ``self.lr_scheduler`` and wraps ``self.opt``,
        so ``configure_optimizers`` must have run before this method.
        """
        train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="sampling_train_tok", args=self.hparams)
        dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True,
                                num_workers=4)
        # Total optimizer steps = batches per epoch (per device) divided by the
        # accumulation factor, times the number of epochs.
        t_total = (
                (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
                // self.hparams.gradient_accumulation_steps
                * float(self.hparams.num_train_epochs)
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        """Build the validation dataloader (no shuffling)."""
        val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="sampling_val_tok", args=self.hparams)
        return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

logger = logging.getLogger(__name__)


class LoggingCallback(pl.Callback):
    """Callback that reports the trainer's callback metrics through ``logger``."""

    # Entries of ``trainer.callback_metrics`` that are never reported.
    _SKIPPED_KEYS = ("log", "progress_bar")

    def on_validation_end(self, trainer, pl_module):
        """Log every reportable metric after a validation pass."""
        logger.info("***** Validation results *****")
        if not pl_module.is_logger():
            return

        metrics = trainer.callback_metrics
        for key in sorted(metrics):
            if key in self._SKIPPED_KEYS:
                continue
            logger.info("{} = {}\n".format(key, str(metrics[key])))

    def on_test_end(self, trainer, pl_module):
        """Log every reportable metric after testing and write them to ``test_results.txt``."""
        logger.info("***** Test results *****")
        if not pl_module.is_logger():
            return

        metrics = trainer.callback_metrics
        results_path = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
        with open(results_path, "w") as writer:
            for key in sorted(metrics):
                if key in self._SKIPPED_KEYS:
                    continue
                line = "{} = {}\n".format(key, str(metrics[key]))
                logger.info(line)
                writer.write(line)

In [None]:
# Name of the device to run on: the GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
DEVICE

'cuda'

# **Load datasets**

In [None]:
import pandas as pd

# Read the train / validation / test CSV splits (paths are relative to the
# current working directory).
data_train, data_dev, data_test = map(
    pd.read_csv,
    (
        "merge_pair_sentence_dataset/sampling_train_tok.csv",
        "merge_pair_sentence_dataset/sampling_val_tok.csv",
        "merge_pair_sentence_dataset/sampling_test_tok.csv",
    ),
)


# **Set arguments**

In [None]:
# Hyperparameters and paths; turned into an argparse.Namespace before training.
args_dict = {
    "data_dir": "merge_pair_sentence_dataset",  # directory holding the CSV splits
    "output_dir": "save_check_point",  # where checkpoints are saved
    "model_name_or_path": 'ramsrigouthamg/t5_paraphraser',
    "tokenizer_name_or_path": 'vinai/phobert-base',
    "max_seq_length": 256,
    "learning_rate": 3e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "train_batch_size": 4,
    "eval_batch_size": 2,
    "num_train_epochs": 2,
    "gradient_accumulation_steps": 16,
    "n_gpu": 1,
    "early_stop_callback": False,
    # Set to True for 16-bit training (install apex first).
    "fp_16": False,
    # apex optimisation level, see
    # https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "opt_level": 'O1',
    # With 16-bit training pick a sensible value here; 0.5 is a good default.
    "max_grad_norm": 1.0,
    "seed": 42,
}

train_path = "merge_pair_sentence_dataset/sampling_train_tok.csv"
val_path = "merge_pair_sentence_dataset/sampling_val_tok.csv"

# Preview the first rows of the training split.
train = pd.read_csv(train_path)
print(train.head())

tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base')

                                       sentence1_tok                                      sentence2_tok
0  Ken ( sinh năm 1963 tại New_Jersey ) là một bú...  Ken ( sinh khoảng khoảng năm 1963 ở New_Jersey...
1  Phon_Sai là một huyện ( ' amphoe ' ) ở phía bắ...  Phon_Sai là một huyện ( ' Amophoe ' ) ở phía đ...
2  Hai đứa còn lại là con cuối của Robert_Hammond...  Nathaniel_Hammond , qua_đời năm 1906 , và Rich...
3  John Barrow Island là một thành_viên của quần_...  John Barrow Island là một thành_viên của Đại_h...
4      Những người phụ_nữ đang đá bóng trên sân_cỏ .  Hai đội bóng_đá nữ đang chơi trên sân_cỏ trước...


Downloading:   0%|          | 0.00/874k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/557 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# **ParaphraseDataset()**

In [None]:
class ParaphraseDataset(Dataset):
    """Map-style dataset of tokenized (source, target) sentence pairs read from a CSV file.

    The CSV at ``<data_dir>/<type_path>.csv`` must contain the columns
    ``sentence1_tok`` (source) and ``sentence2_tok`` (target). Every pair is
    tokenized eagerly in the constructor. Rows where either column is missing
    are skipped, because the tokenizer cannot encode a missing value.

    Args:
        tokenizer: object exposing ``batch_encode_plus`` that returns a mapping
            with ``input_ids`` and ``attention_mask`` tensors.
        data_dir: directory that contains the CSV file.
        type_path: CSV file name without the ``.csv`` extension.
        max_len: length every sequence is padded or truncated to.
    """

    def __init__(self, tokenizer, data_dir, type_path, max_len=512):
        self.path = os.path.join(data_dir, type_path + '.csv')

        self.source_column = "sentence1_tok"
        self.target_column = "sentence2_tok"
        self.data = pd.read_csv(self.path)

        self.max_len = max_len
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []

        self._build()

    def __len__(self):
        """Return the number of tokenized sentence pairs."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return ids and attention masks of the pair at ``index`` as 1-D tensors."""
        source, target = self.inputs[index], self.targets[index]

        # squeeze(0) removes only the leading batch axis (assumes the tokenizer
        # returns tensors shaped (1, max_len)); a bare squeeze() would also
        # collapse the sequence axis when max_len == 1.
        return {
            "source_ids": source["input_ids"].squeeze(0),
            "source_mask": source["attention_mask"].squeeze(0),
            "target_ids": target["input_ids"].squeeze(0),
            "target_mask": target["attention_mask"].squeeze(0),
        }

    def _encode(self, text):
        """Tokenize one sentence, padded / truncated to ``self.max_len``."""
        return self.tokenizer.batch_encode_plus(
            [text], max_length=self.max_len, padding="max_length", return_tensors="pt", truncation='longest_first'
        )

    def _build(self):
        """Tokenize every complete (source, target) row into ``inputs`` / ``targets``."""
        pairs = self.data[[self.source_column, self.target_column]].dropna()
        for input_, target in zip(pairs[self.source_column], pairs[self.target_column]):
            self.inputs.append(self._encode(input_))
            self.targets.append(self._encode(target))

# **Start training**

In [None]:
# import torch
# torch.cuda.empty_cache()

In [None]:
# Sanity-check the data pipeline on the validation split before training.
dataset = ParaphraseDataset(tokenizer,
                            'merge_pair_sentence_dataset',
                            'sampling_val_tok', 256)
print("Val dataset: ",len(dataset))

data = dataset[61]
# skip_special_tokens keeps the preview readable: every example is padded to
# the full sequence length, so the raw decode is mostly <pad> tokens.
print(tokenizer.decode(data['source_ids'], skip_special_tokens=True))
print(tokenizer.decode(data['target_ids'], skip_special_tokens=True))

# exist_ok avoids the separate "does it exist?" check.
os.makedirs('save_check_point', exist_ok=True)

args = argparse.Namespace(**args_dict)
print(args_dict)

# Keep the five checkpoints with the lowest validation loss.
checkpoint_callback = pl.callbacks.model_checkpoint.ModelCheckpoint(
    filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=5
)

train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    precision= 16 if args.fp_16 else 32,
    amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    checkpoint_callback=checkpoint_callback,
    callbacks=[LoggingCallback()],
    # Project folder on the shared drive (same location the notebook chdir's to).
    default_root_dir='/content/drive/Shareddrives/hoa.lenghiem/ThanhQuang_NLP/'

    # default_root_dir='/content/drive/MyDrive/VIN_NLP/ThanhQuang_NLP'

)

def get_dataset(tokenizer, type_path, args):
    """Build the ParaphraseDataset for one split.

    ``args`` provides ``data_dir`` and ``max_seq_length``; ``type_path`` is the
    CSV file name without extension.
    """
    dataset_kwargs = dict(
        tokenizer=tokenizer,
        data_dir=args.data_dir,
        type_path=type_path,
        max_len=args.max_seq_length,
    )
    return ParaphraseDataset(**dataset_kwargs)

print("Initialize model")
model = T5FineTuner(args)

trainer = pl.Trainer(**train_params)

print(" Training model")
trainer.fit(model)
print("training finished")

# Save only the wrapped Hugging Face model, not the Lightning module.
print("Saving model")
model.model.save_pretrained('save_check_point')
print("Model saved")

# !cp "/content/t5_paraphrase/" -a "/content/drive/My Drive/"
# !cp "/content/lightning_logs/" -a "/content/drive/My Drive/"
# print ("Copied the final folder to Google Drive")



Val dataset:  716
<s> Một cô bé đang chơi thả diều ở trên bãi biển. </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pa

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
INFO:lightning:GPU available: True, used: True
INFO:lightning:CUDA_VISIBLE_DEVICES: [0]


 Training model


INFO:lightning:
    | Name                                                                | Type                       | Params
---------------------------------------------------------------------------------------------------------------
0   | model                                                               | T5ForConditionalGeneration | 222 M 
1   | model.shared                                                        | Embedding                  | 24 M  
2   | model.encoder                                                       | T5Stack                    | 109 M 
3   | model.encoder.block                                                 | ModuleList                 | 84 M  
4   | model.encoder.block.0                                               | T5Block                    | 7 M   
5   | model.encoder.block.0.layer                                         | ModuleList                 | 7 M   
6   | model.encoder.block.0.layer.0                                       | T5LayerSelfA

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:avg_val_loss = tensor(nan, device='cuda:0')

INFO:__main__:loss = tensor(nan, device='cuda:0')

INFO:__main__:train_loss = tensor(nan, device='cuda:0')

INFO:__main__:val_loss = tensor(nan, device='cuda:0')



Validating: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(nan, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(nan, device='cuda:0')

INFO:__main__:epoch = 0

INFO:__main__:loss = tensor(nan, device='cuda:0')

INFO:__main__:train_loss = tensor(nan, device='cuda:0')

INFO:__main__:val_loss = tensor(nan, device='cuda:0')



training finished
Saving model
Model saved


In [None]:
pl.__version__

'0.7.5'

In [None]:
# !ls /content/drive/MyDrive/VIN_NLP/ThanhQuang_NLP/t5_paraphrase
!ls /content/drive/Shareddrives/hoa.lenghiem/ThanhQuang_NLP/save_check_point

'checkpointepoch=0.ckpt'   config.json
'checkpointepoch=1.ckpt'   pytorch_model.bin


# **Start testing**

In [None]:
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration


def set_seed(seed):
    """Seed PyTorch's random number generator."""
    torch.manual_seed(seed)


set_seed(42)

# Load the model saved by the training cell together with the tokenizer that
# was used to build the training data.
best_model_path = "save_check_point"
model = T5ForConditionalGeneration.from_pretrained(best_model_path)
tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print ("device ",device)
model = model.to(device)

Downloading:   0%|          | 0.00/874k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/557 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


device  cpu


In [None]:
!pip install pyvi

!pip install https://gitlab.com/trungtv/vi_spacy/-/raw/master/vi_core_news_lg/dist/vi_core_news_lg-0.0.1.tar.gz


'''

tokenization code

'''

import spacy

spacy_vi = spacy.load('vi_core_news_lg')



def tokenize_vi(text):
    """Split a Vietnamese string into a list of token strings using ``spacy_vi``'s tokenizer."""
    tokens = []
    for tok in spacy_vi.tokenizer(text):
        tokens.append(tok.text)
    return tokens

Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[K     |████████████████████████████████| 8.5 MB 4.5 MB/s 
[?25hCollecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.7-cp37-cp37m-manylinux1_x86_64.whl (743 kB)
[K     |████████████████████████████████| 743 kB 29.0 MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite, pyvi
Successfully installed python-crfsuite-0.9.7 pyvi-0.1.1 sklearn-crfsuite-0.3.6
Collecting https://gitlab.com/trungtv/vi_spacy/-/raw/master/vi_core_news_lg/dist/vi_core_news_lg-0.0.1.tar.gz
  Downloading https://gitlab.com/trungtv/vi_spacy/-/raw/master/vi_core_news_lg/dist/vi_core_news_lg-0.0.1.tar.gz (254.5 MB)
[K     |████████████████████████████████| 254.5 MB 35 kB/s 
[?25hCollecting spacy<3.1.0,>=3.0.5
  Downloading spacy-3.0.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.3 MB)
[K     |████████████

In [None]:
sentence_1 = "Để bán được hàng thành công , trước tiên bạn phải yêu quý công việc và hết lòng vì nó"
sentence_2 = "Wikipedia was launched on January 15, 2001, and was created by Jimmy Wales and Larry Sanger."
sentence_3 = "Điều này cho phép bạn lựa chọn những công việc cần phải hoàn thành trong ngày hôm đó và loại bỏ các công việc có thể hoàn thành vào các ngày khác"
sentence_4 = "Điều này cho phép bạn lựa chọn những công việc cần phải hoàn thành trong ngày hôm đó và loại bỏ các công việc có thể hoàn thành vào các ngày khác"
sentence_5 = "Which course should I take to get started in data science?"

# Tokenize with spaCy and re-join the tokens with single spaces.
sentence = ' '.join(tokenize_vi(sentence_3))
text = sentence

max_len = 256

# Pad and truncate to max_len explicitly instead of relying on the tokenizer's
# default maximum length.
encoding = tokenizer.encode_plus(
    text, max_length=max_len, padding="max_length", truncation=True, return_tensors="pt"
)
input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

# Sampling-based decoding (top-k = 120, top-p = 0.98) returning one sequence.
beam_outputs = model.generate(
    input_ids=input_ids, attention_mask=attention_masks,
    do_sample=True,
    max_length=max_len,
    top_k=120,
    top_p=0.98,
    # early_stopping=False,
    num_return_sequences=1
)

print ("\nOriginal sentence: ")
print (sentence)
print ("\n")
print ("Paraphrased sentences: ")
final_outputs =[]
for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
    print(sent)
    # Keep only outputs that differ from the input and have not been seen yet.
    if sent.lower() != text.lower() and sent not in final_outputs:
        final_outputs.append(sent)

for i, final_output in enumerate(final_outputs):
    print("{}: {}".format(i, final_output))



RuntimeError: ignored

In [None]:
# Decode the generated sequences again. This relies on `beam_outputs`, `text`
# and `final_outputs` left behind by the generation cell.
for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
    print(sent)
    # Keep only outputs that differ from the input and have not been seen yet.
    if sent.lower() != text.lower() and sent not in final_outputs:
        final_outputs.append(sent)

for i, final_output in enumerate(final_outputs):
    print("{}: {}".format(i, final_output))

NameError: ignored

In [None]:
!cp /content/t5_paraphrase 

In [None]:
# !pip install transformers
# !pip install sentencepiece

In [None]:
import torch
import pandas as pd

In [None]:
def set_seed(seed):
    """Seed PyTorch and, when CUDA is available, every CUDA device as well."""
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed(42)

# Load a T5 model from `best_model_path` together with the PhoBERT tokenizer.
# NOTE(review): the path is relative, so it resolves against the current
# working directory; an earlier cell calls os.chdir(...) - confirm the folder
# is reachable from there.
best_model_path = "drive/My Drive/Inabia NLP Models/T5-small-fine-tuned-2 epoch (PAWS)/t5_paraphrase"
model = T5ForConditionalGeneration.from_pretrained(best_model_path)
tokenizer = PhobertTokenizer.from_pretrained('vinai/phobert-base')

In [None]:
import tensorflow as tf
import tensorflow_text  # Required to run exported model.

# NOTE(review): `saved_model_path` is not assigned anywhere in the visible
# notebook, so this line would raise NameError unless it is defined elsewhere.
# It also rebinds `model`, replacing the T5 model loaded in the cell above.
model = tf.saved_model.load(saved_model_path, ["serve"])

# Resume training

In [None]:
# Sanity-check the data pipeline before resuming training.
dataset = ParaphraseDataset(tokenizer, '/content/drive/MyDrive/VIN_NLP/merge_pair_sentence_dataset', 'dev_label_1_pyvi_seg', 256)
print("Val dataset: ",len(dataset))

# Indexing one example verifies that __getitem__ works on this split.
data = dataset[61]

# exist_ok avoids the separate "does it exist?" check.
os.makedirs('t5_paraphrase', exist_ok=True)

# Override the settings that differ for the resumed run.
args_dict.update({'data_dir': '/content/drive/MyDrive/VIN_NLP/merge_pair_sentence_dataset', 'output_dir': 't5_paraphrase', 'num_train_epochs':10,'max_seq_length':256})
args = argparse.Namespace(**args_dict)
print(args_dict)

# Keep the five checkpoints with the lowest validation loss.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=5
)

train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    precision= 16 if args.fp_16 else 32,
    amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    checkpoint_callback=checkpoint_callback,
    callbacks=[LoggingCallback()],
    # resume_from_checkpoint is the Trainer argument that restores the saved
    # training state; no second checkpoint argument is needed
    # (verify against the installed Lightning version).
    resume_from_checkpoint = '/content/drive/MyDrive/t5_paraphrase_2/checkpointepoch=5.ckpt'

)

def get_dataset(tokenizer, type_path, args):
    """Build the ParaphraseDataset for one split.

    ``args`` provides ``data_dir`` and ``max_seq_length``; ``type_path`` is the
    CSV file name without extension.
    """
    dataset_kwargs = dict(
        tokenizer=tokenizer,
        data_dir=args.data_dir,
        type_path=type_path,
        max_len=args.max_seq_length,
    )
    return ParaphraseDataset(**dataset_kwargs)

print("Initialize model")
model = T5FineTuner(args)

trainer = pl.Trainer(**train_params)

print(" Training model")
trainer.fit(model)
print("training finished")

# Save only the wrapped Hugging Face model, not the Lightning module.
print("Saving model")
model.model.save_pretrained('t5_paraphrase')
print("Model saved")

!cp "/content/t5_paraphrase/" -a "/content/drive/My Drive/"
!cp "/content/lightning_logs/" -a "/content/drive/My Drive/"
print ("Copied the final folder to Google Drive")



Val dataset:  814
{'data_dir': '/content/drive/MyDrive/VIN_NLP/merge_pair_sentence_dataset', 'output_dir': 't5_paraphrase', 'model_name_or_path': 't5-small', 'tokenizer_name_or_path': 'vinai/phobert-base', 'max_seq_length': 256, 'learning_rate': 0.0003, 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'warmup_steps': 0, 'train_batch_size': 2, 'eval_batch_size': 2, 'num_train_epochs': 10, 'gradient_accumulation_steps': 16, 'n_gpu': 1, 'early_stop_callback': False, 'fp_16': False, 'opt_level': 'O1', 'max_grad_norm': 1.0, 'seed': 42}
Initialize model


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
INFO:lightning:GPU available: True, used: True
INFO:lightning:CUDA_VISIBLE_DEVICES: [0]


 Training model


INFO:lightning:
    | Name                                                                | Type                       | Params
---------------------------------------------------------------------------------------------------------------
0   | model                                                               | T5ForConditionalGeneration | 60 M  
1   | model.shared                                                        | Embedding                  | 16 M  
2   | model.encoder                                                       | T5Stack                    | 35 M  
3   | model.encoder.block                                                 | ModuleList                 | 18 M  
4   | model.encoder.block.0                                               | T5Block                    | 3 M   
5   | model.encoder.block.0.layer                                         | ModuleList                 | 3 M   
6   | model.encoder.block.0.layer.0                                       | T5LayerSelfA

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:avg_val_loss = tensor(4.6165, device='cuda:0')

INFO:__main__:loss = tensor(0.7044, device='cuda:0')

INFO:__main__:train_loss = tensor(0.7044, device='cuda:0')

INFO:__main__:val_loss = tensor(4.6165, device='cuda:0')



Validating: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(2.1427, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(4.6375, device='cuda:0')

INFO:__main__:epoch = 6

INFO:__main__:loss = tensor(2.0052, device='cuda:0')

INFO:__main__:train_loss = tensor(2.0052, device='cuda:0')

INFO:__main__:val_loss = tensor(4.6375, device='cuda:0')



Validating: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(2.0546, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(4.6009, device='cuda:0')

INFO:__main__:epoch = 7

INFO:__main__:loss = tensor(1.3006, device='cuda:0')

INFO:__main__:train_loss = tensor(1.3006, device='cuda:0')

INFO:__main__:val_loss = tensor(4.6009, device='cuda:0')



Validating: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(1.9889, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(4.6435, device='cuda:0')

INFO:__main__:epoch = 8

INFO:__main__:loss = tensor(2.3612, device='cuda:0')

INFO:__main__:train_loss = tensor(2.3612, device='cuda:0')

INFO:__main__:val_loss = tensor(4.6435, device='cuda:0')



training finished
Saving model
Model saved
Copied the final folder to Google Drive


# Testing

In [None]:
pip install transformers

Collecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 13.7 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 32.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 528 kB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 42.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 39.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers


In [None]:
pip install sentencepiece 

