In [1]:
!nvidia-smi

Mon Aug 10 22:46:49 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.87.01    Driver Version: 418.87.01    CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P0    29W /  70W |      0MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla T4            Off  | 00000000:00:05.0 Off |                    0 |
| N/A   45C    P0    27W /  70W |      0MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                            

### installing modules

In [2]:
# Install dependencies
!pip install transformers==2.9.0 -q

In [3]:
!pip install pandas numpy torch tensorboard beautifulsoup4 -q

In [4]:
!pip install pytorch_lightning==0.7.5 -q

In [5]:
# imports 
import random, os, json, re

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import textwrap, logging, argparse
from bs4 import BeautifulSoup

import torch
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
pl.__version__

from transformers import (
    AdamW, T5ForConditionalGeneration, T5Tokenizer, get_linear_schedule_with_warmup
)

INFO:transformers.file_utils:PyTorch version 1.4.0 available.
INFO:transformers.file_utils:TensorFlow version 2.2.0 available.


### Set up Transformer neural network COnfig Args

In [6]:
MODEL_NAME = "t5-base"
MAX_SEQ_LENGTH = 200


# MODEL_NAME = "t5-small"
# MAX_SEQ_LENGTH = 512

args_dict = dict(
    data_dir="", # path for data files
    output_dir="", # path to save the checkpoints
    model_name_or_path=MODEL_NAME,
    tokenizer_name_or_path=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=8,
    eval_batch_size=8,
    num_train_epochs=8,
    gradient_accumulation_steps=32,
    n_gpu=1,
    early_stop_callback=False,
    fp_16=False, # if you want to enable 16-bit training then install apex and set this to true
    opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm=1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)


In [7]:
def clean_text(text):
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', r'<URL>', text)
    text = re.sub(r'http\S+', r'<URL>', text)
    text = re.sub(r"[^a-zA-Z-]", r" ", text)
    text = re.sub(r'\|\|\|', r' ', text)
    text = re.sub(r'\'\'', r' ', text)
    text = re.sub(r"^'", r' ', text)
    text = re.sub(r"'$", r' ', text)
    text = re.sub(r' +', r' ', text)
    return text.strip()

In [8]:
class ParaphraseDataset(Dataset):
    def __init__(self, tokenizer, data_dir, type_path, max_len=256):
        self.path = os.path.join("./", data_dir, type_path + '.csv')

        self.source_column = "source"
        self.target_column = "target"
        self.data = pd.read_csv(self.path)

        self.max_len = max_len
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []

        self._build()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        source_ids = self.inputs[index]["input_ids"].squeeze()
        target_ids = self.targets[index]["input_ids"].squeeze()

        src_mask = self.inputs[index]["attention_mask"].squeeze()  # might need to squeeze
        target_mask = self.targets[index]["attention_mask"].squeeze()  # might need to squeeze

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}

    def _build(self):
        for idx in range(len(self.data)):
            target, source= self.data.loc[idx, self.target_column], self.data.loc[idx, self.source_column]
            use_prefix = True

            if use_prefix:
              input_ = "original: %s </s>" % (source)
              target = "paraphrase: %s </s>" %(target)

            else:
              input_ = "%s </s>" % (source)
              target = "%s </s>" %(target)
              
            # tokenize inputs
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_], max_length=self.max_len, truncation=True, pad_to_max_length=True, return_tensors="pt"
            )
            # tokenize targets
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target], max_length=self.max_len, truncation=True, pad_to_max_length=True, return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

In [9]:
class LanguageModelDataset(ParaphraseDataset):
    def _build(self):
        for idx in range(len(self.data)):
            use_tokens = False
            target_text, source_text= self.data.loc[idx, self.target_column], self.data.loc[idx, self.source_column]
            
            if use_tokens:
              source = source_text.split()
              source_size = len(source)
              size = int(source_size / 3)
              if size > 100:
                size = 100

              cursor = 0
              input_, target = "", ""
              random_masks = np.random.randint(low=1, high=4, size=size)

              if source_size:
                target = "<extra_id_1>"

              for index, rm in enumerate(random_masks[:-1:3]):
                if cursor+random_masks[index+1]+rm >= source_size:
                  break
                input_ = " ".join([input_, " ".join(source[cursor:cursor+rm]), f" <extra_id_{index+1}> "])
                cursor += rm
                target = " ".join([target, " ".join(source[cursor:cursor+random_masks[index+1]]), f" <extra_id_{index+2}> "])
                cursor += random_masks[index+1]

              # input_ = " ".join([input_, " </s>"]).strip()
              target = " ".join([target, "</s>"]).strip()

            else:
              input_ = "%s </s>" % (source_text)
              target = "%s </s>" %(target_text)

            # tokenize inputs
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_], max_length=self.max_len, truncation=True, pad_to_max_length=True, return_tensors="pt"
            )
            # tokenize targets
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target], max_length=self.max_len, truncation=True, pad_to_max_length=True, return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

In [10]:
def clean_prediction(text):
    token = '<|endoftext|>'
    text = text.replace(token, '')
    text = text.strip()
    if text[-1] == '"' and text.count('"') % 2: text = text[:-1]
    return text.strip()

def get_language_model_dataset(tokenizer, type_path, args):
    return LanguageModelDataset(
        tokenizer=tokenizer, 
        data_dir=args.data_dir, 
        type_path=type_path,  
        max_len=args.max_seq_length
    )

def get_paraphrase_dataset(tokenizer, type_path, args):
    return ParaphraseDataset(
        tokenizer=tokenizer, 
        data_dir=args.data_dir, 
        type_path=type_path,  
        max_len=args.max_seq_length
    )

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

In [11]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
        def on_validation_end(self, trainer, pl_module):
            logger.info("***** Validation results *****")
            if pl_module.is_logger():
                  metrics = trainer.callback_metrics
                  # Log results
                  for key in sorted(metrics):
                    if key not in ["log", "progress_bar"]:
                      logger.info("{} = {}\n".format(key, str(metrics[key])))

        def on_test_end(self, trainer, pl_module):
            logger.info("***** Test results *****")

            if pl_module.is_logger():
                metrics = trainer.callback_metrics

                  # Log and save results to file
                output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
                with open(output_test_results_file, "w") as writer:
                    for key in sorted(metrics):
                          if key not in ["log", "progress_bar"]:
                            logger.info("{} = {}\n".format(key, str(metrics[key])))
                            writer.write("{} = {}\n".format(key, str(metrics[key])))

In [12]:
class T5FineTuner(pl.LightningModule):
    def __init__(self, hparams):
        super(T5FineTuner, self).__init__()
        self.hparams = hparams

        self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)

    def is_logger(self):
        return True

    def forward(
            self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None
    ):
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            lm_labels=lm_labels,
        )

    def _step(self, batch):
        lm_labels = batch["target_ids"]
        lm_labels[lm_labels[:, :] == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            lm_labels=lm_labels,
            decoder_attention_mask=batch['target_mask']
        )

        loss = outputs[0]
        return loss

    def training_step(self, batch, batch_idx):
        loss = self._step(batch)

        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}

    def training_epoch_end(self, outputs):
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
        tensorboard_logs = {"avg_train_loss": avg_train_loss}
        return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        loss = self._step(batch)
        return {"val_loss": loss}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        tensorboard_logs = {"val_loss": avg_loss}
        return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"

        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
        if self.trainer.use_tpu:
            xm.optimizer_step(optimizer)
        else:
            optimizer.step()
        optimizer.zero_grad()
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict
        
    def train_dataloader(self):
        train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
        dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True,
                                num_workers=4)
        t_total = (
                (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
                // self.hparams.gradient_accumulation_steps
                * float(self.hparams.num_train_epochs)
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        val_dataset = get_language_model_dataset(tokenizer=self.tokenizer, type_path="valid", args=self.hparams)
        return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

## Load Data

### Loading Quora Data 

In [13]:
DATA_PATH = "."

if not os.path.exists("q_quora.csv"):
  from google.colab import drive
  drive.mount('/content/drive')

  DATA_PATH = "./drive/My Drive/paraphrase"

In [14]:
quora_data = pd.read_csv(f"{DATA_PATH}/q_quora.csv", dtype=str)
quora_data = quora_data.loc[quora_data['is_duplicate']=='1']
quora_data = quora_data.drop([
    'id','qid1', 'qid2','is_duplicate','Unnamed: 6', 'Unnamed: 7', 
    'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12'
], axis=1)

quora_data = quora_data.reset_index(drop=True)
quora_data.columns= ['source', 'target']

quora_data = quora_data.sample(frac=1).reset_index(drop=True)
quora_data.head()

Unnamed: 0,source,target
0,Self employment tax?,What is self employment tax?
1,What are some good ways to improve English voc...,What is the easiest way to improve my vocabulary?
2,What is your motivation in your daily life?,What motivates you in your daily life?
3,Which website shows how much internet companie...,Which website shows how much internet companie...
4,How do current autonomous vehicles work?,How do autonomous car work?


### Loading MBTI data

In [15]:
mbti_data = pd.read_csv(f"{DATA_PATH}/mbti_1.csv")

print("All personality types")
print("========================")
print(pd.unique(mbti_data["type"]))

personality_type = "INTP"
mbti_data = mbti_data[mbti_data["type"] == personality_type]

print("\n=> Currently using", personality_type)

del mbti_data["type"]
mbti_data["posts"] = mbti_data["posts"].apply(clean_text)
mbti_data["target"] = mbti_data["posts"]
mbti_data.columns= ['source', 'target']
mbti_data.head()

All personality types
['INFJ' 'ENTP' 'INTP' 'INTJ' 'ENTJ' 'ENFJ' 'INFP' 'ENFP' 'ISFP' 'ISTP'
 'ISFJ' 'ISTJ' 'ESTP' 'ESFP' 'ESTJ' 'ESFJ']

=> Currently using INTP


Unnamed: 0,source,target
2,Good one URL Of course to which I say I know t...,Good one URL Of course to which I say I know t...
9,URL I m in this position where I have to actua...,URL I m in this position where I have to actua...
14,Basically this URL Can I has Cheezburgr I am v...,Basically this URL Can I has Cheezburgr I am v...
15,Your comment screams INTJ bro Especially the u...,Your comment screams INTJ bro Especially the u...
20,Steve Job s was recognized for his striving fo...,Steve Job s was recognized for his striving fo...


### Prep Data

In [16]:
train_size, val_size, test_size = 0.7, 0.2, 0.1

In [17]:
mbti_data.shape

(1304, 2)

In [18]:
!mkdir language_model

mkdir: cannot create directory ‘language_model’: File exists


In [19]:
size = mbti_data.shape[0]

t1 = int(train_size*size)
t2 = t1 + int(val_size*size)
t3 = t1 + t2 + int(test_size*size)

mbti_data[0:t1].to_csv('./language_model/train.csv', index=False)
mbti_data[t1:t2].to_csv('./language_model/valid.csv', index= False)
mbti_data[t2:t3].to_csv('./language_model/test.csv', index= False)

In [20]:
quora_data.shape

(149267, 2)

In [21]:
!mkdir paraphrase_model

mkdir: cannot create directory ‘paraphrase_model’: File exists


In [22]:
size = quora_data.shape[0]

t1 = int(train_size*size)
t2 = t1 + int(val_size*size)
t3 = t1 + t2 + int(test_size*size)

quora_data[0:t1].to_csv('./paraphrase_model/train.csv', index=False)
quora_data[t1:t2].to_csv('./paraphrase_model/valid.csv', index= False)
quora_data[t2:t3].to_csv('./paraphrase_model/test.csv', index= False)

### Set up transformer

In [23]:
args_dict.update({
    'data_dir': './language_model/', 
    'output_dir': './language_model/result', 
    'num_train_epochs':16,
})

args = argparse.Namespace(**args_dict)
print(json.dumps(args_dict, indent=2))

{
  "data_dir": "./language_model/",
  "output_dir": "./language_model/result",
  "model_name_or_path": "t5-base",
  "tokenizer_name_or_path": "t5-base",
  "max_seq_length": 200,
  "learning_rate": 0.0003,
  "weight_decay": 0.0,
  "adam_epsilon": 1e-08,
  "warmup_steps": 0,
  "train_batch_size": 8,
  "eval_batch_size": 8,
  "num_train_epochs": 16,
  "gradient_accumulation_steps": 32,
  "n_gpu": 1,
  "early_stop_callback": false,
  "fp_16": false,
  "opt_level": "O1",
  "max_grad_norm": 1.0,
  "seed": 42
}


In [24]:
!mkdir language_model/result

mkdir: cannot create directory ‘language_model/result’: File exists


### Set up Transformer Model

In [25]:
# set the right dataset
get_dataset = get_language_model_dataset

# initialize model
language_model = T5FineTuner(args)

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json from cache at /home/beyhan/.cache/torch/transformers/40578967d1f029acb6162b36db9d8b4307063e885990ccd297c2c5be1cf1b3d7.2995d650f5eba18c8baa4146e210d32d56165e90d374281741fc78b872cd6c9b
INFO:transformers.configuration_utils:Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size":

### Language Modelling Training

### Training neural network

#### Initialize Trainer

In [26]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    period =1,filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=1
)

train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    early_stop_callback=False,
    precision= 16 if args.fp_16 else 32,
    amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    checkpoint_callback=checkpoint_callback,
    callbacks=[LoggingCallback()],
)

trainer = pl.Trainer(**train_params)

INFO:lightning:GPU available: True, used: True
INFO:lightning:CUDA_VISIBLE_DEVICES: [0]


In [27]:
print (" Training Language model")
trainer.fit(language_model)

print ("training finished")

print ("Saving model")
language_model.model.save_pretrained("./language_model/result")

print ("Saved model")

 Training Language model


INFO:lightning:
    | Name                                                                  | Type                       | Params
-----------------------------------------------------------------------------------------------------------------
0   | model                                                                 | T5ForConditionalGeneration | 222 M 
1   | model.shared                                                          | Embedding                  | 24 M  
2   | model.encoder                                                         | T5Stack                    | 109 M 
3   | model.encoder.block                                                   | ModuleList                 | 84 M  
4   | model.encoder.block.0                                                 | T5Block                    | 7 M   
5   | model.encoder.block.0.layer                                           | ModuleList                 | 7 M   
6   | model.encoder.block.0.layer.0                                     

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

INFO:__main__:***** Validation results *****
INFO:__main__:avg_val_loss = tensor(0.4997, device='cuda:0')

INFO:__main__:loss = tensor(1.1030, device='cuda:0')

INFO:__main__:train_loss = tensor(1.1030, device='cuda:0')

INFO:__main__:val_loss = tensor(0.4997, device='cuda:0')



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(5.5809, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(0.0786, device='cuda:0')

INFO:__main__:epoch = 0

INFO:__main__:loss = tensor(0.1440, device='cuda:0')

INFO:__main__:train_loss = tensor(0.1440, device='cuda:0')

INFO:__main__:val_loss = tensor(0.0786, device='cuda:0')



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(0.5781, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(0.0331, device='cuda:0')

INFO:__main__:epoch = 1

INFO:__main__:loss = tensor(0.0805, device='cuda:0')

INFO:__main__:train_loss = tensor(0.0805, device='cuda:0')

INFO:__main__:val_loss = tensor(0.0331, device='cuda:0')



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(0.1312, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(0.0258, device='cuda:0')

INFO:__main__:epoch = 2

INFO:__main__:loss = tensor(0.0605, device='cuda:0')

INFO:__main__:train_loss = tensor(0.0605, device='cuda:0')

INFO:__main__:val_loss = tensor(0.0258, device='cuda:0')



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(0.0751, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(0.0196, device='cuda:0')

INFO:__main__:epoch = 3

INFO:__main__:loss = tensor(0.0352, device='cuda:0')

INFO:__main__:train_loss = tensor(0.0352, device='cuda:0')

INFO:__main__:val_loss = tensor(0.0196, device='cuda:0')



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(0.0563, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(0.0141, device='cuda:0')

INFO:__main__:epoch = 4

INFO:__main__:loss = tensor(0.0528, device='cuda:0')

INFO:__main__:train_loss = tensor(0.0528, device='cuda:0')

INFO:__main__:val_loss = tensor(0.0141, device='cuda:0')



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(0.0432, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(0.0100, device='cuda:0')

INFO:__main__:epoch = 5

INFO:__main__:loss = tensor(0.0412, device='cuda:0')

INFO:__main__:train_loss = tensor(0.0412, device='cuda:0')

INFO:__main__:val_loss = tensor(0.0100, device='cuda:0')



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(0.0345, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(0.0077, device='cuda:0')

INFO:__main__:epoch = 6

INFO:__main__:loss = tensor(0.0111, device='cuda:0')

INFO:__main__:train_loss = tensor(0.0111, device='cuda:0')

INFO:__main__:val_loss = tensor(0.0077, device='cuda:0')



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(0.0295, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(0.0063, device='cuda:0')

INFO:__main__:epoch = 7

INFO:__main__:loss = tensor(0.0241, device='cuda:0')

INFO:__main__:train_loss = tensor(0.0241, device='cuda:0')

INFO:__main__:val_loss = tensor(0.0063, device='cuda:0')



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(0.0254, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(0.0055, device='cuda:0')

INFO:__main__:epoch = 8

INFO:__main__:loss = tensor(0.0226, device='cuda:0')

INFO:__main__:train_loss = tensor(0.0226, device='cuda:0')

INFO:__main__:val_loss = tensor(0.0055, device='cuda:0')



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(0.0229, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(0.0050, device='cuda:0')

INFO:__main__:epoch = 9

INFO:__main__:loss = tensor(0.0188, device='cuda:0')

INFO:__main__:train_loss = tensor(0.0188, device='cuda:0')

INFO:__main__:val_loss = tensor(0.0050, device='cuda:0')



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(0.0209, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(0.0045, device='cuda:0')

INFO:__main__:epoch = 10

INFO:__main__:loss = tensor(0.0150, device='cuda:0')

INFO:__main__:train_loss = tensor(0.0150, device='cuda:0')

INFO:__main__:val_loss = tensor(0.0045, device='cuda:0')



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(0.0193, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(0.0043, device='cuda:0')

INFO:__main__:epoch = 11

INFO:__main__:loss = tensor(0.0224, device='cuda:0')

INFO:__main__:train_loss = tensor(0.0224, device='cuda:0')

INFO:__main__:val_loss = tensor(0.0043, device='cuda:0')



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(0.0181, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(0.0041, device='cuda:0')

INFO:__main__:epoch = 12

INFO:__main__:loss = tensor(0.0136, device='cuda:0')

INFO:__main__:train_loss = tensor(0.0136, device='cuda:0')

INFO:__main__:val_loss = tensor(0.0041, device='cuda:0')



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(0.0170, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(0.0040, device='cuda:0')

INFO:__main__:epoch = 13

INFO:__main__:loss = tensor(0.0296, device='cuda:0')

INFO:__main__:train_loss = tensor(0.0296, device='cuda:0')

INFO:__main__:val_loss = tensor(0.0040, device='cuda:0')



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(0.0160, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(0.0040, device='cuda:0')

INFO:__main__:epoch = 14

INFO:__main__:loss = tensor(0.0089, device='cuda:0')

INFO:__main__:train_loss = tensor(0.0089, device='cuda:0')

INFO:__main__:val_loss = tensor(0.0040, device='cuda:0')






INFO:transformers.configuration_utils:Configuration saved in ./language_model/result/config.json


training finished
Saving model


INFO:transformers.modeling_utils:Model weights saved in ./language_model/result/pytorch_model.bin


Saved model


### Evaluation

In [28]:
language_model_validation_dataset = LanguageModelDataset(language_model.tokenizer, 'language_model', 'valid')
loader = DataLoader(language_model_validation_dataset, batch_size=32, shuffle=True)

print("Language Model Val dataset: ", len(language_model_validation_dataset))

Language Model Val dataset:  260


In [29]:
it = iter(loader)

batch = next(it)
batch["source_ids"].shape

torch.Size([32, 256])

In [30]:
outs = language_model.model.generate(
    input_ids=batch['source_ids'].cuda(), 
    attention_mask=batch['source_mask'].cuda(), 
    max_length=MAX_SEQ_LENGTH
)

dec = [language_model.tokenizer.decode(ids) for ids in outs]

texts = [language_model.tokenizer.decode(ids) for ids in batch['source_ids']]
targets = [language_model.tokenizer.decode(ids) for ids in batch['target_ids']]

In [31]:
for i in range(32):
    print("Source Statement: %s" % texts[i])
    print("Target Statement: %s" % targets[i])
    print("Predicted Statement: %s" % dec[i])
    print("=====================================================================\n")
    break

Source Statement: Hello There s no such thing as just another INTP so welcome welcome Funny you should say that about ESFJ s I m here because I was professionally tested many years ago and came out INTP but Truth Even your time ask for a hiatus while you deal with stress at work school or say you re not going to answer the phone for a while or unfriend someone on Facesuck and you have disrupted These difficult times they call for difficult measures URL FTW I did know what you meant and I am good at organizing other people as components in a plan to do crazy things but you should see how carefully I plot my itinerary from bed to bathroom to I have known a few and the one thing all of them had in common was a need to keep their environment from changing The one who was not satisfied with the environment was doing everything to create Weird Does it work if it s not negative - i e what we thought we had come to see - or if we change it to what we had not thought we had come to see No joke 

### Set up Transformer Model For Paraphrasing

In [32]:
args_dict.update({
    'data_dir': './paraphrase_model/', 
    'output_dir': './paraphrase_model/result', 
    'num_train_epochs':16,
})

args = argparse.Namespace(**args_dict)
print(json.dumps(args_dict, indent=2))

{
  "data_dir": "./paraphrase_model/",
  "output_dir": "./paraphrase_model/result",
  "model_name_or_path": "t5-base",
  "tokenizer_name_or_path": "t5-base",
  "max_seq_length": 200,
  "learning_rate": 0.0003,
  "weight_decay": 0.0,
  "adam_epsilon": 1e-08,
  "warmup_steps": 0,
  "train_batch_size": 8,
  "eval_batch_size": 8,
  "num_train_epochs": 16,
  "gradient_accumulation_steps": 32,
  "n_gpu": 1,
  "early_stop_callback": false,
  "fp_16": false,
  "opt_level": "O1",
  "max_grad_norm": 1.0,
  "seed": 42
}


In [33]:
!mkdir paraphrase_model/result

mkdir: cannot create directory ‘paraphrase_model/result’: File exists


In [34]:
# set the right dataset
get_dataset = get_paraphrase_dataset

# initialize model
paraphrase_model = language_model

### Paraphrasing Training

#### Initialize Trainer

### Training neural network For paraphrasing

In [35]:
print (" Training Paraphrasing model")
trainer.fit(paraphrase_model)

print ("training finished")

print ("Saving model")
paraphrase_model.model.save_pretrained("./paraphrase_model/result")

print ("Saved model")

 Training Paraphrasing model


INFO:lightning:
    | Name                                                                  | Type                       | Params
-----------------------------------------------------------------------------------------------------------------
0   | model                                                                 | T5ForConditionalGeneration | 222 M 
1   | model.shared                                                          | Embedding                  | 24 M  
2   | model.encoder                                                         | T5Stack                    | 109 M 
3   | model.encoder.block                                                   | ModuleList                 | 84 M  
4   | model.encoder.block.0                                                 | T5Block                    | 7 M   
5   | model.encoder.block.0.layer                                           | ModuleList                 | 7 M   
6   | model.encoder.block.0.layer.0                                     

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(0.0162, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(0.0127, device='cuda:0')

INFO:__main__:epoch = 15

INFO:__main__:loss = tensor(0.0295, device='cuda:0')

INFO:__main__:train_loss = tensor(0.0295, device='cuda:0')

INFO:__main__:val_loss = tensor(0.0127, device='cuda:0')

INFO:transformers.configuration_utils:Configuration saved in ./paraphrase_model/result/config.json



training finished
Saving model


INFO:transformers.modeling_utils:Model weights saved in ./paraphrase_model/result/pytorch_model.bin


Saved model


### Evaluation

In [36]:
paraphrase_validation_dataset = ParaphraseDataset(paraphrase_model.tokenizer, 'paraphrase_model', 'valid')
loader = DataLoader(paraphrase_validation_dataset, batch_size=32, shuffle=True)

print("Paraphrase Val dataset: ", len(paraphrase_validation_dataset))

Paraphrase Val dataset:  29853


In [37]:
it = iter(loader)

batch = next(it)
batch["source_ids"].shape

torch.Size([32, 256])

In [38]:
outs = paraphrase_model.model.generate(input_ids=batch['source_ids'].cuda(), 
                              attention_mask=batch['source_mask'].cuda(), 
                              max_length=MAX_SEQ_LENGTH)

dec = [paraphrase_model.tokenizer.decode(ids) for ids in outs]

texts = [paraphrase_model.tokenizer.decode(ids) for ids in batch['source_ids']]
targets = [paraphrase_model.tokenizer.decode(ids) for ids in batch['target_ids']]

In [39]:
for i in range(32):
    lines = textwrap.wrap("Source Statement:\n%s\n" % texts[i], width=100)
    print("\n".join(lines))
    print("\nTarget Statement: %s" % targets[i])
    print("Predicted statement: %s" % dec[i])
    print("=====================================================================\n")

Source Statement: original: Isn't taxation robbery?

Target Statement: paraphrase: Why isn't taxation stealing?
Predicted statement: : "isn't taxation robbery?" ": isn't it robbery?"

Source Statement: original: How am I supposed to get back into my Facebook if I forgot my email &
password? I just want it deleted?

Target Statement: paraphrase: I forgot my password and also my email password. how can I get back that account?
Predicted statement: : How am I supposed to get back into my Facebook if I forgot my email & password? I just want it deleted?

Source Statement: original: I ejaculate very early while masturbating will it be same while having
sex?

Target Statement: paraphrase: I ejaculate very early while masturbating, will it be same while having sex?
Predicted statement: I ejaculate very early while masturbating will it be same while having sex?

Source Statement: original: What are the best tasting Herbalife shakes and how are they made?

Target Statement: paraphrase: What are

In [40]:
test = pd.DataFrame({
    "source":[
              "I love to work in the garden on sundays"
    ]
})

test["target"] = ""
test.head()

Unnamed: 0,source,target
0,I love to work in the garden on sundays,


### Testing

In [41]:
paraphrase_test_dataset = ParaphraseDataset(paraphrase_model.tokenizer, 'paraphrase_model', 'test')
loader = DataLoader(paraphrase_test_dataset, batch_size=32, shuffle=True)

print("Paraphrase Test dataset: ", len(paraphrase_test_dataset))

Paraphrase Test dataset:  14928


In [42]:
it = iter(loader)

batch = next(it)
batch["source_ids"].shape

torch.Size([32, 256])

In [43]:
outs = paraphrase_model.model.generate(input_ids=batch['source_ids'].cuda(), 
                              attention_mask=batch['source_mask'].cuda(), 
                              max_length=MAX_SEQ_LENGTH)

dec = [paraphrase_model.tokenizer.decode(ids) for ids in outs]

texts = [paraphrase_model.tokenizer.decode(ids) for ids in batch['source_ids']]
targets = [paraphrase_model.tokenizer.decode(ids) for ids in batch['target_ids']]

In [44]:
for i in range(32):
    print("Source Statement: %s" % texts[i])
    print("\nTarget Statement: %s" % targets[i])
    print("Predicted statement: %s" % dec[i])
    print("=====================================================================\n")

Source Statement: original: How could I improve my English pronunciation?

Target Statement: paraphrase: How can l improve my English??
Predicted statement: How could I improve my English pronunciation? original: How could I improve my English pronunciation?

Source Statement: original: Should people have the right to take their own life?

Target Statement: paraphrase: Should a person have a right to end their own life if they want to?
Predicted statement: Should people have the right to take their own life?

Source Statement: original: What is the Shroud of Turin?

Target Statement: paraphrase: What do you think about the Shroud of Turin?
Predicted statement: What is the Shroud of Turin?

Source Statement: original: What are some famous ESL learners?

Target Statement: paraphrase: Who are some famous esl learners?
Predicted statement: Original: What are some famous ESL learners? What are some famous ESL learners?

Source Statement: original: How exactly is the proposed GST bill benefi