In [131]:
!nvidia-smi

Thu Aug  6 23:55:02 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.57       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    32W / 250W |    885MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## GPT2 fine tuning

In [1]:
# Install dependencies
!pip install transformers pandas numpy torch tensorboard -qq

In [None]:
!pip install pytorch_lightning

In [132]:
# imports 
import random, os, json

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import textwrap, logging, argparse

import torch
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
pl.__version__

from transformers import (
    AdamW, T5ForConditionalGeneration, T5Tokenizer, get_linear_schedule_with_warmup
)

### Set up Transformer neural network

In [133]:
MODEL_NAME = "t5-small"
args_dict = dict(
    data_dir="", # path for data files
    output_dir="", # path to save the checkpoints
    model_name_or_path=MODEL_NAME,
    tokenizer_name_or_path=MODEL_NAME,
    max_seq_length=512,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=32,
    eval_batch_size=32,
    num_train_epochs=2,
    gradient_accumulation_steps=32,
    n_gpu=1,
    early_stop_callback=False,
    fp_16=False, # if you want to enable 16-bit training then install apex and set this to true
    opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm=1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)


In [134]:
class ParaphraseDataset(Dataset):
    def __init__(self, tokenizer, data_dir, type_path, max_len=512):
        self.path = os.path.join("./", data_dir, type_path + '.csv')

        self.source_column = "source"
        self.target_column = "target"
        self.data = pd.read_csv(self.path)

        self.max_len = max_len
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []

        self._build()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        source_ids = self.inputs[index]["input_ids"].squeeze()
        target_ids = self.targets[index]["input_ids"].squeeze()

        src_mask = self.inputs[index]["attention_mask"].squeeze()  # might need to squeeze
        target_mask = self.targets[index]["attention_mask"].squeeze()  # might need to squeeze

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}

    def _build(self):
        for idx in range(len(self.data)):
            target, source= self.data.loc[idx, self.target_column], self.data.loc[idx, self.source_column]

            input_ = "Phrase: %s </s>" % (source)
            target = "Target: %s </s>" %(target)

            # tokenize inputs
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_], max_length=self.max_len, truncation=True, pad_to_max_length=True, return_tensors="pt"
            )
            # tokenize targets
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target], max_length=self.max_len, truncation=True, pad_to_max_length=True, return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

In [135]:
class LanguageModelDataset(ParaphraseDataset):
    def _build(self):
        for idx in range(len(self.data)):
            target, source= self.data.loc[idx, self.target_column], self.data.loc[idx, self.source_column]

            input_ = "%s </s>" % (source)
            target = "%s </s>" %(target)

            # tokenize inputs
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_], max_length=self.max_len, truncation=True, pad_to_max_length=True, return_tensors="pt"
            )
            # tokenize targets
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target], max_length=self.max_len, truncation=True, pad_to_max_length=True, return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

In [136]:
def clean_prediction(text):
    token = '<|endoftext|>'
    text = text.replace(token, '')
    text = text.strip()
    if text[-1] == '"' and text.count('"') % 2: text = text[:-1]
    return text.strip()

def get_language_model_dataset(tokenizer, type_path, args):
    return LanguageModelDataset(
        tokenizer=tokenizer, 
        data_dir=args.data_dir, 
        type_path=type_path,  
        max_len=args.max_seq_length
    )

def get_paraphrase_dataset(tokenizer, type_path, args):
    return ParaphraseDataset(
        tokenizer=tokenizer, 
        data_dir=args.data_dir, 
        type_path=type_path,  
        max_len=args.max_seq_length
    )

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

In [137]:
class T5ParaphraserFineTuner(pl.LightningModule):
    def __init__(self, hparams):
        super(T5ParaphraserFineTuner, self).__init__()
        self.hparams = hparams

        self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)

    def is_logger(self):
        return True

    def forward(
            self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None
    ):
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            lm_labels=lm_labels,
        )

    def _step(self, batch):
        lm_labels = batch["target_ids"]
        lm_labels[lm_labels[:, :] == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            lm_labels=lm_labels,
            decoder_attention_mask=batch['target_mask']
        )

        loss = outputs[0]

        return loss

    def training_step(self, batch, batch_idx):
        loss = self._step(batch)

        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}

    def training_epoch_end(self, outputs):
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
        tensorboard_logs = {"avg_train_loss": avg_train_loss}
        return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        loss = self._step(batch)
        return {"val_loss": loss}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        tensorboard_logs = {"val_loss": avg_loss}
        return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"

        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
        if self.trainer.use_tpu:
            xm.optimizer_step(optimizer)
        else:
            optimizer.step()
        optimizer.zero_grad()
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict

    def train_dataloader(self):
        train_dataset = get_paraphrase_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
        dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True,
                                num_workers=4)
        t_total = (
                (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
                // self.hparams.gradient_accumulation_steps
                * float(self.hparams.num_train_epochs)
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        val_dataset = get_paraphrase_dataset(tokenizer=self.tokenizer, type_path="valid", args=self.hparams)
        return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

In [138]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
        def on_validation_end(self, trainer, pl_module):
            logger.info("***** Validation results *****")
            if pl_module.is_logger():
                  metrics = trainer.callback_metrics
                  # Log results
                  for key in sorted(metrics):
                    if key not in ["log", "progress_bar"]:
                      logger.info("{} = {}\n".format(key, str(metrics[key])))

        def on_test_end(self, trainer, pl_module):
            logger.info("***** Test results *****")

            if pl_module.is_logger():
                metrics = trainer.callback_metrics

                  # Log and save results to file
                output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
                with open(output_test_results_file, "w") as writer:
                    for key in sorted(metrics):
                          if key not in ["log", "progress_bar"]:
                            logger.info("{} = {}\n".format(key, str(metrics[key])))
                            writer.write("{} = {}\n".format(key, str(metrics[key])))

In [139]:
class T5LanguageModelerFineTuner(pl.LightningModule):
    def __init__(self, hparams):
        super(T5LanguageModelerFineTuner, self).__init__()
        self.hparams = hparams

        self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)

    def is_logger(self):
        return True

    def forward(
            self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None
    ):
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            lm_labels=lm_labels,
        )

    def _step(self, batch):
        lm_labels = batch["target_ids"]
        lm_labels[lm_labels[:, :] == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            lm_labels=lm_labels,
            decoder_attention_mask=batch['target_mask']
        )

        loss = outputs[0]

        return loss

    def training_step(self, batch, batch_idx):
        loss = self._step(batch)

        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}

    def training_epoch_end(self, outputs):
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
        tensorboard_logs = {"avg_train_loss": avg_train_loss}
        return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        loss = self._step(batch)
        return {"val_loss": loss}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        tensorboard_logs = {"val_loss": avg_loss}
        return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"

        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
        if self.trainer.use_tpu:
            xm.optimizer_step(optimizer)
        else:
            optimizer.step()
        optimizer.zero_grad()
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict
        
    def train_dataloader(self):
        train_dataset = get_language_model_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
        dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True,
                                num_workers=4)
        t_total = (
                (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
                // self.hparams.gradient_accumulation_steps
                * float(self.hparams.num_train_epochs)
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        val_dataset = get_language_model_dataset(tokenizer=self.tokenizer, type_path="valid", args=self.hparams)
        return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

## Load Data

### Loading Quora Data 

In [140]:
DATA_PATH = "."

if not os.path.exists("q_quora.csv"):
  from google.colab import drive
  drive.mount('/content/drive')

  DATA_PATH = "./drive/My Drive/paraphrase"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [141]:
quora_data = pd.read_csv(f"{DATA_PATH}/q_quora.csv", dtype=str)
quora_data = quora_data.loc[quora_data['is_duplicate']=='1']
quora_data = quora_data.drop([
    'id','qid1', 'qid2','is_duplicate','Unnamed: 6', 'Unnamed: 7', 
    'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12'
], axis=1)

quora_data = quora_data.reset_index(drop=True)
quora_data.columns= ['source', 'target']

quora_data = quora_data.sample(frac=1).reset_index(drop=True)
quora_data.head()

Unnamed: 0,source,target
0,Self employment tax?,What is self employment tax?
1,What are some good ways to improve English voc...,What is the easiest way to improve my vocabulary?
2,What is your motivation in your daily life?,What motivates you in your daily life?
3,Which website shows how much internet companie...,Which website shows how much internet companie...
4,How do current autonomous vehicles work?,How do autonomous car work?


### Loading MBTI data

In [188]:
mbti_data = pd.read_csv(f"{DATA_PATH}/mbti_1.csv")

print("All personality types")
print("========================")
print(pd.unique(mbti_data["type"]))

personality_type = "INTJ"
mbti_data = mbti_data[mbti_data["type"] == personality_type]

print("\n=> Currently using", personality_type)

del mbti_data["type"]
mbti_data["source"] = mbti_data["posts"]
mbti_data.columns= ['source', 'target']
mbti_data.head()

All personality types
['INFJ' 'ENTP' 'INTP' 'INTJ' 'ENTJ' 'ENFJ' 'INFP' 'ENFP' 'ISFP' 'ISTP'
 'ISFJ' 'ISTJ' 'ESTP' 'ESFP' 'ESTJ' 'ESFJ']

=> Currently using INTJ


Unnamed: 0,source,target
3,"'Dear INTP, I enjoyed our conversation the o...","'Dear INTP, I enjoyed our conversation the o..."
5,'18/37 @.@|||Science is not perfect. No scien...,'18/37 @.@|||Science is not perfect. No scien...
7,'I tend to build up a collection of things on ...,'I tend to build up a collection of things on ...
13,"'Fair enough, if that's how you want to look a...","'Fair enough, if that's how you want to look a..."
36,"'Poker face for sure, accompanied by some sarc...","'Poker face for sure, accompanied by some sarc..."


### Prep Data

In [143]:
mbti_data.shape

(8675, 2)

In [144]:
!mkdir language_model

mkdir: cannot create directory ‘language_model’: File exists


In [145]:
mbti_data[:8_000].to_csv('./language_model/train.csv', index=False)
mbti_data[8_000:].to_csv('./language_model/valid.csv', index= False)

In [146]:
quora_data.shape

(149267, 2)

In [147]:
!mkdir paraphrase_model

mkdir: cannot create directory ‘paraphrase_model’: File exists


In [148]:
quora_data[:100_001].to_csv('./paraphrase_model/train.csv', index=False)
quora_data[100_001:].to_csv('./paraphrase_model/valid.csv', index= False)

### Set up transformer

In [167]:
args_dict.update({
    'data_dir': './language_model/', 
    'output_dir': './language_model/result', 
    'num_train_epochs':2,
    'max_seq_length':256,
})

args = argparse.Namespace(**args_dict)
print(json.dumps(args_dict, indent=2))

{
  "data_dir": "./language_model/",
  "output_dir": "./language_model/result",
  "model_name_or_path": "t5-small",
  "tokenizer_name_or_path": "t5-small",
  "max_seq_length": 256,
  "learning_rate": 0.0003,
  "weight_decay": 0.0,
  "adam_epsilon": 1e-08,
  "warmup_steps": 0,
  "train_batch_size": 32,
  "eval_batch_size": 32,
  "num_train_epochs": 2,
  "gradient_accumulation_steps": 32,
  "n_gpu": 1,
  "early_stop_callback": false,
  "fp_16": false,
  "opt_level": "O1",
  "max_grad_norm": 1.0,
  "seed": 42,
  "gpus": 1
}


In [168]:
!mkdir language_model/result

mkdir: cannot create directory ‘language_model/result’: File exists


### Set up Transformer Model

In [170]:
language_model = T5LanguageModelerFineTuner(args)

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at t5-small and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Language Modelling Training

### Training neural network

#### Initialize Trainer

In [171]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    period =1,filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=1
)

train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    early_stop_callback=False,
    precision= 16 if args.fp_16 else 32,
    amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    checkpoint_callback=checkpoint_callback,
    callbacks=[LoggingCallback()],
)

language_model_trainer = pl.Trainer(**train_params)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]


In [172]:
print (" Training Language model")
language_model_trainer.fit(language_model)

print ("training finished")

print ("Saving model")
language_model.model.save_pretrained("/language_model/result")

print ("Saved model")

 Training Language model
training finished
Saving model
Saved model


### Evaluation

In [173]:
language_model_validation_dataset = LanguageModelDataset(language_model.tokenizer, 'language_model', 'valid')
loader = DataLoader(language_model_validation_dataset, batch_size=32, shuffle=True)

print("Language Model Val dataset: ", len(language_model_validation_dataset))

Language Model Val dataset:  675


In [174]:
it = iter(loader)

batch = next(it)
batch["source_ids"].shape

torch.Size([32, 512])

In [176]:
outs = language_model.model.generate(
    input_ids=batch['source_ids'], 
    attention_mask=batch['source_mask'], 
    max_length=2
)

dec = [language_model.tokenizer.decode(ids) for ids in outs]

texts = [language_model.tokenizer.decode(ids) for ids in batch['source_ids']]
targets = [language_model.tokenizer.decode(ids) for ids in batch['target_ids']]

In [177]:
for i in range(32):
    lines = textwrap.wrap("Source Statement:\n%s\n" % texts[i], width=100)
    print("\n".join(lines))
    print("\nTarget Statement: %s" % targets[i])
    print("Predicted Statement: %s" % dec[i])
    print("=====================================================================\n")
    break

Source Statement: 'It really depends. For the most part, I am uncomfortable with sharing emotions
(to the point many others often comment that my face is unreadable), but I do think that is
essential to communicate...|||At this point in time? I most definitely don't want to have children,
and I look on at children in disdain (screaming fits and grubby fingers are not stuff I'd enjoy).
However, I'm not crossing it...|||Haha, it's great that there is a couple of people willing to
listen to you! The only people I've only gone in deep MBTI conversations with are ENFP, ISTJ, and
another INTJ. Others I've met don't care...|||Nowadays, my dislike of people has been rising
exponentially as social conflicts pile up over the years. My ISTJ mother and I have been getting
into a lot more fights (she seems to only care for...|||I definitely do that all the time! I retype
my friends all the time (i.e my ISFJ friend got ISFP at first before finally agreeing with me that
she's an ISFJ, and I'm convinc

### Set up Transformer Model For Paraphrasing

In [178]:
args_dict.update({
    "model_name_or_path": "./language_model/result",
    "tokenizer_name_or_path": "./language_model/result",
    'data_dir': './paraphrase_model/', 
    'output_dir': './paraphrase_model/result', 
    'num_train_epochs':2,
    'max_seq_length':256
})

args = argparse.Namespace(**args_dict)
print(json.dumps(args_dict, indent=2))

{
  "data_dir": "./paraphrase_model/",
  "output_dir": "./paraphrase_model/result",
  "model_name_or_path": "t5-small",
  "tokenizer_name_or_path": "t5-small",
  "max_seq_length": 256,
  "learning_rate": 0.0003,
  "weight_decay": 0.0,
  "adam_epsilon": 1e-08,
  "warmup_steps": 0,
  "train_batch_size": 32,
  "eval_batch_size": 32,
  "num_train_epochs": 2,
  "gradient_accumulation_steps": 32,
  "n_gpu": 1,
  "early_stop_callback": false,
  "fp_16": false,
  "opt_level": "O1",
  "max_grad_norm": 1.0,
  "seed": 42,
  "gpus": 1
}


In [119]:
!mkdir paraphrase_model/result

mkdir: cannot create directory ‘paraphrase_model/result’: File exists


In [179]:
paraphrase_model = T5ParaphraserFineTuner(args)

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at t5-small and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Paraphrasing Training

#### Initialize Trainer

In [180]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    period =1,filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=1
)

train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    early_stop_callback=False,
    precision= 16 if args.fp_16 else 32,
    amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    checkpoint_callback=checkpoint_callback,
    callbacks=[LoggingCallback()],
)

trainer = pl.Trainer(**train_params)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]


### Training neural network

In [181]:
print (" Training Paraphrasing model")
trainer.fit(paraphrase_model)

print ("training finished")

print ("Saving model")
paraphrase_model.model.save_pretrained("/paraphrase_model/result")

print ("Saved model")

 Training Paraphrasing model
training finished
Saving model
Saved model


### Evaluation

In [182]:
paraphrase_validation_dataset = LanguageModelDataset(paraphrase_model.tokenizer, 'paraphrase_model', 'valid')
loader = DataLoader(paraphrase_validation_dataset, batch_size=32, shuffle=True)

print("Paraphrase Val dataset: ", len(paraphrase_validation_dataset))

Paraphrase Val dataset:  49266


In [183]:
it = iter(loader)

batch = next(it)
batch["source_ids"].shape

torch.Size([32, 512])

In [184]:
outs = paraphrase_model.model.generate(input_ids=batch['source_ids'], 
                              attention_mask=batch['source_mask'], 
                              max_length=2)

dec = [paraphrase_model.tokenizer.decode(ids) for ids in outs]

texts = [paraphrase_model.tokenizer.decode(ids) for ids in batch['source_ids']]
targets = [paraphrase_model.tokenizer.decode(ids) for ids in batch['target_ids']]

In [185]:
for i in range(32):
    lines = textwrap.wrap("Source Statement:\n%s\n" % texts[i], width=100)
    print("\n".join(lines))
    print("\nTarget Statement: %s" % targets[i])
    print("Predicted statement: %s" % dec[i])
    print("=====================================================================\n")

Source Statement: How much does it cost to trademark a slogan in the US?

Target Statement: How much does it cost to register a trademark in the U.S.?
Predicted statement: Wie

Source Statement: How do you to manage time effectively?

Target Statement: What should I do to manage my time?
Predicted statement: Wie

Source Statement: What is something really weird about you?

Target Statement: I have webbed toes, what's weird about you?
Predicted statement: Was

Source Statement: What is the rationale behind introducing 2000 rupee notes?

Target Statement: Is required 2000 Rs notes?
Predicted statement: Wie

Source Statement: What are your views on Cyrus Mistry being removed as Chairperson of Tata Sons?

Target Statement: What are the prospect reasons of Cyrus Mistry being fired by TATA sons?
Predicted statement: Was

Source Statement: How can I earn money in YouTube?

Target Statement: How can I earn money using YouTube?
Predicted statement: Wie

Source Statement: Which reference books a