In [None]:
!nvidia-smi

### installing modules

In [None]:
# Install dependencies
!pip install transformers==2.9.0 -qq

In [None]:
!pip install pandas numpy torch tensorboard beautifulsoup4 -qq

In [None]:
!pip install pytorch_lightning --qq

In [None]:
# imports 
import random, os, json, re

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import textwrap, logging, argparse
from bs4 import BeautifulSoup

import torch
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
pl.__version__

from transformers import (
    AdamW, T5ForConditionalGeneration, T5Tokenizer, get_linear_schedule_with_warmup
)

### Set up Transformer neural network config args

In [None]:
# Default hyper-parameters shared by every run; later cells override entries
# with `args_dict.update(...)` before building an argparse.Namespace from it.
args_dict = {
    # --- locations -------------------------------------------------------
    "data_dir": "",  # path for data files
    "output_dir": "",  # path to save the checkpoints
    # --- model / tokenizer -----------------------------------------------
    "model_name_or_path": "t5-small",
    "tokenizer_name_or_path": "t5-small",
    "max_seq_length": 512,
    # --- optimisation ----------------------------------------------------
    "learning_rate": 3e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "train_batch_size": 32,
    "eval_batch_size": 32,
    "num_train_epochs": 2,
    "gradient_accumulation_steps": 32,
    # --- hardware / precision --------------------------------------------
    "n_gpu": 1,
    "early_stop_callback": False,
    # if you want to enable 16-bit training then install apex and set this to true
    "fp_16": False,
    # you can find out more on optimisation levels here
    # https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "opt_level": 'O1',
    # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    "max_grad_norm": 1.0,
    # --- reproducibility -------------------------------------------------
    "seed": 42,
}


In [None]:
def clean_text(text):
    """Reduce `text` to space-separated runs of ASCII letters and '-'.

    The input is parsed with BeautifulSoup's "lxml" parser and replaced by its
    `.text`; the regex substitutions below are then applied in order and the
    result is stripped of leading/trailing whitespace.
    """
    substitutions = (
        # URLs are first swapped for a placeholder ...
        (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', r'<URL>'),
        (r'http\S+', r'<URL>'),
        # ... then everything except ASCII letters and '-' becomes a space. This
        # also replaces the placeholder's angle brackets, and leaves nothing for
        # the following four patterns ('|||' and quote handling) to match.
        (r"[^a-zA-Z-]", r" "),
        (r'\|\|\|', r' '),
        (r'\'\'', r' '),
        (r"^'", r' '),
        (r"'$", r' '),
        # Collapse runs of spaces introduced above.
        (r' +', r' '),
    )

    cleaned = BeautifulSoup(text, "lxml").text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [None]:
class ParaphraseDataset(Dataset):
    """Source/target text pairs read from a CSV file and tokenized up front.

    The file `<data_dir>/<type_path>.csv` must contain a "source" and a
    "target" column. Every row is tokenized once in `_build`, and
    `__getitem__` returns the stored ids and attention masks.

    Args:
        tokenizer: object exposing `batch_encode_plus`; each encoding must
            provide "input_ids" and "attention_mask" tensors.
        data_dir: directory holding the CSV (joined onto "./").
        type_path: CSV file name without the ".csv" extension.
        max_len: length every sequence is padded/truncated to.
        use_prefix: when True, sources are prefixed with "original: " and
            targets with "paraphrase: ". Defaults to False, the behaviour
            that was previously hard-coded.
    """

    def __init__(self, tokenizer, data_dir, type_path, max_len=256, use_prefix=False):
        self.path = os.path.join("./", data_dir, type_path + '.csv')

        self.source_column = "source"
        self.target_column = "target"
        self.data = pd.read_csv(self.path)

        self.max_len = max_len
        self.tokenizer = tokenizer
        self.use_prefix = use_prefix
        self.inputs = []
        self.targets = []

        self._build()

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return ids and attention masks of example `index` as a dict of tensors."""
        # Each stored encoding comes from a one-element batch, so only the
        # leading dimension is dropped. A bare squeeze() would also drop the
        # sequence dimension when max_len == 1.
        # (assumes encodings are shaped (1, max_len) - TODO confirm for the tokenizer in use)
        source_ids = self.inputs[index]["input_ids"].squeeze(0)
        target_ids = self.targets[index]["input_ids"].squeeze(0)

        src_mask = self.inputs[index]["attention_mask"].squeeze(0)
        target_mask = self.targets[index]["attention_mask"].squeeze(0)

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}

    def _encode(self, text):
        """Tokenize one string as a one-element batch, padded/truncated to `max_len`."""
        return self.tokenizer.batch_encode_plus(
            [text], max_length=self.max_len, truncation=True, pad_to_max_length=True, return_tensors="pt"
        )

    def _build(self):
        """Tokenize every row of `self.data` into `self.inputs` / `self.targets`."""
        for idx in range(len(self.data)):
            source = self.data.loc[idx, self.source_column]
            target = self.data.loc[idx, self.target_column]

            # The end-of-sequence marker "</s>" is appended to the text explicitly.
            if self.use_prefix:
                input_ = "original: %s </s>" % (source)
                target = "paraphrase: %s </s>" % (target)
            else:
                input_ = "%s </s>" % (source)
                target = "%s </s>" % (target)

            self.inputs.append(self._encode(input_))
            self.targets.append(self._encode(target))

In [None]:
class LanguageModelDataset(ParaphraseDataset):
    """ParaphraseDataset variant that overrides only how examples are built.

    CSV loading, `__len__` and `__getitem__` are inherited. `_build` contains
    two strategies selected by the local flag `use_tokens`; the flag is
    hard-coded to False, so only the plain "<text> </s>" branch runs.
    """

    def _build(self):
        """Tokenize every row of `self.data` into `self.inputs` / `self.targets`."""
        for idx in range(len(self.data)):
            # Hard-coded switch: the span-masking branch below is currently dead code.
            use_tokens = False
            target_text, source_text= self.data.loc[idx, self.target_column], self.data.loc[idx, self.source_column]
            
            if use_tokens:
              # Span-masking variant: alternate runs of whitespace-separated words
              # between the input and the target, separated by <extra_id_N> markers.
              source = source_text.split()
              source_size = len(source)
              # Number of random span lengths to draw: a third of the words, capped at 100.
              size = int(source_size / 3)
              if size > 100:
                size = 100

              cursor = 0
              input_, target = "", ""
              # Span lengths drawn from {1, 2, 3} (`high` is exclusive).
              random_masks = np.random.randint(low=1, high=4, size=size)

              if source_size:
                target = "<extra_id_1>"

              # NOTE(review): `index` counts positions in the strided slice
              # random_masks[:-1:3], so random_masks[index+1] is not the element
              # that follows `rm` in random_masks - confirm this is intended.
              for index, rm in enumerate(random_masks[:-1:3]):
                # Stop before the next kept span plus masked span would run past the text.
                if cursor+random_masks[index+1]+rm >= source_size:
                  break
                # The next `rm` words stay in the input, followed by a marker ...
                input_ = " ".join([input_, " ".join(source[cursor:cursor+rm]), f" <extra_id_{index+1}> "])
                cursor += rm
                # ... and the following random_masks[index+1] words go to the target.
                target = " ".join([target, " ".join(source[cursor:cursor+random_masks[index+1]]), f" <extra_id_{index+2}> "])
                cursor += random_masks[index+1]

              # Only the target receives "</s>" in this branch; the equivalent
              # line for the input is commented out.
              # input_ = " ".join([input_, " </s>"]).strip()
              target = " ".join([target, "</s>"]).strip()

            else:
              # Plain variant: source and target text are used as-is with "</s>" appended.
              input_ = "%s </s>" % (source_text)
              target = "%s </s>" %(target_text)

            # tokenize inputs
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_], max_length=self.max_len, truncation=True, pad_to_max_length=True, return_tensors="pt"
            )
            # tokenize targets
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target], max_length=self.max_len, truncation=True, pad_to_max_length=True, return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

In [None]:
def clean_prediction(text):
    """Tidy a decoded model prediction.

    Removes every '<|endoftext|>' marker, strips surrounding whitespace and,
    when the text ends with a double quote while containing an odd number of
    them, drops that trailing quote.

    Args:
        text: decoded prediction string (may be empty).

    Returns:
        The cleaned string; '' when nothing but markers/whitespace was given.
    """
    token = '<|endoftext|>'
    text = text.replace(token, '').strip()
    # endswith() is safe on the empty string, whereas text[-1] raises IndexError.
    if text.endswith('"') and text.count('"') % 2:
        text = text[:-1]
    return text.strip()

def get_language_model_dataset(tokenizer, type_path, args):
    """Build the LanguageModelDataset for split `type_path`.

    The data directory and maximum sequence length are taken from
    `args.data_dir` and `args.max_seq_length`.
    """
    dataset_kwargs = dict(
        tokenizer=tokenizer,
        data_dir=args.data_dir,
        type_path=type_path,
        max_len=args.max_seq_length,
    )
    return LanguageModelDataset(**dataset_kwargs)

def get_paraphrase_dataset(tokenizer, type_path, args):
    """Build the ParaphraseDataset for split `type_path`.

    The data directory and maximum sequence length are taken from
    `args.data_dir` and `args.max_seq_length`.
    """
    dataset_kwargs = dict(
        tokenizer=tokenizer,
        data_dir=args.data_dir,
        type_path=type_path,
        max_len=args.max_seq_length,
    )
    return ParaphraseDataset(**dataset_kwargs)

def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (all CUDA devices too, if available)."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(42)

In [None]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
    """Lightning callback that logs `trainer.callback_metrics` after validation and test.

    The "log" and "progress_bar" entries are skipped. After testing, the same
    lines are also written to `<hparams.output_dir>/test_results.txt`.
    """

    def on_validation_end(self, trainer, pl_module):
        """Log each metric once validation has finished."""
        logger.info("***** Validation results *****")
        if not pl_module.is_logger():
            return

        metrics = trainer.callback_metrics
        for key in sorted(metrics):
            if key in ("log", "progress_bar"):
                continue
            logger.info("{} = {}\n".format(key, str(metrics[key])))

    def on_test_end(self, trainer, pl_module):
        """Log each metric once testing has finished and save them to test_results.txt."""
        logger.info("***** Test results *****")
        if not pl_module.is_logger():
            return

        metrics = trainer.callback_metrics
        results_path = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
        with open(results_path, "w") as writer:
            for key in sorted(metrics):
                if key in ("log", "progress_bar"):
                    continue
                line = "{} = {}\n".format(key, str(metrics[key]))
                logger.info(line)
                writer.write(line)

In [None]:
class T5FineTuner(pl.LightningModule):
    """Lightning module that fine-tunes a pretrained T5 model on source/target pairs.

    `hparams` is an attribute-style namespace. This class reads
    model_name_or_path, tokenizer_name_or_path, learning_rate, weight_decay,
    adam_epsilon, warmup_steps, train_batch_size, eval_batch_size,
    num_train_epochs, gradient_accumulation_steps and n_gpu from it, and
    hands it to `get_language_model_dataset` to build the datasets.
    """

    def __init__(self, hparams):
        super(T5FineTuner, self).__init__()
        self.hparams = hparams

        self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)

    def is_logger(self):
        """Tell LoggingCallback whether this module should log metrics (always True)."""
        return True

    def forward(
            self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None
    ):
        """Run the wrapped T5 model and return its outputs unchanged."""
        # Delegate to the wrapped model. Calling `self(...)` here would
        # re-enter this very method and recurse until RecursionError.
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            lm_labels=lm_labels,
        )

    def _step(self, batch):
        """Compute the loss for one batch produced by the datasets in this file."""
        # Work on a copy so the caller's batch keeps its original target ids.
        lm_labels = batch["target_ids"].clone()
        # Padding positions are replaced by -100.
        # (presumably the label value the model's loss ignores - confirm for the installed transformers version)
        lm_labels[lm_labels == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            lm_labels=lm_labels,
            decoder_attention_mask=batch['target_mask']
        )

        # The loss is the first element of the model outputs when labels are given.
        loss = outputs[0]
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss together with its tensorboard log entry."""
        loss = self._step(batch)

        tensorboard_logs = {"train_loss": loss}
        return {"loss": loss, "log": tensorboard_logs}

    def training_epoch_end(self, outputs):
        """Average the per-step training losses of one epoch."""
        avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
        tensorboard_logs = {"avg_train_loss": avg_train_loss}
        return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def validation_step(self, batch, batch_idx):
        """Return the validation loss of one batch."""
        loss = self._step(batch)
        return {"val_loss": loss}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation losses and expose the mean as "val_loss"."""
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        tensorboard_logs = {"val_loss": avg_loss}
        return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

    def configure_optimizers(self):
        """Prepare the AdamW optimizer; the LR schedule is created in `train_dataloader`."""

        model = self.model
        # Parameters whose name contains one of these substrings get no weight decay.
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        # Kept on the instance because `train_dataloader` attaches the scheduler to it.
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
        """Apply one optimizer update, clear the gradients and advance the LR schedule."""
        if self.trainer.use_tpu:
            # NOTE(review): `xm` is not imported anywhere in the visible file, so this
            # branch raises NameError on TPU - presumably torch_xla.core.xla_model; confirm.
            xm.optimizer_step(optimizer)
        else:
            optimizer.step()
        optimizer.zero_grad()
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
        """Values shown in the progress bar: running loss and current learning rate."""
        tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict

    def train_dataloader(self):
        """Build the training DataLoader and the linear LR schedule sized to it.

        Relies on `self.opt`, so `configure_optimizers` must have run first.
        """
        train_dataset = get_language_model_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
        dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True,
                                num_workers=4)
        t_total = (
                (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
                // self.hparams.gradient_accumulation_steps
                * float(self.hparams.num_train_epochs)
        )
        # When an epoch has fewer batches than gradient_accumulation_steps the
        # floor divisions above give 0, and a linear schedule over 0 training
        # steps scales the learning rate to 0 from the very first step. Keep at
        # least one step so the configured learning rate is actually applied.
        t_total = max(1.0, t_total)
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        """Build the validation DataLoader from the "valid" split."""
        val_dataset = get_language_model_dataset(tokenizer=self.tokenizer, type_path="valid", args=self.hparams)
        return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

## Load Data

### Loading Quora Data 

In [None]:
# Read the CSVs from the working directory when q_quora.csv is there;
# otherwise mount Google Drive (Colab) and read them from the Drive folder.
DATA_PATH = "."

quora_csv_is_local = os.path.exists("q_quora.csv")
if not quora_csv_is_local:
    # Imported here because google.colab is only needed on this path.
    from google.colab import drive

    drive.mount('/content/drive')
    DATA_PATH = "./drive/My Drive/paraphrase"

In [None]:
# Question pairs flagged as duplicates serve as (source, target) paraphrase pairs.
quora_raw = pd.read_csv(f"{DATA_PATH}/q_quora.csv", dtype=str)

quora_data = (
    quora_raw.loc[quora_raw['is_duplicate'] == '1']
    # errors="ignore": the 'Unnamed: N' columns are export residue and may not
    # be present in every copy of the file.
    .drop(columns=[
        'id', 'qid1', 'qid2', 'is_duplicate', 'Unnamed: 6', 'Unnamed: 7',
        'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12'
    ], errors="ignore")
    .reset_index(drop=True)
)
# Exactly two columns must remain at this point (the two question texts).
quora_data.columns = ['source', 'target']

quora_data = (
    quora_data
    # A missing question would otherwise be formatted into the training text as "nan".
    .dropna()
    # Explicit seed so re-running this cell reproduces the same shuffle.
    .sample(frac=1, random_state=args_dict["seed"])
    .reset_index(drop=True)
)
quora_data.head()

### Loading MBTI data

In [None]:
mbti_raw = pd.read_csv(f"{DATA_PATH}/mbti_1.csv")

print("All personality types")
print("========================")
print(pd.unique(mbti_raw["type"]))

personality_type = "INTJ"
print("\n=> Currently using", personality_type)

# Take an explicit copy of the filtered rows: deleting and assigning columns on
# a boolean-mask selection triggers pandas' SettingWithCopyWarning.
mbti_data = mbti_raw.loc[mbti_raw["type"] == personality_type].copy()

del mbti_data["type"]
mbti_data["posts"] = mbti_data["posts"].apply(clean_text)
# Source and target are the same cleaned text.
mbti_data["target"] = mbti_data["posts"]
mbti_data.columns = ['source', 'target']
mbti_data.head()

### Prep Data

In [None]:
mbti_data.shape

In [None]:
# exist_ok=True keeps this cell re-runnable; a plain `mkdir` fails once the directory exists.
os.makedirs("language_model", exist_ok=True)

In [None]:
# The first 800 rows become the training split, the remainder the validation split.
mbti_train_rows = mbti_data.iloc[:800]
mbti_valid_rows = mbti_data.iloc[800:]

mbti_train_rows.to_csv('./language_model/train.csv', index=False)
mbti_valid_rows.to_csv('./language_model/valid.csv', index=False)

In [None]:
quora_data.shape

In [None]:
# exist_ok=True keeps this cell re-runnable; a plain `mkdir` fails once the directory exists.
os.makedirs("paraphrase_model", exist_ok=True)

In [None]:
# The first 100,001 rows become the training split, the remainder the validation split.
quora_train_rows = quora_data.iloc[:100_001]
quora_valid_rows = quora_data.iloc[100_001:]

quora_train_rows.to_csv('./paraphrase_model/train.csv', index=False)
quora_valid_rows.to_csv('./paraphrase_model/valid.csv', index=False)

### Set up transformer

In [None]:
# Language-model run: switch to t5-base and point at the MBTI splits.
args_dict.update({
    'model_name_or_path':"t5-base",
    'tokenizer_name_or_path':"t5-base",
    'data_dir': './language_model/',
    'output_dir': './language_model/result',
    'num_train_epochs':2,
    'max_seq_length':180,
    # The training split written above holds at most 800 rows, i.e. 25 batches
    # of 32 per epoch - fewer than the default gradient_accumulation_steps=32.
    # T5FineTuner.train_dataloader would then compute (800 // 32) // 32 = 0
    # scheduler steps, and a linear schedule over 0 steps scales the learning
    # rate to 0. Disable accumulation for this small dataset.
    'gradient_accumulation_steps': 1,
})

args = argparse.Namespace(**args_dict)
print(json.dumps(args_dict, indent=2))

In [None]:
# exist_ok=True keeps this cell re-runnable; makedirs also creates the parent if needed.
os.makedirs("language_model/result", exist_ok=True)

### Set up Transformer Model

In [None]:
language_model = T5FineTuner(args)

### Language Modelling Training

### Training neural network

#### Initialize Trainer

In [None]:
# Keep the single best checkpoint, judged by the lowest "val_loss".
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    period=1,
    filepath=args.output_dir,
    prefix="checkpoint",
    monitor="val_loss",
    mode="min",
    save_top_k=1,
)

# Trainer options derived from the hyper-parameter namespace.
train_params = {
    "accumulate_grad_batches": args.gradient_accumulation_steps,
    "gpus": args.n_gpu,
    "max_epochs": args.num_train_epochs,
    "early_stop_callback": False,
    "precision": 16 if args.fp_16 else 32,
    "amp_level": args.opt_level,
    "gradient_clip_val": args.max_grad_norm,
    "checkpoint_callback": checkpoint_callback,
    "callbacks": [LoggingCallback()],
}

trainer = pl.Trainer(**train_params)

In [None]:
# Fine-tune on the language-model data, then save the wrapped T5 weights.
print(" Training Language model")
trainer.fit(language_model)
print("training finished")

print("Saving model")
language_model_save_dir = "./language_model/result"
language_model.model.save_pretrained(language_model_save_dir)
print("Saved model")

### Evaluation

In [None]:
# Validation split of the language-model data, tokenized with the model's own tokenizer.
language_model_validation_dataset = LanguageModelDataset(
    tokenizer=language_model.tokenizer,
    data_dir='language_model',
    type_path='valid',
)
loader = DataLoader(
    language_model_validation_dataset,
    batch_size=32,
    shuffle=True,
)

print("Language Model Val dataset: ", len(language_model_validation_dataset))

In [None]:
it = iter(loader)

batch = next(it)
batch["source_ids"].shape

In [None]:
# Put the inputs on whatever device the model weights live on, so this cell
# works both on CPU and after GPU training.
device = next(language_model.model.parameters()).device
# Inference mode: disables dropout so the generated text is deterministic.
language_model.model.eval()

outs = language_model.model.generate(
    input_ids=batch['source_ids'].to(device),
    attention_mask=batch['source_mask'].to(device),
    max_length=128
)

dec = [language_model.tokenizer.decode(ids) for ids in outs]

texts = [language_model.tokenizer.decode(ids) for ids in batch['source_ids']]
targets = [language_model.tokenizer.decode(ids) for ids in batch['target_ids']]

In [None]:
# Iterate over what was actually decoded rather than a fixed 32 rows, so a
# batch smaller than 32 does not raise IndexError.
for text, target, prediction in zip(texts, targets, dec):
    print("Source Statement: %s" % text)
    print("Target Statement: %s" % target)
    print("Predicted Statement: %s" % prediction)
    print("=====================================================================\n")

### Set up Transformer Model For Paraphrasing

In [None]:
# exist_ok=True keeps this cell re-runnable; makedirs also creates the parent if needed.
os.makedirs("paraphrase_model/result", exist_ok=True)

In [None]:
# paraphrase_model = T5ParaphraserFineTuner(args)
paraphrase_model = language_model

### Paraphrasing Training

#### Initialize Trainer

### Training neural network For paraphrasing

In [None]:
# The model's dataloaders read `hparams.data_dir`, which still points at
# ./language_model/ from the previous stage; redirect it (and the output
# directory) so this stage really trains on the Quora paraphrase splits.
paraphrase_model.hparams.data_dir = './paraphrase_model/'
paraphrase_model.hparams.output_dir = './paraphrase_model/result'

# Use a fresh Trainer and checkpoint callback: the earlier `trainer` has already
# completed its `max_epochs` on the language-model data and checkpoints into
# the language-model output directory.
paraphrase_checkpoint_callback = pl.callbacks.ModelCheckpoint(
    period=1, filepath=paraphrase_model.hparams.output_dir, prefix="checkpoint",
    monitor="val_loss", mode="min", save_top_k=1
)
paraphrase_trainer = pl.Trainer(
    **dict(train_params, checkpoint_callback=paraphrase_checkpoint_callback)
)

print (" Training Paraphrasing model")
paraphrase_trainer.fit(paraphrase_model)

print ("training finished")

print ("Saving model")
paraphrase_model.model.save_pretrained("./paraphrase_model/result")

print ("Saved model")

### Evaluation

In [None]:
# Validation split of the paraphrase data, tokenized with the model's own tokenizer.
paraphrase_validation_dataset = ParaphraseDataset(
    tokenizer=paraphrase_model.tokenizer,
    data_dir='paraphrase_model',
    type_path='valid',
)
loader = DataLoader(
    paraphrase_validation_dataset,
    batch_size=32,
    shuffle=True,
)

print("Paraphrase Val dataset: ", len(paraphrase_validation_dataset))

In [None]:
it = iter(loader)

batch = next(it)
batch["source_ids"].shape

In [None]:
# Follow the model's device instead of hard-coding .cuda(), so the cell also
# runs on a CPU-only machine.
device = next(paraphrase_model.model.parameters()).device
# Inference mode: disables dropout so the generated text is deterministic.
paraphrase_model.model.eval()

outs = paraphrase_model.model.generate(input_ids=batch['source_ids'].to(device),
                              attention_mask=batch['source_mask'].to(device),
                              max_length=128)

dec = [paraphrase_model.tokenizer.decode(ids) for ids in outs]

texts = [paraphrase_model.tokenizer.decode(ids) for ids in batch['source_ids']]
targets = [paraphrase_model.tokenizer.decode(ids) for ids in batch['target_ids']]

In [None]:
# Iterate over what was actually decoded rather than a fixed 32 rows, so a
# batch smaller than 32 does not raise IndexError.
for text, target, prediction in zip(texts, targets, dec):
    # Wrap the (possibly long) source text at 100 columns for readability.
    lines = textwrap.wrap("Source Statement:\n%s\n" % text, width=100)
    print("\n".join(lines))
    print("\nTarget Statement: %s" % target)
    print("Predicted statement: %s" % prediction)
    print("=====================================================================\n")