In [106]:
# Structured DVP

In [36]:
# *********** IMP ************

# Remove breaks frm training and validation loops
# Change limit_examples argument value while creating train_dataset & val dataset and set it to None
# While calculating the average train_loss; remove +1 from global_step (in denominator) while deletion
# While saving the last model state, from that cell, remove global_step = 3
# In training arguments; set model_name to gpt2-large
# Change name of directories accordingly

In [1]:
# Desired Imports
import torch
import tqdm
from tqdm import trange
from transformers import (AdamW, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, get_linear_schedule_with_warmup)
from torch.utils.data import Dataset
import pickle
import numpy as np
from collections import defaultdict
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import os
import shutil
import subprocess
import json
import torch.nn.utils as F
from transformers import WEIGHTS_NAME
import glob

In [2]:
# Directories needed
paranmt_dataset_dir = "/content/drive/MyDrive/IRE_Project/style_transfer_paraphrase/datasets/paranmt_filtered"
paraphrase_model_chkpts_dir =  "/content/drive/MyDrive/IRE/DVP"
final_paraphrase_model_dir = "/content/drive/MyDrive/IRE/DVP/final_DVP"

In [3]:
# Required Arguments
args_dir = {
  "save_steps" : 20, # can be changed
  "num_epochs" : 3,
  "gradient_accumulation_steps" : 2,
  "adam_epsilon" : 1e-8,
  "warmup_steps" : 0,
  "learning_rt" : 5e-5,
  "max_grad_norm" : 1.0,
  "data_dir" : paranmt_dataset_dir,
  "model_type" : "gpt2",
  "model_name" : "gpt2",  # set to gtp2-large
  "train_batch_size" : 5,
  "eval_batch_size" : 5,
  "extra_embedding_dim" : 768,
  "global_dense_feature_list" : None # in file it will be saved with the value null; while reading take care of this thing
}

model_type = args_dir["model_type"]
model_name = args_dir["model_name"]
data_dir = args_dir["data_dir"]
save_steps = args_dir["save_steps"]
num_epochs = args_dir["num_epochs"]
gradient_accumulation_steps = args_dir["gradient_accumulation_steps"]
adam_epsilon = args_dir["adam_epsilon"]
warmup_steps = args_dir["warmup_steps"]
train_batch_size = args_dir["train_batch_size"]
eval_batch_size = args_dir["eval_batch_size"]
learning_rt = args_dir["learning_rt"]
extra_embedding_dim = args_dir["extra_embedding_dim"] # Size of linear layer used for projecting extra embeddings.
global_dense_feature_list = args_dir["global_dense_feature_list"]
max_grad_norm = args_dir["max_grad_norm"]

In [4]:
# Choose device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpus = torch.cuda.device_count()

print("Device- ", device)
print("No. of GPUs- ", n_gpus)

Device-  cuda
No. of GPUs-  1


In [5]:
# Install Transformers
!pip install transformers



In [6]:
# Initialize model classes variables
MODEL_CLASSES = {
    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
}
config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]

print("GPT2 Config class- ", config_class)
print("GPT2 Model class- ", model_class)
print("GPT2 Tokenizer class- ", tokenizer_class)

GPT2 Config class-  <class 'transformers.models.gpt2.configuration_gpt2.GPT2Config'>
GPT2 Model class-  <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>
GPT2 Tokenizer class-  <class 'transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer'>


In [7]:
# Init config
config = config_class.from_pretrained(model_name)
print("GPT2Config loaded")

# Init model
model = model_class.from_pretrained(model_name, config = config)
print("GPT2LMHeadModel loaded")

# Init tokenizer
tokenizer = tokenizer_class.from_pretrained(model_name,do_lower_case = False)
print("GPT2Tokenizer loaded")

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

GPT2Config loaded


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel loaded


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

GPT2Tokenizer loaded


In [8]:
# adding extra_dimension to config --- No need in DVP ---- Can be removed later
config.extra_embedding_dim = extra_embedding_dim # don't know why we are using it; -- explore it

In [9]:
# Add special tokens to tokenizer
SPECIAL_TOKENS = {
    "additional_special_tokens": ["<segment_1>", "<segment_2>"],
    "pad_token": "<pad>",
    "bos_token": "<bos>",
    "eos_token": "<eos>"
}
tokenizer.add_special_tokens(SPECIAL_TOKENS)
print("Special Tokens addded to tokenizer")

print("Total tokens- ", len(tokenizer))

Special Tokens addded to tokenizer
Total tokens-  50262


In [10]:
# resize token embedding matrix to take care of special tokens added
model.resize_token_embeddings(len(tokenizer)) # each token of size-> 1280(gpt2-large), 768(gpt2)

Embedding(50262, 768)

In [11]:
# move model to device
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50262, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50262, bias=False)
)

In [12]:
# Init configs

MAX_PARAPHRASE_LEN = 100

# mainly to handle input
INPUT_FORMAT_CONFIG = {
    "keys": [
        {"key": "sent1_tokens", "position": 3},
        {"key": "sent2_tokens", "position": 4}
    ],
    "max_prefix_length": int(MAX_PARAPHRASE_LEN / 2),
    "max_suffix_length": int(MAX_PARAPHRASE_LEN / 2)
}

In [13]:
# Fn to convert example input to dictionary
def input_to_dict(config, sample, tokenizer):
    example = {}

    for inp_key in config["keys"]:
        val = sample[inp_key["position"]]
        example[inp_key["key"]] = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(val))

    return example

In [14]:
# Preprocess input from paranmt
def preprocess(exp, tokenizer, config, do_tokenize=True):
  MASK_TOKEN_ID = -100

  max_prefix_len = config["max_prefix_length"]
  max_suffix_len = config["max_suffix_length"]

  if do_tokenize:
    sent1 = np.array(exp["sent1_tokens"])
    sent2 = np.array(exp["sent2_tokens"])

  # truncate
  if(len(sent1) > max_prefix_len):
    sent1 = sent1[:max_prefix_len]

  if(len(sent2) > max_suffix_len):
    sent2 = sent2[:max_suffix_len]

  # add padding; left padding to prefix and right padding to suffix
  count_pad_tokens_prefix = max_prefix_len - len(sent1)
  sent1 = np.pad(sent1, (count_pad_tokens_prefix, 0), constant_values = tokenizer.pad_token_id)

  # add <eos> to suffix
  sent2 = np.append(sent2, tokenizer.eos_token_id)

  count_pad_tokens_suffix = (max_suffix_len + 1) - len(sent2)
  sent2 = np.pad(sent2, (0, count_pad_tokens_suffix), constant_values = tokenizer.pad_token_id)

  # sentence to input gpt2
  sentence_to_input_gpt2 = np.concatenate([sent1, [tokenizer.bos_token_id], sent2]).astype(np.int64) # [sent1, <bos> sent2]

  # label/gt to predict; -100 used for masking that input (in ground truth only)
  gt = np.concatenate([
      [MASK_TOKEN_ID for _ in sent1],
      [MASK_TOKEN_ID],
      [val if val != tokenizer.pad_token_id else MASK_TOKEN_ID for val in sent2]
  ]).astype(np.int64)

  # segment
  segment = np.concatenate([
      [tokenizer.additional_special_tokens_ids[0] for _ in sent1],
      [tokenizer.additional_special_tokens_ids[1]],
      [tokenizer.additional_special_tokens_ids[1] for _ in sent2]
  ]).astype(np.int64)

  exp["prefix_sent"] = sent1
  exp["suffix_sent"] = sent2

  exp["input"] = sentence_to_input_gpt2
  exp["label"] = gt
  exp["segment"] = segment

  return exp

In [15]:
# DVP dataset
class Diverse_Paraphraser_Dataset(Dataset):
    def __init__(self, paranmt_dataset_dir, config, tokenizer, limit_examples = None, evaluate = False, split_type = "train"):
      self.config = config
      self.examples = []
      file_to_read = paranmt_dataset_dir + "/" +split_type + ".pickle"

      with open(file_to_read, "rb") as data_to_read:
        split_data = pickle.load(data_to_read)

      print("\n\n Original raw data in loaded pickle file- ", split_data[0])

      print("\n\n Converting samples to dictionary form...")
      self.examples = [input_to_dict(self.config, sample, tokenizer) for sample in tqdm.tqdm(split_data)]

      print("\n\n After conversion- ", self.examples[0])

      # Reduce dataset if required
      if limit_examples != None:
        self.examples = self.examples[:limit_examples]

      print("\n\n Doing Preprocess each sample")
      # do Preprocessing in each of the converted samples
      self.examples = [preprocess(exp, tokenizer, self.config, do_tokenize = True) for exp in self.examples]

      print("\n\n After preprocessing- ", self.examples[0])

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
      sentence = self.examples[idx]["input"]
      label = self.examples[idx]["label"]
      segment = self.examples[idx]["segment"]
      context_len = self.config["max_prefix_length"] + 1 # (+1) for <bos>

      return {
          "sample_number": idx,
          "sentence": torch.tensor(sentence),
          "label": torch.tensor(label),
          "segment": torch.tensor(segment)
      }

In [16]:
# create DVP dataset
train_dataset = Diverse_Paraphraser_Dataset(paranmt_dataset_dir,
                                            INPUT_FORMAT_CONFIG,
                                            tokenizer,
                                            limit_examples = 10,
                                            evaluate = False, split_type = "train")
print("\n\n DVP Dataset created")



 Original raw data in loaded pickle file-  ('S > NP VP .', 'S > SBAR , NP VP .', False, 'Mr. Whetstone is goingto speak to you after I finish.', "after I'm done, Mr. Whetstone will be speaking.", (0.3, 0.375, 0.33333333333333326), (-0.33333333333333337, False), 2, ('en', 'en'))


 Converting samples to dictionary form...


100%|██████████| 73062/73062 [00:43<00:00, 1676.77it/s]




 After conversion-  {'sent1_tokens': [5246, 13, 370, 3202, 6440, 318, 1016, 1462, 2740, 284, 345, 706, 314, 5461, 13], 'sent2_tokens': [8499, 314, 1101, 1760, 11, 1770, 13, 370, 3202, 6440, 481, 307, 5486, 13]}


 Doing Preprocess each sample


 After preprocessing-  {'sent1_tokens': [5246, 13, 370, 3202, 6440, 318, 1016, 1462, 2740, 284, 345, 706, 314, 5461, 13], 'sent2_tokens': [8499, 314, 1101, 1760, 11, 1770, 13, 370, 3202, 6440, 481, 307, 5486, 13], 'prefix_sent': array([50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
       50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
       50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
       50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,  5246,
          13,   370,  3202,  6440,   318,  1016,  1462,  2740,   284,
         345,   706,   314,  5461,    13]), 'suffix_sent': array([ 8499,   314,  1101,  1760,    11,  1770,    13,   370,  3202,
        6440,   481,   307,  5486,    13, 502

In [17]:
# create dataloader
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler = train_sampler, batch_size = train_batch_size)

print("DVP train dataloader created")

DVP train dataloader created


In [18]:
# Total steps needed
t_total = len(train_dataloader) // gradient_accumulation_steps * num_epochs

# setting up the optimizer & learning rate schedulers
no_decay = ['bias', 'LayerNorm.weight', 'layer_norm.weight']
grouped_parameters = [
    {
        'params': [p for n, p in model.named_parameters()],
        'weight_decay': 0.0
    }
]

optimizer = AdamW(grouped_parameters, lr = float(learning_rt), eps = adam_epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = warmup_steps, num_training_steps = t_total)

print("Adam Optimizer and learning rate scheduler instantiated")



Adam Optimizer and learning rate scheduler instantiated


In [19]:
# Training meta Information
print("Num of examples- ", len(train_dataset))
print("Num of epochs- ", num_epochs)
print("Batch size- ", train_batch_size)
print("Gradient acculmulation steps- ", gradient_accumulation_steps)
print("Total optimization steps- ", t_total)

Num of examples-  10
Num of epochs-  3
Batch size-  5
Gradient acculmulation steps-  2
Total optimization steps-  3


In [20]:
# zero out all the gradients
model.zero_grad()

In [21]:
# Fn to save checkpoints
def save_model(model, tokenizer, chkpt_dir, args_dir):
  if not os.path.exists(chkpt_dir):
    os.makedirs(chkpt_dir)
  print("Directory created for new checkpt to save")

  model.save_pretrained(chkpt_dir)
  tokenizer.save_pretrained(chkpt_dir)
  print("Model and tokenizer saved")

  # save training arguments also
  with open(chkpt_dir + "/my_args.json", "w") as json_file:
    json.dump(args_dir, json_file)
  print("Training arguments saved")

  with open(os.path.join(chkpt_dir, "global_step.txt"), "w") as f:
    f.write(str(global_step) + "\n")
  print("Global step file saved")

  print("Checkpint saving process done..")

In [22]:
from tqdm import tqdm

global_step = 0
train_loss_val = 0.0
chkpts_dir_name = []

# start training
train_iterator = trange(int(num_epochs), desc = "Epoch")
for epoch in train_iterator:
    epoch_iterator = tqdm(train_dataloader, desc = "Iteration")

    for batch_idx, batch in enumerate(epoch_iterator):
      sentences = batch["sentence"].to(device)
      labels = batch["label"].to(device)
      segments = batch["segment"].to(device)
      model.train()

      outputs = model(input_ids=sentences, token_type_ids=segments, labels=labels)
      print("Got logits and loss")

      loss = outputs.loss
      loss = loss / gradient_accumulation_steps
      train_loss_val += loss.item()

      loss.backward()

      if (((batch_idx + 1) % gradient_accumulation_steps) == 0):
        print("Moved 1 step")
        F.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()

        model.zero_grad()
        global_step += 1

        if (global_step % save_steps == 0):
          # save checkpoint here
          print("Saving new checkpoint")
          chkpt_dir = paraphrase_model_chkpts_dir + "/dvp_chkpt_"+str(global_step)
          chkpts_dir_name.append("dvp_chkpt_"+str(global_step))

          save_model(model, tokenizer, chkpt_dir, args_dir)
      break

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]
Iteration:   0%|          | 0/2 [00:03<?, ?it/s]
Epoch:  33%|███▎      | 1/3 [00:03<00:07,  3.78s/it]

Got logits and loss



Iteration:   0%|          | 0/2 [00:00<?, ?it/s]
Epoch:  67%|██████▋   | 2/3 [00:03<00:01,  1.66s/it]

Got logits and loss



Iteration:   0%|          | 0/2 [00:00<?, ?it/s][A

Got logits and loss


Iteration:   0%|          | 0/2 [00:00<?, ?it/s]
Epoch: 100%|██████████| 3/3 [00:04<00:00,  1.39s/it]


In [23]:
# Average train_loss per step
global_step, tr_loss = global_step, train_loss_val / (global_step + 1) # +1 only while testing
print("Final Global step- ", global_step)
print("Average training loss per step- ", tr_loss)

Final Global step-  0
Average training loss per step-  133.3716812133789


In [24]:
# save the last model also
global_step = 3  # remove this also; fetch the last global step value
chkpt_dir = paraphrase_model_chkpts_dir + "/dvp_chkpt_"+str(global_step)
chkpts_dir_name.append("dvp_chkpt_"+str(global_step))

save_model(model, tokenizer, chkpt_dir, args_dir)

print("Last model state saved")

Directory created for new checkpt to save
Model and tokenizer saved
Training arguments saved
Global step file saved
Checkpint saving process done..
Last model state saved


In [25]:
# Till now,
# DVP trained -> checkpoints saved -> Last model state saved
print("Checkpoints saved with the name- ", chkpts_dir_name)

Checkpoints saved with the name-  ['dvp_chkpt_3']


In [26]:
# Start Evaluation
print("Starting Evaluation of DVP on dev data based on perplexity")

Starting Evaluation of DVP on dev data based on perplexity


In [27]:
# Get validation dataset and dataloader
import tqdm
val_dataset = Diverse_Paraphraser_Dataset(paranmt_dataset_dir,
                                            INPUT_FORMAT_CONFIG,
                                            tokenizer,
                                            limit_examples = 10,
                                            evaluate = True, split_type = "dev")

val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(val_dataset, sampler = val_sampler, batch_size = eval_batch_size)

print("Validation dataset and dataloader created")



 Original raw data in loaded pickle file-  ('FRAG > CD , CD , CD , CD , CC VP .', 'FRAG > CD , CD , CD , CD CC VBP .', False, '1, 2, 3, 4, and get the hell out of there!', 'one, two, three, four and fall!', (0.1, 0.16666666666666666, 0.125), (-1.0, True), 2, ('en', 'en'))


 Converting samples to dictionary form...


100%|██████████| 1492/1492 [00:00<00:00, 4470.18it/s]



 After conversion-  {'sent1_tokens': [16, 11, 362, 11, 513, 11, 604, 11, 290, 651, 262, 5968, 503, 286, 612, 0], 'sent2_tokens': [505, 11, 734, 11, 1115, 11, 1440, 290, 2121, 0]}


 Doing Preprocess each sample


 After preprocessing-  {'sent1_tokens': [16, 11, 362, 11, 513, 11, 604, 11, 290, 651, 262, 5968, 503, 286, 612, 0], 'sent2_tokens': [505, 11, 734, 11, 1115, 11, 1440, 290, 2121, 0], 'prefix_sent': array([50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
       50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
       50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
       50259, 50259, 50259, 50259, 50259, 50259, 50259,    16,    11,
         362,    11,   513,    11,   604,    11,   290,   651,   262,
        5968,   503,   286,   612,     0]), 'suffix_sent': array([  505,    11,   734,    11,  1115,    11,  1440,   290,  2121,
           0, 50261, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
       50259, 50259, 50259, 50259, 5025




In [28]:
# Validation meta Information
print("Num of examples- ", len(val_dataset))
print("Batch size- ", eval_batch_size)

Num of examples-  10
Batch size-  5


In [29]:
# Fn to evaluate on a DVP checkpoint
def evaluate(model, tokenizer, chkpt_dir_name, val_dataloader):
  val_loss = 0.0
  model.eval()

  for i, batch in enumerate(val_dataloader):
    sentences = batch["sentence"].to(device)
    labels = batch["label"].to(device)
    segments = batch["segment"].to(device)

    with torch.no_grad():
      op = model(input_ids=sentences, token_type_ids=segments, labels=labels)
      loss_val = op.loss.item()

    val_loss += loss_val
    break

  avg_val_loss = val_loss / (i + 1) # per batch average loss
  perplexity = torch.exp(torch.tensor(avg_val_loss)) # perplexity of exp(avg_loss)

  return perplexity

In [30]:
chkpts_dir_name

['dvp_chkpt_3']

In [31]:
# Start evaluating the checkpoints on dev data and using perplexity as a measure
perplexity_list = []

for chkpt_name in chkpts_dir_name:
  # load chkpoint
  chkpt_to_load = paraphrase_model_chkpts_dir + "/" + chkpt_name
  model = model_class.from_pretrained(chkpt_to_load)
  tokenizer = tokenizer_class.from_pretrained(chkpt_to_load, do_lower_case = True)
  model.to(device)
  print("Checkpoint- " + chkpt_name + " loaded")

  # evaluate loaded
  print("Evaluating on loaded checkpoint")
  perplexity = evaluate(model, tokenizer, chkpt_dir, val_dataloader)
  perplexity_list.append((chkpt_name, perplexity))

print("DVP evaluated on all the saved checkpoints")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Checkpoint- dvp_chkpt_3 loaded
Evaluating on loaded checkpoint
DVP evaluated on all the saved checkpoints


In [32]:
# Sort perplexity list in increasing order to get best model
perplexity_list.sort(key=lambda x: x[1].item())
top_chkpt_name = perplexity_list[0][0]

print("Top performing checkpoint is- ", top_chkpt_name)

Top performing checkpoint is-  dvp_chkpt_3


In [147]:
# Evaluation on dev data done

In [33]:
# move top performing checkpoint to other final_paraphrase_model dir
copy_cmd = "cp {}/* {}".format(paraphrase_model_chkpts_dir + "/" + top_chkpt_name, final_paraphrase_model_dir)
copy_cmd

'cp /content/drive/MyDrive/IRE/DVP/dvp_chkpt_3/* /content/drive/MyDrive/IRE/DVP/final_DVP'

In [34]:
# do copy to some other location
subprocess.check_output(copy_cmd, shell=True)
print("Copied successfully..!!")

Copied successfully..!!


In [35]:
# just to verify that copied model is loading correctly or not
chkpt_to_load = final_paraphrase_model_dir
model = model_class.from_pretrained(chkpt_to_load)
tokenizer = tokenizer_class.from_pretrained(chkpt_to_load, do_lower_case = True)

print("Model loaded successfully..!!")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model loaded successfully..!!
