In [None]:
# Using DVP, paraphrase the sentences

In [33]:
# ------------------ IMP ----------------------
# Comment data shortening line while paraphrasing datafiles
# Change batch size if needed

In [8]:
# Install Transformers
!pip install transformers



In [1]:
# Desired imports
import torch
import tqdm
from tqdm import trange
from transformers import (WEIGHTS_NAME, AdamW, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, get_linear_schedule_with_warmup)
import json
import glob
import torch.nn.functional as F
import numpy as np

In [7]:
# Directories needed
final_paraphrase_model_dir = "/content/drive/MyDrive/IRE/DVP/final_DVP"
data_to_paraphrased_dirs = "/content/drive/MyDrive/IRE/DVP_PARAPHRASED"

In [6]:
# Choose device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpus = torch.cuda.device_count()

print("Device- ", device)
print("No. of GPUs- ", n_gpus)

Device-  cpu
No. of GPUs-  0


In [10]:
# Load model, tokenizer and arguments

# Initialize model classes variables
MODEL_CLASSES = {
    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
}
_, model_class, tokenizer_class = MODEL_CLASSES["gpt2"]

print("GPT2 Model class- ", model_class)
print("GPT2 Tokenizer class- ", tokenizer_class)

chkpt_to_load = final_paraphrase_model_dir
paraphraser = model_class.from_pretrained(chkpt_to_load)
tokenizer = tokenizer_class.from_pretrained(chkpt_to_load, do_lower_case = True)

paraphraser.to(device)

print("Diverse Paraphraser loaded along with the desired tokenizer")

GPT2 Model class-  <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>
GPT2 Tokenizer class-  <class 'transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer'>


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Diverse Paraphraser loaded along with the desired tokenizer


In [44]:
# Variables needed for paraphrasing
upper_length = "same_5" # change last value like _10, _15
temparature = 0
device = device
batch_size = 64

In [45]:
# Init configs

MAX_PARAPHRASE_LEN = 100

# mainly to handle input
INPUT_FORMAT_CONFIG = {
    "max_prefix_length": int(MAX_PARAPHRASE_LEN / 2),
    "max_suffix_length": int(MAX_PARAPHRASE_LEN / 2)
}

In [66]:
# Do Nucleas sampling and greedy decoding; vary top_p to consider more diversity
def top_k_top_p_filtering(logits, top_k = 0, top_p = 0.0, filter_amt = -float('Inf')):
  top_k = min(top_k, logits.size(-1))  # tok_k should be less than total vocab size

  if top_p > 0.0:
      sorted_logits, sorted_idxs = torch.sort(logits, descending=True)
      cumul_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

      # Remove tokens with cumulative probability above the threshold
      sorted_idxs_to_remove = cumul_probs > top_p

      # Shift the indices to the right to keep also the first token above the threshold
      sorted_idxs_to_remove[..., 1:] = sorted_idxs_to_remove[..., :-1].clone()
      sorted_idxs_to_remove[..., 0] = 0

      # scatter sorted tensors to original indexing
      indices_to_remove = sorted_idxs_to_remove.scatter(dim = 1, index = sorted_idxs, src = sorted_idxs_to_remove)
      logits[indices_to_remove] = filter_amt

  elif top_k > 0:
      # Remove all tokens with a probability less than the last token of the top_k
      indices_to_remove = logits < torch.topk(logits, int(top_k))[0][..., -1, None]
      logits[indices_to_remove] = filter_amt

  return logits

In [85]:
# Get output logits from paraphraser
def get_logits(paraphraser, idx, sents, segments, past):
  if idx == 0:
      pred = paraphraser(input_ids = sents, token_type_ids = segments, return_dict=True)
  else:
      # used the cached representations to speed up decoding
      print("Logits to calculate of shape- ", sents[:, -1:].shape)
      pred = paraphraser(input_ids = sents[:, -1:], token_type_ids = segments[:, -1:], past_key_values = past, return_dict = True)

  logits = pred['logits']
  past_keys = pred['past_key_values']

  return logits, past_keys

In [82]:
# Decide generation lenght and get converted output and score
def generate(paraphraser, sents_to_paraphrase, segments, eos_token_id, top_p, top_k, len_to_gen):
  batch_size = sents_to_paraphrase.shape[0] # total sents in batch
  print("batch size- ", batch_size)

  eos_emitted = [False for _ in range(batch_size)]
  scores = [{"score": 0, "sequence": []} for _ in range(batch_size)]
  print("sents to rephrase- ", sents_to_paraphrase)

  with torch.no_grad():
    past_keys = None

    for i in range(len_to_gen):
      op_logits, past_keys = get_logits(paraphraser, i, sents_to_paraphrase, segments, past_keys)
      print("op_logits shape- ", op_logits.shape)

      next_token_logits = op_logits[:, -1, :]
      print("next token logits of shape- ", next_token_logits.shape)
      original_scores = F.log_softmax(next_token_logits, dim = -1)

      # do nucleas filtering and greedy decoding
      filtered_logits = top_k_top_p_filtering(next_token_logits, top_k = top_k, top_p = top_p)
      print("filtered_logits of shape- ", filtered_logits.shape)

      if top_k in [0, 1] and top_p == 0.0: # mainly to control the output diversity
        # greedy sampling
        next_token = torch.argmax(filtered_logits, dim = -1).unsqueeze(-1)
      else :
        next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples = 1)

      print("next token- ", next_token, " and of shape- ", next_token.shape)

      for batch_elem in range(batch_size):
          if eos_emitted[batch_elem]:
              continue
          scores[batch_elem]["score"] += original_scores[batch_elem, next_token[batch_elem].item()].item()
          scores[batch_elem]["sequence"].append("token")

      sents_to_paraphrase = torch.cat((sents_to_paraphrase, next_token), dim=1)
      segments = torch.cat((segments, segments[:, -1:]), dim=1)

      print("concat sents to change- ", sents_to_paraphrase, " and of shape- ", sents_to_paraphrase.shape)

      for batch_elem in range(batch_size):
        if next_token[batch_elem].item() == eos_token_id:
            eos_emitted[batch_elem] = True

      if len_to_gen is None and all(eos_emitted):
        break

  scores = [x["score"] / len(x["sequence"]) for x in scores]

  return sents_to_paraphrase, scores

In [51]:
# Preprocess input from to paraphrase
def preprocess(exp, tokenizer, config, do_tokenize = True):
  max_prefix_len = config["max_prefix_length"]
  sent1 = np.array(exp["sent1_tokens"])

  # truncate
  if(len(sent1) > max_prefix_len):
    sent1 = sent1[:max_prefix_len]

  # add padding; left padding to prefix and right padding to suffix
  count_pad_tokens_prefix = max_prefix_len - len(sent1)
  sent1 = np.pad(sent1, (count_pad_tokens_prefix, 0), constant_values = tokenizer.pad_token_id)

  # sentence to input gpt2
  sentence_to_input_gpt2 = np.concatenate([sent1, [tokenizer.bos_token_id]]).astype(np.int64) # [sent1, <bos>]

  # segment
  segment = np.concatenate([
      [tokenizer.additional_special_tokens_ids[0] for _ in sent1],
      [tokenizer.additional_special_tokens_ids[1]]
  ]).astype(np.int64)

  exp["input"] = sentence_to_input_gpt2
  exp["segment"] = segment

  return exp

In [83]:
# Generate paraphrased sentences for batch of input sentences
def generate_paraphrased_sents_batchwise(paraphraser, upper_length, top_p, top_k, sents, config, device, tokenizer):
  examples = []

  for sent in sents:
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))
    dd = {"sent1_tokens":token_ids}

    dd_process = preprocess(dd, tokenizer, config, do_tokenize = False)
    examples.append(dd_process)

  init_context_size = 1 + config["max_prefix_length"]

  print("example to paraphrase-0 ", examples[0])

  output, scores = generate(paraphraser,
      sents_to_paraphrase = torch.tensor([inst["input"] for inst in examples]).to(device),
      segments = torch.tensor([inst["segment"] for inst in examples]).to(device),
      eos_token_id = tokenizer.eos_token_id,
      top_p = top_p, top_k = top_k,
      len_to_gen =  config["max_suffix_length"] + 1  # +1 for <eos>
  )

  all_ops = []
  for idx in range(len(output)):
    exmp = examples[idx]
    curr_out = output[idx, init_context_size:].tolist()

    if tokenizer.eos_token_id in curr_out:
      curr_out = curr_out[:curr_out.index(tokenizer.eos_token_id)]

    if upper_length.startswith("same"):
      extra = int(upper_length.split("_")[-1])
      curr_out = curr_out[:len(exmp["sent1_tokens"]) + extra]

    all_ops.append(tokenizer.decode(curr_out, clean_up_tokenization_spaces = True, skip_special_tokens = True))

  return all_ops, scores

In [13]:
database_config = {
    "shakespeare" : (data_to_paraphrased_dirs + "/shakespeare_data", 0.0, 1.0), # (data_dir, top_p, top_k)
    "bible" : (data_to_paraphrased_dirs + "/bible_data", 0.0, 1.0),
    "poetry" : (data_to_paraphrased_dirs + "/poetry_data", 0.0, 1.0)
}

In [11]:
database_config

{'shakespeare': ('/content/drive/MyDrive/IRE/DVP_PARAPHRASED/shakespeare_data',
  0.0,
  1.0),
 'bible': ('/content/drive/MyDrive/IRE/DVP_PARAPHRASED/bible_data', 0.0, 1.0),
 'poetry': ('/content/drive/MyDrive/IRE/DVP_PARAPHRASED/poetry_data',
  0.0,
  1.0)}

In [86]:
# Convert sentences to their paraphrased version and saved
for dataset_name, dataset_config in database_config.items():
  print("Starting paraphrasing for ", dataset_name)

  for split in ["train", "test", "dev"]:
    file_dir = dataset_config[0] + "/" + split + ".txt"
    top_p = dataset_config[1]
    top_k = dataset_config[2]

    with open(file_dir, "r") as f:
      data = f.read().strip().split("\n")

    # FOR PIPELINE TESTING PURPOSE
    data = data[:1]

    paraphrased_sents = []
    for i in tqdm.tqdm(range(0, len(data), batch_size)):
        generations, scores = generate_paraphrased_sents_batchwise(paraphraser, upper_length, top_p, top_k, data[i:i + batch_size], INPUT_FORMAT_CONFIG, device, tokenizer)
        paraphrased_sents.extend(generations)

    print(split + " Sentences paraphrased for ", dataset_name)

    # save file
    fname = dataset_config[0] + "/" + split + ".dvp_paraphrased.txt"
    with open(fname, "w") as f:
      f.write("\n".join(paraphrased_sents) + "\n")

    print(split + " Paraphrased sentences for " + dataset_name + " saved in file")

  print("Done paraphrasing for ", dataset_name)

Starting paraphrsing for  shakespeare


  0%|          | 0/1 [00:00<?, ?it/s]

example to paraphrase-0  {'sent1_tokens': [40, 423, 257, 2000, 284, 5587, 17903, 304, 260, 14210, 2740, 338, 83, 764], 'input': array([50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
       50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
       50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
       50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
          40,   423,   257,  2000,   284,  5587, 17903,   304,   260,
       14210,  2740,   338,    83,   764, 50260]), 'segment': array([50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
       50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
       50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
       50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
       50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257,
       50257, 50257, 50257, 50257, 50257, 50258])}
batch size-  1
sents to rephrase-  tensor([[50259, 50259, 5

100%|██████████| 1/1 [00:05<00:00,  5.11s/it]

op_logits shape-  torch.Size([1, 1, 50262])
next token logits of shape-  torch.Size([1, 50262])
filtered_logits of shape-  torch.Size([1, 50262])
next token-  tensor([[50259]])  and of shape-  torch.Size([1, 1])
concat sents to chnage-  tensor([[50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
         50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
         50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
         50259, 50259, 50259, 50259, 50259, 50259,    40,   423,   257,  2000,
           284,  5587, 17903,   304,   260, 14210,  2740,   338,    83,   764,
         50260, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
         50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
         50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
         50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
         50259, 50259, 50259, 50259, 50259, 50259, 5




In [88]:
# Till now,
# Paraphrasing done for datasets in database_config for each split
# Now convert to bpe files

In [None]:
# Install dependencies
# !pip install bitarray
# !pip install sacrebleu

In [3]:
# !pip install hydra-core omegaconf

Collecting hydra-core
  Downloading hydra_core-1.3.2-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.5/154.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting omegaconf
  Downloading omegaconf-2.3.0-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting antlr4-python3-runtime==4.9.* (from hydra-core)
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: antlr4-python3-runtime
  Building wheel for antlr4-python3-runtime (setup.py) ... [?25l[?25hdone
  Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.9.3-py3-none-any.whl size=144555 sha256=2062e0b4d10a2567358cf01bb528d70add8

In [2]:
roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')

Using cache found in /root/.cache/torch/hub/pytorch_fairseq_main
100%|██████████| 231160875/231160875 [00:04<00:00, 49655395.24B/s]
The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  self.delegate = real_initialize(
See https://hydra.cc/docs/1.2/upgrades/1.0_to_1.1/changes_to_package_header for more information
'config' is validated against ConfigStore schema with the same name.
This behavior is deprecated in Hydra 1.1 and will be removed in Hydra 1.2.
See https://hydra.cc/docs/1.2/upgrades/1.0_to_1.1/automatic_schema_matching for migration instructions.
  state = load_checkpoint_to_cpu(filename, arg_overrides)
The strict flag in the compose API is deprecated.
See https://hydra.cc/docs/1.2/upgrades/0.11_to_1.0/strict_mode_flag_deprecated for more info.

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  self.dele

In [5]:
# Fn to convert data to byte pair encoding file
def convert_to_bpe_and_save(data, fname, folder):
  bpe_data = [roberta.bpe.encode(x) for x in tqdm.tqdm(data)]
  with open(folder + "/" + fname, "w") as f:
        f.write("\n".join(bpe_data) + "\n")
  return

In [9]:
# Create BPE files for generated pseudo parellel data using Roberta
for dataset_name, dataset_config in database_config.items():
  print("Starting for ", dataset_name)

  for split in ["train", "dev", "test"]:
    bpe_folder = dataset_config[0] + "/BPE"
    file_name_orig_abs = dataset_config[0] + "/" + split + ".txt"
    file_name_orig_paraphrased = dataset_config[0] + "/" + split + ".dvp_paraphrased.txt"

    with open(file_name_orig_abs, "r") as f:
      data_orig = f.read().strip().split("\n")

    with open(file_name_orig_paraphrased, "r") as f:
      data_paraphrased = f.read().strip().split("\n")

    # convert original file and paraphrased files to bpe folder after conversion to bpe format using roberta.base
    convert_to_bpe_and_save(data_orig, split + ".input0.bpe", bpe_folder)
    convert_to_bpe_and_save(data_paraphrased, split + ".paraphrase_250_input0.bpe", bpe_folder)

    print("\nDone conversion for " + split + " split for dataset- " + dataset_name)

Starting for  shakespeare


100%|██████████| 8/8 [00:00<00:00, 1305.77it/s]

Done conversion for train split for dataset- shakespeare





In [15]:
# Done paraphrasing

In [14]:
# Hack:-
# We already have converted bpe files for every dataset; can directly use them for training inverse paraphraser
# As we don't have raw data available directly to train the model
# Only total of 1000 samples are there; which is very less and that also needs to be divided into train, test and dev splits.

# available in /content/drive/MyDrive/IRE_Project/style_transfer_paraphrase/datasets/cds

In [None]:
# Finished paraphrasing