In [1]:
# Inference; Given a sentence and target style, transfer the style

In [21]:
# ----------- IMP -------------
# Change value of DEVICE_1 to 1 when running in kaggle

In [1]:
!pip install transformers

[0m

In [None]:
!pip install profanity_filter # ----- CHECK THIS -------

In [25]:
!pip install cmake



In [2]:
# Desired imports
import torch
from transformers import (WEIGHTS_NAME, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)
import numpy as np
import torch.nn.functional as F
import tqdm

In [3]:
# Required Directories
final_invp_paraphrase_root_dir = "/content/drive/MyDrive/IRE/INVP/final_INVP"
final_dvp_paraphrase_root_dir = "/content/drive/MyDrive/IRE/DVP/final_DVP"
src_data_samples_dir = "/content/drive/MyDrive/IRE/SOURCE_SAMPLES"
final_style_transferred_dir = "/content/drive/MyDrive/IRE/STYLE_TRANSFERRED"

# Target style that are only possible; since inverse paraphraser models are trained only
possible_target_style = {
    "shakespeare" : "shakespeare", # style : invp_model_dir
    "poetry" : "poetry",
    "bible" : "bible"
}

# Root dir for source style data_samples
src_data_samples_fname = { # style:fname_to_read
    "shakespeare" : "shakespeare.txt",
    "poetry" : "romantic_poetry.txt",
    "bible" : "bible.txt",
    "aae" : "aae.txt",
    "joyce" : "joyce.txt",
    "switchboard" : "switchboard.txt",
    "english_tweets" : "english_tweets.txt"
}

In [4]:
# Choose device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpus = torch.cuda.device_count()

print("Device- ", device)
print("No. of GPUs- ", n_gpus)

DEVICE_0 = 0
DEVICE_1 = 0 # change here to 1 when run in kaggle; enables inverse paraphraser to load in other GPU

Device-  cuda
No. of GPUs-  1


In [5]:
# Model & tokenizer type that will be used to load models
MODEL_CLASSES = {
    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
}
_, model_class, tokenizer_class = MODEL_CLASSES["gpt2"]

print("GPT2 Model class- ", model_class)
print("GPT2 Tokenizer class- ", tokenizer_class)

GPT2 Model class-  <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>
GPT2 Tokenizer class-  <class 'transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer'>


In [6]:
dvp_on_device = DEVICE_0
invp_on_device = DEVICE_1

In [9]:
# load DVP
with torch.cuda.device(dvp_on_device):
  print("Loading Diverse Paraphraser...")
  dvp_chkpoint = final_dvp_paraphrase_root_dir
  dvp_paraphraser = model_class.from_pretrained(dvp_chkpoint)
  dvp_tokenizer = tokenizer_class.from_pretrained(dvp_chkpoint, do_lower_case = True)

  dvp_paraphraser.to(torch.cuda.current_device())

  print("Done loading Diverse Paraphraser..!!")

Loading Diverse Paraphraser...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Done loading Diverse Paraphraser..!!


In [11]:
# Init configs

MAX_PARAPHRASE_LEN = 100

# mainly to handle input
CONFIG = {
    "max_prefix_length": int(MAX_PARAPHRASE_LEN / 2),
    "max_suffix_length": int(MAX_PARAPHRASE_LEN / 2)
}

In [12]:
# Do Nucleas sampling and greedy decoding; vary top_p to consider more diversity
def top_k_top_p_filtering(logits, top_k = 0, top_p = 0.0, filter_amt = -float('Inf')):
  top_k = min(top_k, logits.size(-1))  # tok_k should be less than total vocab size

  if top_p > 0.0:
      sorted_logits, sorted_idxs = torch.sort(logits, descending = True)
      cumul_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

      # Remove tokens with cumulative probability above the threshold
      sorted_idxs_to_remove = cumul_probs > top_p

      # Shift the indices to the right to keep also the first token above the threshold
      sorted_idxs_to_remove[..., 1:] = sorted_idxs_to_remove[..., :-1].clone()
      sorted_idxs_to_remove[..., 0] = 0

      # scatter sorted tensors to original indexing
      indices_to_remove = sorted_idxs_to_remove.scatter(dim = 1, index = sorted_idxs, src = sorted_idxs_to_remove)
      logits[indices_to_remove] = filter_amt

  elif top_k > 0:
      # Remove all tokens with a probability less than the last token of the top_k
      indices_to_remove = logits < torch.topk(logits, int(top_k))[0][..., -1, None]
      logits[indices_to_remove] = filter_amt

  return logits

In [29]:
# Get output logits from paraphraser
def get_logits(paraphraser, idx, sents, segments, past):
  if idx == 0:
      pred = paraphraser(input_ids = sents, token_type_ids = segments, return_dict=True)
  else:
      # used the cached representations to speed up decoding
      pred = paraphraser(input_ids = sents[:, -1:], token_type_ids = segments[:, -1:], past_key_values = past, return_dict = True)

  logits = pred['logits']
  past_keys = pred['past_key_values']

  return logits, past_keys

In [30]:
# Decide generation lenght and get converted output and score
def generate(paraphraser, sents_to_paraphrase, segments, eos_token_id, top_p, top_k, len_to_gen):
  batch_size = sents_to_paraphrase.shape[0] # total sents in batch

  eos_emitted = [False for _ in range(batch_size)]
  scores = [{"score": 0, "sequence": []} for _ in range(batch_size)]

  with torch.no_grad():
    past_keys = None

    for i in range(len_to_gen):
      op_logits, past_keys = get_logits(paraphraser, i, sents_to_paraphrase, segments, past_keys)
      next_token_logits = op_logits[:, -1, :]
      original_scores = F.log_softmax(next_token_logits, dim = -1)

      # do nucleas filtering and greedy decoding
      filtered_logits = top_k_top_p_filtering(next_token_logits, top_k = top_k, top_p = top_p)

      if top_k in [0, 1] and top_p == 0.0: # mainly to control the output diversity
        # greedy sampling
        next_token = torch.argmax(filtered_logits, dim = -1).unsqueeze(-1)
      else :
        next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples = 1)

      sents_to_paraphrase = torch.cat((sents_to_paraphrase, next_token), dim=1)
      segments = torch.cat((segments, segments[:, -1:]), dim=1)

      for batch_elem in range(batch_size):
        if next_token[batch_elem].item() == eos_token_id:
            eos_emitted[batch_elem] = True

      if len_to_gen is None and all(eos_emitted):
        break

  return sents_to_paraphrase, scores

In [15]:
# Preprocess input from to paraphrase
def preprocess(exp, tokenizer, config):
  max_prefix_len = config["max_prefix_length"]
  sent1 = np.array(exp["sent1_tokens"])

  # truncate
  if(len(sent1) > max_prefix_len):
    sent1 = sent1[:max_prefix_len]

  # add padding; left padding to prefix and right padding to suffix
  count_pad_tokens_prefix = max_prefix_len - len(sent1)
  sent1 = np.pad(sent1, (count_pad_tokens_prefix, 0), constant_values = tokenizer.pad_token_id)

  # sentence to input gpt2
  sentence_to_input_gpt2 = np.concatenate([sent1, [tokenizer.bos_token_id]]).astype(np.int64) # [sent1, <bos>]

  # segment
  segment = np.concatenate([
      [tokenizer.additional_special_tokens_ids[0] for _ in sent1],
      [tokenizer.additional_special_tokens_ids[1]]
  ]).astype(np.int64)

  exp["input"] = sentence_to_input_gpt2
  exp["segment"] = segment

  return exp

In [16]:
# Generate paraphrased sentences for batch of input sentences
def generate_paraphrased_sent(paraphraser, upper_length, top_p, top_k, sents, config, device, tokenizer):
  examples = []

  for sent in sents:
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sent))
    dd = {"sent1_tokens":token_ids}

    dd_process = preprocess(dd, tokenizer, config)
    examples.append(dd_process)

  init_context_size = 1 + config["max_prefix_length"]

  # print("example to paraphrase-0 ", examples[0])

  output, scores = generate(paraphraser,
      sents_to_paraphrase = torch.tensor([inst["input"] for inst in examples]).to(device),
      segments = torch.tensor([inst["segment"] for inst in examples]).to(device),
      eos_token_id = tokenizer.eos_token_id,
      top_p = top_p, top_k = top_k,
      len_to_gen =  config["max_suffix_length"] + 1  # +1 for <eos>
  )

  all_ops = []
  for idx in range(len(output)):
    exmp = examples[idx]
    curr_out = output[idx, init_context_size:].tolist()

    if tokenizer.eos_token_id in curr_out:
      curr_out = curr_out[:curr_out.index(tokenizer.eos_token_id)]

    if upper_length.startswith("same"):
      extra = int(upper_length.split("_")[-1])
      curr_out = curr_out[:len(exmp["sent1_tokens"]) + extra]

    all_ops.append(tokenizer.decode(curr_out, clean_up_tokenization_spaces = True, skip_special_tokens = True))

  return all_ops, scores

In [17]:
# Load INVP for target style
def load_inverse_paraphraser(target_style, device_no, invp_paraphrase_root_dir, model_class, tokenizer_class):

  with torch.cuda.device(device_no):
    print("Loading target style inverse paraphraser...")
    invp_chkpoint = invp_paraphrase_root_dir + "/" + target_style
    invp_paraphraser = model_class.from_pretrained(invp_chkpoint)
    invp_tokenizer = tokenizer_class.from_pretrained(invp_chkpoint, do_lower_case = True)

    invp_paraphraser.to(torch.cuda.current_device())
    print("Done loading target style Inverse Paraphraser..!!")

    return invp_paraphraser, invp_tokenizer

In [27]:
def transfer_for_single_sentence(model_class, tokenizer_class, target_style, sents_to_transform,
                                 invp_paraphrase_root_dir, invp_on_device,
                                 dvp_paraphraser, dvp_tokenizer, dvp_on_device, config, device):

  input_samples = sents_to_transform

  # load inverse paraphraser
  invp_paraphraser, invp_tokenizer = load_inverse_paraphraser(target_style, invp_on_device, invp_paraphrase_root_dir,
                                                              model_class, tokenizer_class)
  # For DVP
  top_p_paraphrase = 0.0
  top_k = 1

  # Get outout from DVP
  with torch.cuda.device(dvp_on_device):
    dvp_outputs, _ = generate_paraphrased_sent(dvp_paraphraser,"same_5",
                                            top_p_paraphrase, top_k,
                                            input_samples, config, torch.cuda.current_device(), dvp_tokenizer)

  # Pass dvp_output to loaded INVP
  top_p_style = 0.7
  top_k = 1
  with torch.cuda.device(invp_on_device):
    style_transferred_outputs, _ = generate_paraphrased_sent(invp_paraphraser, "same_5",
                                                        top_p_style, top_k, dvp_outputs,
                                                        config, torch.cuda.current_device(), invp_tokenizer)

  return style_transferred_outputs # list of strings

In [32]:
# Sent to transform and target style
sent_to_transform = "my name is ayush"
source_style = "bible" # btw; it does not even matter, only thing that matters is the target style; only used to get source style sample
target_style = "shakespeare"

style_transferred_sent = transfer_for_single_sentence(model_class, tokenizer_class,
                                                      target_style, [sent_to_transform],
                                                      final_invp_paraphrase_root_dir, invp_on_device,
                                                      dvp_paraphraser, dvp_tokenizer, dvp_on_device, CONFIG, device)

print("\nSent [ORIGINAL]- ", sent_to_transform)
print("Sent ["+target_style+"]- ", style_transferred_sent[0])

Loading target style inverse paraphraser...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Done loading target style Inverse Paraphraser..!!

Sent [ORIGINAL]-  my name is ayush
Sent [shakespeare]-  


In [33]:
# for multiple sentences and to save them in file
def do_style_transformation(from_style, to_style, src_data_samples_dir, data_samples_fname, style_transferred_dir):

  # read source data
  file_name_to_read = src_data_samples_dir + "/" + data_samples_fname[from_style]
  with open(file_name_to_read, "r") as f:
    data_orig = f.read().strip().split("\n")

  data_to_convert = data_orig[:200]

  batch_size = 50
  style_transferred_sents = []

  for i in tqdm.tqdm(range(0, len(data_to_convert), batch_size)):
    sents_to_tranform = data_to_convert[i : i + batch_size]
    style_transferred_sents.extend(sents_to_tranform)

  # save style transferred sentences to file with the name <from_style>_To_<to_style>.txt in style_transferred_dir
  file_path = style_transferred_dir + "/" + from_style + "_To_" + to_style + ".txt"
  with open(file_path, "w") as f:
    f.writelines("%s\n" % item for item in style_transferred_sents)

  print(f"Transformation done for ({from_style}, {to_style})")

In [34]:
# Do for multiple sents pair;
# target_style must be ["shakespeare", "bible", "poetry"] becoz INVP are only trained for them

pairs_to_transform = [("bible", "shakespeare"), ("poetry", "shakespeare"), ("poetry", "bible")] # (from, to)
for (from_style, to_style) in pairs_to_transform:
  do_style_transformation(from_style, to_style, src_data_samples_dir,
                          src_data_samples_fname, final_style_transferred_dir)

100%|██████████| 1/1 [00:00<00:00, 9709.04it/s]


Transformation done for (bible, shakespeare)


In [None]:
# Inferencing done