<a href="https://colab.research.google.com/github/joshuaalpuerto/ML-guide/blob/main/LM_Paraphraser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%bash

# Requirements that carry a version specifier must be quoted: left bare, bash
# parses ">=1.6.0" as an output redirection, so the constraint is dropped and
# pip's output is written to a stray file instead of being shown.
pip install numpy requests nlpaug augly textacy datasets
pip install "torch>=1.6.0" "transformers>=4.11.3" sentencepiece
pip install git+https://github.com/PrithivirajDamodaran/Parrot_Paraphraser.git

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): no visible cell reads from the mount — confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

# Parrot

In [None]:
from parrot import Parrot
import torch
import warnings
# Install a global "ignore" filter: every Python warning raised after this
# point is suppressed (presumably to keep the cell output readable).
warnings.filterwarnings("ignore")

def random_state(seed):
  """Seed torch's random number generators.

  Args:
    seed: Integer seed handed to ``torch.manual_seed`` and, when CUDA is
      available, to ``torch.cuda.manual_seed_all``.

  Returns:
    None.
  """
  torch.manual_seed(seed)

  # Nothing more to do on a CPU-only runtime.
  if not torch.cuda.is_available():
    return

  torch.cuda.manual_seed_all(seed)

# Seed torch's RNGs (CPU and, when available, CUDA) before the model is created.
random_state(1234)

# Init models (make sure you init ONLY once if you integrate this into your code).
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5")

In [None]:

# Input sentences to paraphrase; the loop at the end of this cell iterates over them.
phrases = ["What are the best places to see in New York?"]

def para_phrase_augmenter(phrase):
  """Paraphrase ``phrase`` with the module-level ``parrot`` model.

  Args:
    phrase: Sentence to paraphrase.

  Returns:
    A list with the first element of every candidate that ``parrot.augment``
    returns (the paraphrase text), or ``[]`` when it returns nothing.
  """
  para_phrases = parrot.augment(
      input_phrase=phrase,
      # Only ask for the GPU when one exists, so the cell also runs on
      # CPU-only runtimes instead of failing while moving the model to CUDA.
      use_gpu=torch.cuda.is_available(),
      diversity_ranker="levenshtein",
      do_diverse=False,
      max_return_phrases=3,
      max_length=50,
      adequacy_threshold=0.30,
      fluency_threshold=0.10,
  )

  # Handle "no paraphrases" (None or an empty list) explicitly rather than
  # catching TypeError, which would also hide unrelated programming errors.
  if not para_phrases:
    return []

  # Keep only the first element of each candidate; the remaining elements
  # (presumably a ranking score — TODO confirm) are discarded.
  return [para_phrase[0] for para_phrase in para_phrases]


# Print every input phrase between two 100-character rules, followed by its
# paraphrases, one per line.
for phrase in phrases:
  print("-" * 100)
  print("Input_phrase: ", phrase)
  print("-" * 100)

  para_phrases = para_phrase_augmenter(phrase)
  for para_phrase in para_phrases:
    print(para_phrase)


# humarin/chatgpt_paraphraser_on_T5_base

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Use the GPU when one is present and fall back to the CPU otherwise, so this
# cell does not fail on CPU-only runtimes. `torch` is imported in the Parrot
# section above.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and the seq2seq model from the same checkpoint, then move
# the model to the selected device.
tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base").to(device)

In [None]:
def paraphrase(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    """Generate paraphrases of ``question`` with the module-level model.

    The text is prefixed with ``"paraphrase: "``, tokenized, passed to
    ``model.generate`` and the generated sequences are decoded to strings.

    Args:
        question: Text to paraphrase.
        num_beams: Forwarded unchanged to ``model.generate``.
        num_beam_groups: Forwarded unchanged to ``model.generate``.
        num_return_sequences: Forwarded unchanged to ``model.generate``.
        repetition_penalty: Forwarded unchanged to ``model.generate``.
        diversity_penalty: Forwarded unchanged to ``model.generate``.
        no_repeat_ngram_size: Forwarded unchanged to ``model.generate``.
        temperature: Forwarded unchanged to ``model.generate``.
        max_length: Used twice: as the truncation length for the tokenized
            input and as ``max_length`` for generation.

    Returns:
        The list of strings produced by ``tokenizer.batch_decode`` with
        special tokens skipped.
    """
    encoding = tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    )

    # The tokenizer returns CPU tensors; move them to wherever the model lives,
    # otherwise generate() fails with a device mismatch when the model is on GPU.
    input_ids = encoding.input_ids.to(model.device)

    outputs = model.generate(
        input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)