<a href="https://colab.research.google.com/github/joshuaalpuerto/ML-guide/blob/main/LM_Paraphraser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q -U git+https://github.com/huggingface/transformers.git --progress-bar off

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Parrot

In [None]:
!pip install git+https://github.com/PrithivirajDamodaran/Parrot_Paraphraser.git

In [None]:
from parrot import Parrot
import torch
import warnings
warnings.filterwarnings("ignore")

def random_state(seed):
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

random_state(1234)

#Init models (make sure you init ONLY once if you integrate this to your code)
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5")

In [None]:

phrases = ["What are the best places to see in New York?"]

def para_phrase_augmenter(phrase):
  para_phrases = parrot.augment(input_phrase=phrase,
                               use_gpu=True,
                               diversity_ranker="levenshtein",
                               do_diverse=False,
                               max_return_phrases = 3,
                               max_length=50,
                               adequacy_threshold = 0.30,
                               fluency_threshold = 0.10)
  try:
    results = [para_phrase[0] for para_phrase in para_phrases]
  except TypeError:
    return []

  return results


for phrase in phrases:
  print("-"*100)
  print("Input_phrase: ", phrase)
  print("-"*100)
  para_phrases = para_phrase_augmenter(phrase)
  for para_phrase in para_phrases:
   print(para_phrase)


# humarin/chatgpt_paraphraser_on_T5_base

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = "cuda"

tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base").to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [14]:
def paraphrase_augmenter(
    question,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=128
):
    input_ids = tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    ).input_ids.cuda()

    outputs = model.generate(
        input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    res = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return res

In [7]:
text = 'What are the best places to see in New York?'
paraphrase(text)



['What are some must-see places in New York?',
 'Can you suggest some must-see spots in New York?',
 'Where should one go to experience the best NYC has to offer?',
 'Which places should I visit in New York?',
 'What are the top destinations to explore in New York?']

In [16]:
import json
import pandas as pd

def load_json(json_file, columns=['question', 'answer', 'country']):
  with open(f"/content/drive/MyDrive/datasets/{json_file}", 'r') as json_file:
    json_data = json.load(json_file)

  json_data = json_data["questions"] if "questions" in json_data else json_data

  data = pd.DataFrame(json_data)

  data = data.dropna()

  data = data[columns]

  # Remove duplicates where question and answer is the same.
  data = data.drop_duplicates(subset=['question', 'answer'])

  return data

# data = load_json('qna-augmented.json')
data = load_json('qna-clean.json')


# Print modified data
print(len(data))
data.head()

331


Unnamed: 0,question,answer,country
0,"If I move to a new country, will my tax reside...","No, you'll have to notify the Estonian tax aut...",Estonia
1,Is tax residency linked to my ID?,Everything government-related is linked to you...,Estonia
2,When should I apply for a D visa?,In general the visa applications are reviewed ...,Estonia
3,How long does it take to receive a residence p...,"By law, the processing of your application can...",Estonia
4,"If I change employers in Estonia, should my ta...","No, tax residency is linked to you personally ...",Estonia


In [12]:
# phrases = [d['question'] for _, d in data[:10].iterrows()]
phrases = [d['question'] for _, d in data[:3].iterrows()]

for phrase in phrases:
  print("-"*100)
  print("Input_phrase: ", phrase)
  print("-"*100)
  para_phrases = paraphrase_augmenter(phrase)
  for para_phrase in para_phrases:
   print(para_phrase)


----------------------------------------------------------------------------------------------------
Input_phrase:  If I move to a new country, will my tax residency change automatically?
----------------------------------------------------------------------------------------------------




Will my tax residency be changed if I move to another country?
If I move to another country, will my tax residency be changed automatically?
Is it necessary to change my tax residency when moving to a different country?
Does the choice of country I move to affect my tax residency automatically?
Can I change my tax residency without any additional paperwork or legal requirements if I move to another country?
----------------------------------------------------------------------------------------------------
Input_phrase:  Is tax residency linked to my ID?
----------------------------------------------------------------------------------------------------
Does my ID proof relate to my tax residency status?
Is my tax residency associated with my identification?
Will my tax residency be affected by the way my ID is registered?
Can I use my ID proof to verify my tax residency?
Do I need to provide identification proof for tax residency?
------------------------------------------------------

In [17]:
import concurrent.futures
from tqdm import tqdm

# Example function to process each element of the array
def process_element(obj):
    # Extract desired properties from each object
    text = obj["question"]

    paraphrases = paraphrase_augmenter(text)

    result = []
    # Append the actual question
    result.append({
        "question": text,
        "answer": obj["answer"],
        "country": obj["country"],
    })
    # Append the paraphrased question
    for paraphrase in paraphrases:
        result.append({
            "question": paraphrase,
            "answer": obj["answer"],
            "country": obj["country"],
        })

    return result

processed_results = []

for _, d in data.iterrows():
  result = process_element(d)
  processed_results.extend(result)

print(processed_results)



[{'question': 'If I move to a new country, will my tax residency change automatically?', 'answer': "No, you'll have to notify the Estonian tax authorities by submitting Form R.", 'country': 'Estonia'}, {'question': 'Will my tax residency be changed if I move to another country?', 'answer': "No, you'll have to notify the Estonian tax authorities by submitting Form R.", 'country': 'Estonia'}, {'question': 'If I move to another country, will my tax residency be changed automatically?', 'answer': "No, you'll have to notify the Estonian tax authorities by submitting Form R.", 'country': 'Estonia'}, {'question': 'Is it necessary to change my tax residency when moving to a different country?', 'answer': "No, you'll have to notify the Estonian tax authorities by submitting Form R.", 'country': 'Estonia'}, {'question': 'Does the choice of country I move to affect my tax residency automatically?', 'answer': "No, you'll have to notify the Estonian tax authorities by submitting Form R.", 'country'

In [18]:
import json

FILE_NAME = "qna-clean-augmented.json"

processed_results = list(processed_results)

json_file_path = f"/content/drive/MyDrive/datasets/{FILE_NAME}"
# Write the data to the JSON file to out drive
with open(json_file_path, 'w') as json_file:
    json.dump(processed_results, json_file, indent=4)

print(f'JSON data written to {json_file_path}')

JSON data written to /content/drive/MyDrive/datasets/qna-augmented.json
