### Prerequisite Installations

In [None]:
!pip install sentencepiece

In [None]:
!pip install transformers

### Training Data Generation

In [None]:
import json
from random import choice, choices, sample

import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoModelWithLMHead, AutoTokenizer

In [None]:
# Read the manually curated (hand-written) training data.
# Each entry is a dict with 'intent', 'query' and 'response' keys.
file_path = '/content/drive/MyDrive/Colab Notebooks/VICCI/data/hand_written_train_data.json'
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    training_data = json.load(file)

In [None]:
# Read the state names, one per line, normalised to stripped lower case.
file_path = '/content/drive/MyDrive/Colab Notebooks/VICCI/data/state_names.txt'
# 'utf-8-sig' transparently drops a leading byte-order mark if the file has one.
with open(file_path, 'r', encoding='utf-8-sig') as file:
    # Skip blank lines: an empty entry would later replace a state name with ''.
    STATE_LIST = [sname.strip().lower() for sname in file if sname.strip()]

In [None]:
# Display a single entry of the hand-written data to inspect its structure
# (an 'intent' label, a list of 'query' strings and a list of 'response' strings).
training_data[10]

{'intent': 'spread',
 'query': ['How does COVID-19 spread?',
  'How is COVID spreading?',
  'How can I get infected?',
  'What can I catch COVID?',
  'What is causing the virus to spread so fast?',
  'How is it spreading so fast?',
  'How can COVID-19 spread?',
  'How does this virus spreads?',
  'How is the virus spreading?',
  'How can I catch COVID?'],
 'response': ['People can catch COVID-19 from others who have the virus. The disease can spread from person to person through small droplets from the nose or mouth which are spread when a person with COVID-19 coughs or exhales. These droplets land on objects and surfaces around the person. Other people then catch COVID-19 by touching these objects or surfaces, then touching their eyes, nose or mouth. People can also catch COVID-19 if they breathe in droplets from a person with COVID-19 who coughs out or exhales droplets. This is why it is important to stay more than 1 meter (3 feet) away from a person who is sick.']}

In [None]:
# Hugging Face Hub checkpoint used for paraphrasing; named once so the tokenizer
# and the model can never drift apart.
PARAPHRASE_MODEL_NAME = "mrm8488/t5-small-finetuned-quora-for-paraphrasing"

tokenizer = AutoTokenizer.from_pretrained(PARAPHRASE_MODEL_NAME)
# AutoModelWithLMHead is deprecated in transformers; AutoModelForSeq2SeqLM is the
# supported auto-class for encoder-decoder (T5-style) checkpoints like this one.
model = AutoModelForSeq2SeqLM.from_pretrained(PARAPHRASE_MODEL_NAME)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


In [None]:
# Huggingface article by Manuel Romero
# https://huggingface.co/mrm8488/t5-small-finetuned-quora-for-paraphrasing

def paraphrase(text, num_sequences=5, max_length=128):
    """Generate paraphrases of ``text`` using the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: Input string handed to the tokenizer as-is.
        num_sequences: Number of candidate sequences requested from ``model.generate``.
        max_length: Maximum length passed to ``model.generate``.

    Returns:
        A list of decoded strings, one per generated sequence, with special
        tokens removed and tokenization spaces cleaned up.
    """
    # Beam search with 10 beams; the n-gram and repetition penalties discourage
    # the decoder from repeating itself.
    generation_options = dict(
        num_return_sequences=num_sequences,
        num_beams=10,
        max_length=max_length,
        no_repeat_ngram_size=2,
        repetition_penalty=3.5,
        length_penalty=1.0,
        early_stopping=True,
    )
    encoded_text = tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)
    candidate_ids = model.generate(input_ids=encoded_text, **generation_options)

    paraphrases = []
    for sequence_ids in candidate_ids:
        decoded = tokenizer.decode(
            sequence_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        paraphrases.append(decoded)
    return paraphrases

In [None]:
# Number of queries generated for every intent.
QUERIES_PER_INTENT = 50
# Intents whose hand-written queries are oversampled instead of paraphrased.
OVERSAMPLED_INTENTS = ('greet', 'bye', 'thanks', 'intro')

# Build the enlarged training set: one entry per intent, each holding
# QUERIES_PER_INTENT queries plus the original, unchanged responses.
training_data_big = []
for train_set in training_data:
    intent = train_set['intent']

    if intent in OVERSAMPLED_INTENTS:
        # For these intents the input questions are very short and crisp, but the
        # generated paraphrases are too long and carry irrelevant context, so we
        # simply oversample (with replacement) our own hand-written queries.
        query_list = choices(train_set['query'], k=QUERIES_PER_INTENT)
    else:
        # Collect the paraphrases generated for each hand-written question.
        # Some paraphrases differ only by case, so lower-case them and use a set
        # to drop the duplicates.
        query_pool = set()
        for query in train_set['query']:
            query_pool.update(
                phrase.lower()
                for phrase in paraphrase("paraphrase: " + query, num_sequences=5)
            )

        # random.sample() no longer accepts a set (deprecated in Python 3.9,
        # TypeError since 3.11), so draw from a sorted list instead.
        candidates = sorted(query_pool)
        if len(candidates) >= QUERIES_PER_INTENT:
            # Enough distinct paraphrases: take a random subset without replacement.
            query_list = sample(candidates, k=QUERIES_PER_INTENT)
        else:
            # Too few: keep them all and top up with random repeats.
            query_list = candidates + choices(
                candidates, k=QUERIES_PER_INTENT - len(candidates)
            )

        if intent == 'covid_numbers':
            # The hand-written data only mentions the state "Delhi"; to avoid
            # biasing the training data, swap it for a randomly chosen state name.
            # This must run whatever the pool size was, so it sits outside the
            # size check above.
            query_list = [ql.replace('delhi', choice(STATE_LIST)) for ql in query_list]

    training_data_big.append({'intent': intent, 'query': query_list,
                              'response': train_set['response']})

In [None]:
# Persist the generated training set as pretty-printed JSON (4-space indent).
file_path = '/content/drive/MyDrive/Colab Notebooks/VICCI/data/generated_train_data.json'
with open(file_path, 'w') as out_file:
    json.dump(training_data_big, out_file, indent=4)