## Connect to Google Drive

Google Drive is used for:
* Saving the output file.
* Caching the heavy synQA model so that it loads faster on subsequent runs.

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

## Download and Imports

Download all the required models and perform necessary imports for the models.

### KeyBert with KeyPhrase-Vectorizers

Model used to generate candidate answers (via keyphrase extraction) for the synQA model.

In [None]:
!pip install keyphrase-vectorizers
!pip install keybert

In [None]:
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer

### synQA-question-generators

Answer Aware model for Question Generation.

In [None]:
!pip install transformers
!pip install fairseq

In [None]:
import requests
import tarfile
from tqdm import tqdm
import os
from fairseq.models.transformer import TransformerModel

In [None]:
def download_synQA(url: str, fname: str, desc: str = None) -> None:
    """Stream ``url`` into the local file ``fname`` while showing a progress bar.

    Args:
        url: Address of the file to download.
        fname: Destination path; opened in binary write mode, so an existing
            file is overwritten.
        desc: Label shown on the progress bar. Falls back to ``fname`` when
            ``None``.

    Raises:
        requests.HTTPError: If the server replies with an error status. Without
            this check the error page would be written to ``fname`` as if it
            were the requested file.
    """
    desc = desc if desc is not None else fname
    # Use the response as a context manager so the connection is released
    # even if writing to disk fails part-way through.
    with requests.get(url, stream=True) as resp:
        resp.raise_for_status()
        # A missing content-length header yields total=0 (size unknown).
        total = int(resp.headers.get("content-length", 0))
        with open(fname, "wb") as file, tqdm(
            desc=desc, total=total, unit="iB", unit_scale=True, unit_divisor=1024
        ) as bar:
            for data in resp.iter_content(chunk_size=1024):
                size = file.write(data)
                bar.update(size)

In [None]:
def download_and_extract_synQA(MODEL_URL, MODELS_DIR):
    """Download (if needed) and unpack every synQA model archive.

    For each ``{archive_filename: url}`` entry the archive is unpacked into
    ``MODELS_DIR/<name>``, where ``<name>`` is the archive filename up to its
    first dot. A model whose ``checkpoint_best.pt`` already exists in that
    directory is skipped entirely, so repeated runs are a no-op.

    Only archive members that have a file extension are extracted, and they
    are flattened (directory components dropped). The archive itself is
    deleted after a successful extraction.

    Args:
        MODEL_URL: Mapping of archive filename to download URL.
        MODELS_DIR: Directory holding the archives and extracted models.

    Raises:
        Exception: If an archive member would be written outside the model
            directory.
    """

    def is_within_directory(directory, target):
        abs_directory = os.path.abspath(directory)
        abs_target = os.path.abspath(target)
        # commonpath compares whole path components; commonprefix compares
        # characters and would accept e.g. "/models/foo_evil" for "/models/foo".
        return os.path.commonpath([abs_directory, abs_target]) == abs_directory

    def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
        for member in tar.getmembers():
            member_path = os.path.join(path, member.name)
            if not is_within_directory(path, member_path):
                raise Exception("Attempted Path Traversal in Tar File")
        tar.extractall(path, members, numeric_owner=numeric_owner)

    for model_filename, url in MODEL_URL.items():
        model_name = model_filename.split(".")[0]
        model_tarfile_path = os.path.join(MODELS_DIR, model_filename)
        model_dir = os.path.join(MODELS_DIR, model_name)

        # Already extracted on a previous run: nothing to download, unpack or
        # delete (the archive was removed back then).
        if os.path.exists(os.path.join(model_dir, "checkpoint_best.pt")):
            continue

        if not os.path.exists(model_tarfile_path):
            # The target directory may not exist yet on a fresh Drive.
            os.makedirs(MODELS_DIR, exist_ok=True)
            download_synQA(url, model_tarfile_path, url)

        with tarfile.open(model_tarfile_path) as f:
            # Keep only members with an extension (i.e. no directories).
            members = [m for m in f.getmembers() if os.path.splitext(m.name)[-1]]
            # Flatten: drop directory info so files land directly in model_dir.
            for m in members:
                m.name = os.path.basename(m.name)
            safe_extract(f, model_dir, members=members)

        # Remove the archive only after it has been unpacked.
        os.remove(model_tarfile_path)

### t5_small

Answer Agnostic model for generating Question-Answer pairs.

In [None]:
!pip install transformers
!python -m nltk.downloader punkt
!git clone https://github.com/patil-suraj/question_generation.git

In [None]:
# Enter the repository cloned above so that its top-level `pipelines` module can
# be imported. NOTE: this changes the working directory for the rest of the
# session, so later relative paths resolve inside `question_generation/`, and
# the relative `%cd` will not find the directory if this cell is run a second time.
%cd question_generation
from pipelines import pipeline

### Spacy

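NER model used to decide which question-generation model to run: paragraphs in which spaCy finds no named entities are sent to synQA (with KeyBERT answer phrases), while all other paragraphs are sent to T5.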

In [None]:
import spacy
from spacy.symbols import *
import spacy.cli
spacy.cli.download("en_core_web_sm")

### Other imports

In [None]:
import csv
import json
import random
import pandas as pd

## Load Test Data

In [None]:
# Fetch the test data from Google Drive by file id.
# NOTE(review): presumably this is the `paragraphs.csv` read by relative path
# further down — confirm the downloaded file name.
!gdown 1RAD2mJbz4yQddZZaUvXDWQC6Okar-QVh

In [None]:
def test_to_theme_wise(test_paragraphs):
    theme_wise_para = {}
    for item in test_paragraphs:
        if item['theme'] not in theme_wise_para.keys():
            theme_wise_para[item['theme']] = [[item['id'],item['paragraph']],]
        else:
            theme_wise_para[item['theme']].append([item['id'],item['paragraph']])
    return theme_wise_para

## Question-Answer Generator Functions

In [None]:
def get_ans_phrases(kw_model, context, top_n=10, use_maxsum=True, diversity=0.2, nr_candidates=20):
    """Extract candidate answer phrases from ``context`` with a keyword model.

    Args:
        kw_model: Object exposing ``extract_keywords`` (a ``KeyBERT`` instance
            in this notebook).
        context: Paragraph text to extract phrases from.
        top_n: Forwarded to ``extract_keywords``.
        use_maxsum: Forwarded to ``extract_keywords``.
        diversity: Forwarded to ``extract_keywords``.
        nr_candidates: Forwarded to ``extract_keywords``.

    Returns:
        list containing the first element of every entry returned by
        ``extract_keywords``, in the same order.
        NOTE(review): assumes a single-document call returns a flat list of
        ``(phrase, score)`` pairs — confirm against the installed KeyBERT.
    """
    keyphrases = kw_model.extract_keywords(
        docs=[context],
        vectorizer=KeyphraseCountVectorizer(),
        top_n=top_n,
        # These three were previously hard-coded, silently ignoring the
        # values supplied by the caller.
        use_maxsum=use_maxsum,
        diversity=diversity,
        nr_candidates=nr_candidates,
    )
    return [entry[0] for entry in keyphrases]

In [None]:
def convert_example_to_input(example):
    """Serialise the strings in ``example`` into a single model input.

    The parts are joined with the separator token (space-padded on both
    sides) and the result is wrapped in the begin/end tokens taken from the
    module-level ``SPECIAL_TOKENS`` mapping.
    """
    separator = SPECIAL_TOKENS['sep_token']
    joined = f" {separator} ".join(example)
    begin = SPECIAL_TOKENS['bos_token']
    end = SPECIAL_TOKENS['eos_token']
    return f"{begin} {joined} {end}"

In [None]:
def clean_special_tokens(text):
    """Delete every token listed in ``SPECIAL_TOKENS`` from ``text``.

    Returns the remaining text with leading and trailing whitespace stripped.
    """
    cleaned = text
    for token in SPECIAL_TOKENS.values():
        cleaned = cleaned.replace(token, "")
    return cleaned.strip()

In [None]:
def generate_using_synQA(context, generator, answer_phrases, num_questions):
    """Generate one question per answer phrase with the synQA generator.

    At most ``num_questions`` phrases are used, taken from the front of
    ``answer_phrases``. Each phrase is combined with ``context`` into a model
    input, passed to ``generator.translate`` together with the module-level
    ``decode_params``, and the output is stripped of special tokens.

    Returns:
        ``(questions, answers)`` — two parallel lists. A question is a string,
        or a list of strings when ``translate`` returned several outputs.
    """
    questions = []
    answers = []
    limit = min(len(answer_phrases), num_questions)
    for phrase in (answer_phrases[k] for k in range(limit)):
        model_input = convert_example_to_input([phrase, context])
        raw = generator.translate([model_input], **decode_params)
        if isinstance(raw, str):
            question = clean_special_tokens(raw)
        else:
            question = [clean_special_tokens(item) for item in raw]
            # Unwrap a single-element result so callers get a plain string.
            if len(question) == 1:
                question = question[0]
        answers.append(phrase)
        questions.append(question)
    return questions, answers

In [None]:
def generate_using_t5(context, nlp):
    """Generate question/answer pairs for ``context`` with the T5 pipeline.

    Args:
        context: Paragraph text.
        nlp: Callable returning an iterable of dicts with the keys
            ``'question'`` and ``'answer'``.

    Returns:
        ``(questions, answers)`` — two parallel lists of strings. A leading
        ``<pad>`` tag (and the whitespace after it) is removed from an answer
        when present.
    """
    pad_tag = "<pad>"
    s_ques = []
    s_ans = []
    for pair in nlp(context):
        answer = pair['answer']
        # Strip the tag only when it is really there; an unconditional [6:]
        # slice would cut real characters off answers that lack it.
        if answer.startswith(pad_tag):
            answer = answer[len(pad_tag):].lstrip()
        s_ques.append(pair['question'])
        s_ans.append(answer)
    return s_ques, s_ans

In [None]:
def get_answer_start(para, answer):
    """Return the 1-based position of ``answer`` in ``para``, ignoring case.

    Both strings are lower-cased before searching. Returns ``0`` when the
    answer does not occur in the paragraph.
    """
    haystack, needle = para.lower(), answer.lower()
    position = haystack.find(needle)
    return position + 1

In [None]:
def generate_qa(input_data, spc, nlp, kw_model, kw_args, generator):
    """Build a DataFrame of synthetic question/answer rows for every paragraph.

    For each paragraph, ``spc`` is run first. When it reports no entities,
    answer phrases come from ``get_ans_phrases`` (configured by ``kw_args``)
    and questions from ``generate_using_synQA`` (at most 5 per paragraph);
    otherwise ``generate_using_t5`` produces both questions and answers.

    Args:
        input_data: Mapping of theme to a sequence of ``(id, paragraph)`` pairs.
        spc: Callable returning an object with an ``ents`` sequence.
        nlp: Question-generation pipeline passed to ``generate_using_t5``.
        kw_model: Keyword model passed to ``get_ans_phrases``.
        kw_args: Dict with ``'top_n'``, ``'use_maxsum'``, ``'diversity'`` and
            ``'nr_candidates'``.
        generator: Model passed to ``generate_using_synQA``.

    Returns:
        pandas.DataFrame with one row per generated question. Question ids
        start at 1 and increase across all themes and paragraphs.
    """
    columns = ['id', 'Theme', 'Paragraph', 'Question', 'Answer_possible', 'Answer_text', 'Answer_start']
    rows = []
    next_qid = 1     # Running id assigned to each generated question.
    for theme, paragraphs in input_data.items():
        for _para_id, para in paragraphs:
            doc = spc(para)
            if len(doc.ents) > 0:
                ques, ans = generate_using_t5(para, nlp)
            else:
                answer_phrases = get_ans_phrases(
                    kw_model,
                    para,
                    top_n=kw_args['top_n'],
                    use_maxsum=kw_args['use_maxsum'],
                    diversity=kw_args['diversity'],
                    nr_candidates=kw_args['nr_candidates'],
                )
                ques, ans = generate_using_synQA(para, generator, answer_phrases, num_questions=5)
            for question, answer in zip(ques, ans):
                start = get_answer_start(para, answer)
                rows.append([next_qid, theme, para, question, 'TRUE', [answer], [start]])
                next_qid += 1
    return pd.DataFrame(rows, columns=columns)

## Main

In [None]:
#@title Load all Models
# Keyword model used by get_ans_phrases to propose answer phrases.
kw_model = KeyBERT()

# The synQA model lives under the mounted Google Drive so it survives between
# sessions; download_and_extract_synQA skips the download when
# checkpoint_best.pt is already present in the model directory.
MODELS_DIR = "/content/gdrive/MyDrive/Colab Notebooks/synthetic_data/models"
MODEL_URL = {"generator_qa_squad_plus_adversarialqa.tgz": "https://dl.fbaipublicfiles.com/dynabench/qa/qgen_dcombined_plus_squad_10k.tgz"}
# Must equal the archive filename in MODEL_URL up to its first dot, because
# that is the directory name download_and_extract_synQA extracts into.
MODEL_NAME = 'generator_qa_squad_plus_adversarialqa'
MODEL_PATH = os.path.join(MODELS_DIR, MODEL_NAME)
download_and_extract_synQA(MODEL_URL, MODELS_DIR)

# Answer-aware question generator (fairseq), used by generate_using_synQA.
# NOTE(review): fp16=True presumably requests half precision and may need a
# GPU runtime — confirm.
generator = TransformerModel.from_pretrained(
    MODEL_PATH,
    checkpoint_file='checkpoint_best.pt',
    bpe='gpt2',
    fp16=True,
)

# Answer-agnostic T5 pipeline (used by generate_using_t5) and the spaCy model
# whose entity output decides which generator handles a paragraph.
nlp = pipeline("question-generation", model="valhalla/t5-small-qg-prepend", qg_format="prepend")
spc = spacy.load('en_core_web_sm')

In [None]:
#@title Set Hyperparameters for synQA Model

# Decoding options forwarded as keyword arguments to generator.translate().
beam = 10                   #@param {type:'number'}
# {type:"raw"} makes the form write the Python literal True/False; a plain
# dropdown is a string field and would write the always-truthy text "False".
do_sampling = True          #@param ["True", "False"] {type:"raw"}
sampling_topp = 0.9         #@param {type:'number'}

decode_params = {
    'beam': beam,
    'sampling': do_sampling,
    'sampling_topp': sampling_topp
}

# Tokens used to build model inputs (convert_example_to_input) and removed
# from model outputs (clean_special_tokens). The separator and end-of-sequence
# tokens are the same string.
SPECIAL_TOKENS = {
    'bos_token': '<s>',
    'eos_token': '</s>',
    'sep_token': '</s>'
}

In [None]:
#@title Subset Input Data

# Round-trip through JSON to get a list of plain-dict records, one per CSV row.
paragraphs = json.loads(pd.read_csv("paragraphs.csv").to_json(orient="records"))
theme_wise_para = test_to_theme_wise(paragraphs)

# Fixed seed so the same themes are selected on every run.
random.seed(10)
keys_available = len(theme_wise_para)

num_keys = 3       #@param {type:'number'}
num_paras = 3      #@param {type:'number'}

# random.sample needs a sequence: passing a dict keys view raises TypeError on
# Python 3.11+. Clamp the sample size so that requesting more themes than
# exist selects all of them instead of raising ValueError.
keys = random.sample(list(theme_wise_para), min(num_keys, keys_available))
# Keep only the first `num_paras` paragraphs of each selected theme.
input_data = {k: theme_wise_para[k][:num_paras] for k in keys}

In [None]:
#@title Set Hyperparameters for KeyBert Model

# Keyword-extraction options read by generate_qa and passed to get_ans_phrases.
kw_args = {}
kw_args['top_n'] = 10           #@param {type:'number'}
# {type:"raw"} keeps the value a Python bool if it is edited through the form;
# a plain dropdown is a string field and would write the truthy text "False".
kw_args['use_maxsum'] = True    #@param ["True", "False"] {type:"raw"}
kw_args['diversity'] = 0.2      #@param {type:'number'}
kw_args['nr_candidates'] = 20   #@param {type:'number'}

In [None]:
# Run the full pipeline on the selected subset; the result is a DataFrame with
# one row per generated question.
qa_output = generate_qa(input_data, spc, nlp, kw_model, kw_args, generator)

In [None]:
# Persist the generated data set to Google Drive (without the index column).
# NOTE(review): the file name "synthetic_datat.csv" looks like a typo for
# "synthetic_data.csv" — confirm nothing reads the current name before renaming.
qa_output.to_csv("/content/gdrive/MyDrive/Colab Notebooks/synthetic_data/synthetic_datat.csv", index=False)