# Package Imports for Question Generation Models

In [None]:
!pip install transformers
!pip install fairseq

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m89.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0
Looking in indexes: https://pypi.org/simple, https://us

In [None]:
import requests
import tarfile
from tqdm import tqdm
import argparse
import logging
import os
import time
import warnings
import csv
from fairseq.models.transformer import TransformerModel



In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Download Question Generation Model

Reference: [Github](https://github.com/maxbartolo/synQA-question-generators), [Paper](https://www.maxbartolo.com/publication/2021-improving-robustness/).

In [None]:
# Directory under the mounted Google Drive where model archives are downloaded and unpacked.
MODELS_DIR = "/content/gdrive/MyDrive/Colab Notebooks/synthetic_data/models"
# Maps the local archive filename to the URL it is downloaded from.
MODEL_URL = {"generator_qa_squad_plus_adversarialqa.tgz": "https://dl.fbaipublicfiles.com/dynabench/qa/qgen_dcombined_plus_squad_10k.tgz"}

In [None]:
def download(url: str, fname: str, desc: str = None) -> None:
    """Stream ``url`` into the local file ``fname`` while showing a progress bar.

    Args:
        url: Address of the resource to fetch.
        fname: Path of the file to write; it is opened in binary write mode.
        desc: Label shown on the progress bar. Defaults to ``fname``.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
    """
    desc = desc if desc is not None else fname
    # Use the response as a context manager so the connection is always released.
    with requests.get(url, stream=True) as resp:
        # Fail before creating the file: otherwise an HTML error page would be
        # saved under `fname` and later mistaken for a complete download.
        resp.raise_for_status()
        # The header may be missing; tqdm then shows progress without a known total.
        total = int(resp.headers.get("content-length", 0))
        with open(fname, "wb") as file, tqdm(
            desc=desc, total=total, unit="iB", unit_scale=True, unit_divisor=1024
        ) as bar:
            for data in resp.iter_content(chunk_size=1024):
                size = file.write(data)
                bar.update(size)

In [None]:
def is_within_directory(directory, target):
    """Return True when ``target`` resolves to ``directory`` or a path below it."""
    abs_directory = os.path.abspath(directory)
    abs_target = os.path.abspath(target)
    try:
        # commonpath compares whole path components. commonprefix compares
        # characters and would accept "/models/foo_evil" as inside "/models/foo".
        common = os.path.commonpath([abs_directory, abs_target])
    except ValueError:
        # Raised when the paths cannot share a root (e.g. different drives).
        return False
    return common == abs_directory


def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
    """Extract ``members`` of ``tar`` into ``path`` after a path-traversal check.

    Every member of the archive is checked, not only the ones being extracted.

    Raises:
        Exception: If any member would be written outside ``path``.
    """
    for member in tar.getmembers():
        member_path = os.path.join(path, member.name)
        if not is_within_directory(path, member_path):
            raise Exception("Attempted Path Traversal in Tar File")
    tar.extractall(path, members, numeric_owner=numeric_owner)


# The archive is written directly into MODELS_DIR, so it has to exist first.
os.makedirs(MODELS_DIR, exist_ok=True)

for model_filename, url in MODEL_URL.items():
    model_name = model_filename.split(".")[0]
    model_tarfile_path = os.path.join(MODELS_DIR, model_filename)
    model_dir = os.path.join(MODELS_DIR, model_name)

    # An extracted checkpoint means this model was fully processed on an earlier run.
    if os.path.exists(os.path.join(model_dir, "checkpoint_best.pt")):
        logging.info(f"Skipping {model_name} as this model is already downloaded.")
        continue

    if not os.path.exists(model_tarfile_path):
        download(url, model_tarfile_path, url)
    else:
        logging.info(
            f"Skipping download. The file {model_tarfile_path} already exists."
        )

    # Extract
    logging.info(f"Extracting {model_filename} to {model_dir}")
    with tarfile.open(model_tarfile_path) as f:
        # Get only the members with extensions (i.e. no directories)
        members = [
            m
            for m in f.getmembers()
            if os.path.splitext(os.path.join(model_dir, m.name))[-1]
        ]
        # Flatten (i.e. remove directory info) so files land directly in model_dir
        for m in members:
            m.name = os.path.basename(m.name)
        safe_extract(f, model_dir, members=members)

    # Remove tarfile
    logging.info(f"Deleting {model_tarfile_path}")
    os.remove(model_tarfile_path)
    logging.info(f"Processing {model_filename} complete")

# Load the Question Generation Model

In [None]:
# Name of the model directory inside MODELS_DIR.
MODEL_NAME = 'generator_qa_squad_plus_adversarialqa'
MODEL_PATH = os.path.join(MODELS_DIR, MODEL_NAME)

# Special tokens used when building generator inputs and stripped from its outputs.
# Note that the separator and end-of-sequence tokens are the same string.
SPECIAL_TOKENS = {
    'bos_token': '<s>',
    'eos_token': '</s>',
    'sep_token': '</s>'
}

In [None]:
def convert_example_to_input(example):
    """Serialise an example into the generator's input string.

    The items of ``example`` are joined with the separator token (one space on
    each side), and the result is wrapped in the begin- and end-of-sequence
    tokens, each set off by a single space.
    """
    separator = SPECIAL_TOKENS['sep_token']
    body = (" " + separator + " ").join(example)
    start = SPECIAL_TOKENS['bos_token']
    end = SPECIAL_TOKENS['eos_token']
    return " ".join((start, body, end))

In [None]:
def clean_special_tokens(text):
    """Remove every special-token string from ``text`` and trim outer whitespace."""
    cleaned = text
    for token in SPECIAL_TOKENS.values():
        cleaned = cleaned.replace(token, "")
    return cleaned.strip()

In [None]:
# Load the model
# Reads `checkpoint_best.pt` from MODEL_PATH, the same file the download cell checks for.
# NOTE(review): fp16=True presumably expects a GPU runtime — confirm before running on CPU.
generator = TransformerModel.from_pretrained(
    MODEL_PATH,
    checkpoint_file='checkpoint_best.pt',
    bpe='gpt2',
    fp16=True,
)

1042301B [00:00, 1812352.31B/s]
456318B [00:00, 962397.09B/s]


# Generate Answers (KeyPhrases) from Context

Themes Chosen for Comparison: Premier_League, Adolescence, Frédéric_Chopin, Modern_History, Beyoncé

In [None]:
contexts = [
    "The competition formed as the FA Premier League on 20 February 1992 following the decision of clubs in the Football League First Division to break away from the Football League, which was originally founded in 1888, and take advantage of a lucrative television rights deal. The deal was worth £1 billion a year domestically as of 2013–14, with BSkyB and BT Group securing the domestic rights to broadcast 116 and 38 games respectively. The league generates €2.2 billion per year in domestic and international television rights. In 2014/15, teams were apportioned revenues of £1.6 billion.",
    "Puberty occurs through a long process and begins with a surge in hormone production, which in turn causes a number of physical changes. It is the stage of life characterized by the appearance and development of secondary sex characteristics (for example, a deeper voice and larger adam's apple in boys, and development of breasts and more curved and prominent hips in girls) and a strong shift in hormonal balance towards an adult state. This is triggered by the pituitary gland, which secretes a surge of hormonal agents into the blood stream, initiating a chain reaction to occur. The male and female gonads are subsequently activated, which puts them into a state of rapid growth and development; the triggered gonads now commence the mass production of the necessary chemicals. The testes primarily release testosterone, and the ovaries predominantly dispense estrogen. The production of these hormones increases gradually until sexual maturation is met. Some boys may develop gynecomastia due to an imbalance of sex hormones, tissue responsiveness or obesity.",
    "Frédéric François Chopin (/\ˈʃoʊpæn/; French pronunciation: [fʁe.de.ʁik fʁɑ̃.swa ʃɔ.pɛ̃]; 22 February or 1 March 1810 – 17 October 1849), born Fryderyk Franciszek Chopin,[n 1] was a Polish and French (by citizenship and birth of father) composer and a virtuoso pianist of the Romantic era, who wrote primarily for the solo piano. He gained and has maintained renown worldwide as one of the leading musicians of his era, whose \"poetic genius was based on a professional technique that was without equal in his generation.\" Chopin was born in what was then the Duchy of Warsaw, and grew up in Warsaw, which after 1815 became part of Congress Poland. A child prodigy, he completed his musical education and composed his earlier works in Warsaw before leaving Poland at the age of 20, less than a month before the outbreak of the November 1830 Uprising.",
    "In the Pre-Modern era, many people\'s sense of self and purpose was often expressed via a faith in some form of deity, be that in a single God or in many gods. Pre-modern cultures have not been thought of creating a sense of distinct individuality, though. Religious officials, who often held positions of power, were the spiritual intermediaries to the common person. It was only through these intermediaries that the general masses had access to the divine. Tradition was sacred to ancient cultures and was unchanging and the social order of ceremony and morals in a culture could be strictly enforced.",
    "Beyoncé\'s first solo recording was a feature on Jay Z\'s \"\'03 Bonnie & Clyde\" that was released in October 2002, peaking at number four on the U.S. Billboard Hot 100 chart. Her first solo album Dangerously in Love was released on June 24, 2003, after Michelle Williams and Kelly Rowland had released their solo efforts. The album sold 317,000 copies in its first week, debuted atop the Billboard 200, and has since sold 11 million copies worldwide. The album\'s lead single, \"Crazy in Love\", featuring Jay Z, became Beyoncé\'s first number-one single as a solo artist in the US. The single \"Baby Boy\" also reached number one, and singles, \"Me, Myself and I\" and \"Naughty Girl\", both reached the top-five. The album earned Beyoncé a then record-tying five awards at the 46th Annual Grammy Awards; Best Contemporary R&B Album, Best Female R&B Vocal Performance for \"Dangerously in Love 2\", Best R&B Song and Best Rap/Sung Collaboration for \"Crazy in Love\", and Best R&B Performance by a Duo or Group with Vocals for \"The Closer I Get to You\" with Luther Vandross.",
]
answer_phrases = {}

## Spacy with PyTextRank

Reference: [Spacy](https://spacy.io/models/en), [PyTextRank](https://spacy.io/universe/project/spacy-pytextrank)

Phrases generated by the manual noun-chunking fallback are not ranked.
* "en_core_web_sm" model has tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.
* Fails to detect any phrases when there aren't any nouns, dates or important topics. To counter this in such special cases, we perform manual noun chunking and retrieve special tags with their subtrees.
* PyTextRank helps ranking for getting most relevant answers.

In [None]:
!pip install pytextrank
import spacy
from spacy import displacy
from spacy.symbols import *
import spacy.cli
import pytextrank
spacy.cli.download("en_core_web_sm")
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("textrank")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytextrank
  Downloading pytextrank-3.2.4-py3-none-any.whl (30 kB)
Collecting graphviz>=0.13
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.0/47.0 KB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting icecream>=2.1
  Downloading icecream-2.1.3-py2.py3-none-any.whl (8.4 kB)
Collecting pygments>=2.7.4
  Downloading Pygments-2.14.0-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting asttokens>=2.0.1
  Downloading asttokens-2.2.1-py2.py3-none-any.whl (26 kB)
Collecting executing>=0.3.1
  Downloading executing-1.2.0-py2.py3-none-any.whl (24 kB)
Collecting scipy>=1.7
  Downloading scipy-1.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

<pytextrank.base.BaseTextRankFactory at 0x7fce183eefd0>

In [None]:
spacy_phrases = []
for context in contexts:
    out = nlp(context)
    if out.ents:
        # Entities were found: keep every TextRank phrase with a non-zero rank.
        phrases = [phrase.text for phrase in out._.phrases if phrase.rank != 0]
    else:
        # No entities: fall back to manual noun chunking over selected
        # dependency labels (the names come from spacy.symbols).
        np_labels = {nsubj, nsubjpass, dobj, nn, pobj}
        phrases = []
        for word in out:
            if word.dep not in np_labels:
                continue
            subtree = list(word.subtree)
            if len(subtree) != 1:
                # Multi-token subtree: keep it whole as one phrase.
                phrases.append(" ".join(t.text for t in subtree))
            elif subtree[0].tag_ in ('NNS', 'NN'):
                # A lone token is kept only when it is tagged NN or NNS.
                phrases.append(subtree[0].text)
    spacy_phrases.append(phrases)
answer_phrases["spaCy"] = spacy_phrases
spacy_phrases

[['a lucrative television rights deal',
  'domestic and international television rights',
  'BT Group',
  'the Football League First Division',
  'BSkyB',
  'year',
  'the domestic rights',
  'advantage',
  'the Football League',
  'clubs',
  'the FA Premier League',
  'revenues',
  'The deal',
  'teams',
  '116 and 38 games',
  'the decision',
  '20 February',
  '20 February 1992',
  'The competition',
  'The league'],
 ['Puberty',
  'a long process',
  'a surge in hormone production , which in turn causes a number of physical changes',
  'hormone production , which in turn causes a number of physical changes',
  'turn',
  'a number of physical changes',
  'physical changes',
  'life characterized by the appearance and development of secondary sex characteristics',
  'the appearance and development of secondary sex characteristics',
  'secondary sex characteristics',
  'example',
  'boys',
  'breasts',
  'girls',
  'hormonal balance',
  'an adult state',
  'the pituitary gland , which

## TextBlob

Reference: [Main Webpage](https://textblob.readthedocs.io/en/dev/)

* NLTK based slower alternative to Spacy. Provides similar features.
* Majority of top-ranked phrases are 2-3 n-grams long.
* Ranking is better than Spacy with respect to the theme.

In [None]:
!pip install nltk
import nltk
nltk.download('brown')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from textblob import TextBlob

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# One list of noun phrases per context, in the same order as `contexts`.
textblob_phrases = [TextBlob(context).noun_phrases for context in contexts]
answer_phrases["TextBlob"] = textblob_phrases
textblob_phrases

[WordList(['fa', 'premier league', 'february', 'football league', 'football league', 'lucrative television rights', 'worth £1', 'bskyb', 'bt', 'domestic rights', 'league generates €2.2', 'international television rights']),
 WordList(['puberty', 'long process', 'hormone production', 'turn causes', 'physical changes', 'secondary sex characteristics', "adam 's apple", 'prominent hips', 'hormonal balance', 'adult state', 'pituitary gland', 'hormonal agents', 'blood stream', 'chain reaction', 'female gonads', 'rapid growth', 'mass production', 'necessary chemicals', 'release testosterone', 'dispense estrogen', 'hormones increases', 'sexual maturation', 'sex hormones', 'tissue responsiveness']),
 WordList(['frédéric françois chopin', 'french pronunciation', '[ fʁe.de.ʁik fʁɑ̃.swa ʃɔ.pɛ̃ ]', 'february', 'march', 'october', 'fryderyk franciszek chopin', '[ n', 'polish', 'virtuoso pianist', 'romantic era', 'solo piano', 'renown worldwide', 'poetic genius', 'professional technique', 'chopin', '

## Rake with NLTK

Reference: [Github](https://github.com/csurfer/rake-nltk)

* Extremely fast NER, mostly based on word frequency (TF alone).
* Speed is at the cost of poor grammatical phrases and random segmentation in phrase creation.
* Questions generated are usually direct, i.e. mostly based on the longest common subsequence.

In [None]:
!pip install rake-nltk
!pip install nltk
from rake_nltk import Rake
import nltk
nltk.download('stopwords')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.6
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
r = Rake()
rake_phrases = []
for context in contexts:
    # The extractor keeps the keywords of the last text it was given,
    # so extract first and read the ranked phrases right after.
    r.extract_keywords_from_text(context)
    rake_phrases.append(r.get_ranked_phrases())
answer_phrases["Rake"] = rake_phrases
rake_phrases

[['20 february 1992 following',
  'league generates € 2',
  '2 billion per year',
  'football league first division',
  'worth £ 1 billion',
  'lucrative television rights deal',
  'international television rights',
  'fa premier league',
  'bt group securing',
  '38 games respectively',
  '2013 – 14',
  'football league',
  '£ 1',
  '6 billion',
  'year domestically',
  'domestic rights',
  'take advantage',
  'originally founded',
  'competition formed',
  'broadcast 116',
  'break away',
  'apportioned revenues',
  'deal',
  'domestic',
  'teams',
  'decision',
  'clubs',
  'bskyb',
  '2014',
  '1888',
  '15'],
 ['boys may develop gynecomastia due',
  'testes primarily release testosterone',
  'ovaries predominantly dispense estrogen',
  'secondary sex characteristics',
  'hormones increases gradually',
  'hormonal balance towards',
  'sex hormones',
  'hormonal agents',
  'turn causes',
  'tissue responsiveness',
  'subsequently activated',
  'strong shift',
  'sexual maturation',


## Yake

Reference: [Github](https://github.com/LIAAD/yake)

* Improved version of RAKE. More grammatically and semantically correct phrases are produced.
* Some important phrases where better quality questions are generated are ranked lower.
* Takes longer than Rake, but still negligible.

In [None]:
!pip install yake
import yake

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting yake
  Downloading yake-0.4.8-py2.py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 KB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting segtok
  Downloading segtok-1.5.11-py3-none-any.whl (24 kB)
Collecting jellyfish
  Downloading jellyfish-0.9.0.tar.gz (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 KB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: jellyfish
  Building wheel for jellyfish (setup.py) ... [?25l[?25hdone
  Created wheel for jellyfish: filename=jellyfish-0.9.0-cp38-cp38-linux_x86_64.whl size=77915 sha256=954da0b2e609fc94a34aa5f1582ca872e2fe259be72e02149052481a3391606e
  Stored in directory: /root/.cache/pip/wheels/f1/c7/3c/4c83132de76359e3a429fd09c08995945ca96c5290a41651d3
Success

In [None]:
# Extractor configuration
language = "en"
max_ngram_size = 5
deduplication_threshold = 0.2
deduplication_algo = 'seqm'
windowSize = 1
numOfKeywords = 100
custom_kw_extractor = yake.KeywordExtractor(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_threshold,
    dedupFunc=deduplication_algo,
    windowsSize=windowSize,
    top=numOfKeywords,
    features=None,
)

# Keep only the first element (the keyword text) of each entry returned for a context.
# Iterating directly avoids the previous inner loop that reused, and
# overwrote, the outer loop index `i`.
yake_phrases = [
    [entry[0] for entry in custom_kw_extractor.extract_keywords(context)]
    for context in contexts
]
answer_phrases.update({"Yake": yake_phrases})
yake_phrases

[['clubs in the Football',
  'Football League First Division',
  'League',
  'Premier',
  'billion',
  'BSkyB and BT Group securing',
  'originally',
  'founded',
  'domestic',
  'Group',
  'teams were apportioned revenues',
  'BSkyB',
  'advantage of a lucrative'],
 ['production of the necessary chemicals',
  'long process',
  'Puberty occurs through a long',
  'deeper voice and larger adam',
  'surge',
  'begins',
  'physical',
  'adult state',
  'chain reaction to occur',
  'sex',
  'life',
  'gland',
  'ovaries predominantly',
  'female',
  'rapid',
  'imbalance of sex'],
 ['composer and a virtuoso',
  'Fryderyk Franciszek Chopin',
  'swa ʃɔ.pɛ',
  'birth of father',
  'March',
  'February',
  'Polish',
  'maintained renown worldwide',
  'ˈʃoʊpæn',
  'fʁɑ',
  'piano',
  'professional technique',
  'November',
  'gained'],
 ['God or in many gods',
  'Pre-Modern era',
  'God',
  'creating a sense of distinct',
  'deity',
  'single',
  'cultures',
  'thought',
  'access',
  'social'],

## PKE Supervised Model Kea

Reference: [Github](https://github.com/boudinfl/pke)

* PKE provides implementations of many unsupervised models and also a supervised NER model. Uses Spacy.
* Highly customizable pipeline with many ranking algorithms implemented: Multipartite rank, topic rank, position rank.
* Top ranked phrases usually tend to be shorter even if n-grams and window size are kept large, which result in generation of direct questions (longest common subsequence).

In [None]:
!pip install git+https://github.com/boudinfl/pke.git
import pke

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/boudinfl/pke.git
  Cloning https://github.com/boudinfl/pke.git to /tmp/pip-req-build-7a1zejom
  Running command git clone --filter=blob:none --quiet https://github.com/boudinfl/pke.git /tmp/pip-req-build-7a1zejom
  Resolved https://github.com/boudinfl/pke.git to commit 8f1d05dcc52041c9920ba0f9d5231fe6086d12c4
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.9/235.9 KB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: pke
  Building wheel for pke (setup.py) ... [?25l[?25hdone
  Created wheel for pke: filename=pke-2.0.0-py3-none-any.whl size=6160288 sha256=f5dcaec60d73e83d1781ced23d23ecc3fa2c905b222cc373e0daa5bd8c51b0ab
  Stored in directory: /tmp/pip-ephem-wheel-cache-fle_pgz1

In [None]:
# Candidate pattern: optional adjectives followed by one or more (proper) nouns.
grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"
extractor = pke.supervised.Kea()

pke_kea_phrases = []
for context in contexts:
    extractor.load_document(input=context, language='en')
    extractor.grammar_selection(grammar=grammar)
    extractor.candidate_weighting()
    # NOTE(review): stemming=True yields stemmed phrases (the recorded output
    # shows e.g. 'fa premier leagu'), which are not literal spans of the
    # context — confirm this is intended for answer phrases.
    keyphrases = extractor.get_n_best(n=20, stemming=True)
    # Keep only the first element (the phrase text) of each entry. Iterating
    # directly avoids the previous inner loop that reused the outer index `i`.
    pke_kea_phrases.append([keyphrase[0] for keyphrase in keyphrases])
answer_phrases.update({"PKE": pke_kea_phrases})
pke_kea_phrases

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


[['fa premier leagu',
  'footbal leagu first divis',
  'lucr televis right deal',
  'februari',
  'bskyb',
  'bt group',
  'domest right',
  'club',
  'intern televis right',
  'footbal leagu',
  'leagu',
  'year',
  'competit',
  'revenu',
  'team',
  'decis',
  'advantag',
  'game',
  'deal'],
 ['boy',
  'surg',
  'puberti',
  'long process',
  'hormon product',
  'physic chang',
  'secondari sex characterist',
  'deeper voic',
  'larger adam',
  'breast',
  'promin hip',
  'girl',
  'strong shift',
  'hormon balanc',
  'adult state',
  'pituitari gland',
  'hormon agent',
  'blood stream',
  'chain reaction',
  'femal gonad'],
 ['warsaw',
  'frédéric françoi chopin',
  '/\\ˈʃoʊpæn/',
  'french pronunci',
  'ʃɔ.pɛ̃',
  'fryderyk franciszek chopin,[n',
  'februari',
  'father',
  'virtuoso pianist',
  'romant era',
  'march',
  'solo piano',
  'citizenship',
  'birth',
  'musician',
  'poetic geniu',
  'french',
  'octob',
  'profession techniqu',
  'chopin'],
 ['pre-modern era',
  'd

## Stanford 7 Class NER Tagger

Reference: [Main Webpage](https://nlp.stanford.edu/software/CRF-NER.shtml)

* Stanford provides 3-Class, 5-Class and 7-Class NER methodologies.
* Only Keywords are detected and not Keyphrases.
* Many times, in case the paragraph contains only "dobj", "pobj" or "psubj" tags, very few or no entities are detected.
* No ranking methodologies are available in the package.

In [None]:
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize import word_tokenize
import nltk

!wget 'https://nlp.stanford.edu/software/stanford-ner-4.2.0.zip'
!unzip stanford-ner-4.2.0.zip

nltk.download('punkt')

--2023-01-28 12:16:50--  https://nlp.stanford.edu/software/stanford-ner-4.2.0.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://downloads.cs.stanford.edu/nlp/software/stanford-ner-4.2.0.zip [following]
--2023-01-28 12:16:50--  https://downloads.cs.stanford.edu/nlp/software/stanford-ner-4.2.0.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 180437064 (172M) [application/zip]
Saving to: ‘stanford-ner-4.2.0.zip’


2023-01-28 12:17:21 (5.73 MB/s) - ‘stanford-ner-4.2.0.zip’ saved [180437064/180437064]

Archive:  stanford-ner-4.2.0.zip
   creating: stanford-ner-2020-11-17/
   creating: stanford-ner-2020-11-17/lib/
  inflating: stanford-ner-

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Paths are relative to the archive unpacked by the previous cell.
jar = 'stanford-ner-2020-11-17/stanford-ner-4.2.0.jar'
ner_tagger_7class = StanfordNERTagger('stanford-ner-2020-11-17/classifiers/english.muc.7class.distsim.crf.ser.gz', jar, encoding = 'utf8')

stanford_phrases = []
for context in contexts:
    tokenized_text = word_tokenize(context)
    classified_text = ner_tagger_7class.tag(tokenized_text)
    # Each tagged item is indexed as (token, label); 'O' marks a token outside
    # any entity. Iterating with a dedicated name avoids the previous inner
    # loop that rebound the outer index `i` to a tuple.
    stanford_phrases.append(
        [tagged[0] for tagged in classified_text if tagged[1] != 'O']
    )
answer_phrases.update({"Stanford": stanford_phrases})
stanford_phrases

[['FA',
  'Premier',
  'League',
  'February',
  '1992',
  'Football',
  'League',
  'First',
  'Division',
  'Football',
  'League',
  '1888',
  '£1',
  'billion',
  'BSkyB',
  'BT',
  'Group',
  '201415'],
 ['gynecomastia'],
 ['February',
  'March',
  '1810',
  'October',
  '1849',
  'Franciszek',
  'Chopin',
  'Warsaw',
  'Warsaw',
  '1815',
  'Congress',
  'Poland',
  'Warsaw',
  'Poland',
  'November',
  '1830'],
 [],
 ['Jay',
  'Z',
  'October',
  '2002',
  'U.S',
  'June',
  '24',
  ',',
  '2003',
  'Michelle',
  'Williams',
  'Kelly',
  'Rowland',
  'Jay',
  'Z',
  'Beyoncé',
  'US',
  'Beyoncé',
  '46th',
  'Annual',
  'Grammy',
  'Awards',
  ';',
  'Best',
  'Contemporary',
  'R',
  '&',
  'B',
  'Album',
  'Best',
  'Female',
  'R',
  '&',
  'B',
  'Vocal',
  'Performance',
  'Best',
  'R',
  '&',
  'B',
  'Song',
  'Best',
  'RapSung',
  'Collaboration',
  'Best',
  'R',
  '&',
  'B',
  'Performance',
  'Luther',
  'Vandross']]

## KeyBert with KeyPhrase-Vectorizers

Reference: [KeyBert Webpage](https://maartengr.github.io/KeyBERT/), [KeyPhrase-Vectorizers](https://github.com/TimSchopf/KeyphraseVectorizers)

* Uses "attention"-based miniLM Model for encoding and accurate detection of phrases and cosine similarity to get the relevant phrases.
* KeyBERT's default vectorizer can be replaced with KeyPhrase vectorizer which greatly improves the phrase detection.
* The phrases are ranked and can be diversified using algorithms like Maximum Marginal Relevance and Max Sum Distance.
* N-grams tend to be shorter, but they still yield really good adversarial questions compared to the other extractors.
* Highly customizable.
* Takes significantly longer to generate the keyphrases as it relies on heavy BERT models.

In [None]:
!pip install keyphrase-vectorizers
!pip install keybert
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
kw_model = KeyBERT()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keyphrase-vectorizers
  Downloading keyphrase_vectorizers-0.0.11-py3-none-any.whl (29 kB)
Collecting spacy-transformers>=1.1.6
  Downloading spacy_transformers-1.2.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (193 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.5/193.5 KB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting psutil>=5.8.0
  Downloading psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (280 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.2/280.2 KB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m52.0 MB/s[0m eta [

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keybert
  Downloading keybert-0.7.0.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.3.8
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rich>=10.4.0
  Downloading rich-13.3.1-py3-none-any.whl (239 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.0/239.0 KB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting markdown-it-py<3.0.0,>=2.1.0
  Downloading markdown_it_py-2.1.0-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.5/84.5 KB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
keyphr = kw_model.extract_keywords(
    docs=contexts,
    vectorizer=KeyphraseCountVectorizer(),
    top_n=20,
    use_maxsum=True,
    diversity=0.2,
    nr_candidates=20,
)
# `keyphr` holds one list per context; keep the first element (the phrase text) of each entry.
keybert_phrases = [
    [entry[0] for entry in doc_keyphrases]
    for doc_keyphrases in keyphr
]
answer_phrases["KeyBert"] = keybert_phrases
keybert_phrases

[['february',
  'decision',
  'bskyb',
  'group',
  'deal',
  'games',
  'domestic rights',
  'advantage',
  'clubs',
  'bt group',
  'year',
  'competition',
  'teams',
  'revenues',
  'league',
  'international television rights',
  'football league',
  'football league first division',
  'lucrative television rights deal',
  'fa premier league'],
 ['girls',
  'larger adam',
  'boys',
  'rapid growth',
  'breasts',
  'pituitary gland',
  'gynecomastia',
  'ovaries',
  'hormonal agents',
  'secondary sex characteristics',
  'hormonal balance',
  'gonads',
  'estrogen',
  'sexual maturation',
  'testosterone',
  'female gonads',
  'puberty',
  'hormones',
  'sex hormones',
  'hormone production'],
 ['generation',
  'romantic era',
  'father',
  'age',
  'duchy',
  'birth',
  'french',
  'french pronunciation',
  'musical education',
  'poetic genius',
  'musicians',
  'child prodigy',
  'congress poland',
  'solo piano',
  'poland',
  'composer',
  'warsaw',
  'virtuoso pianist',
  'ch

# Generate Questions for the Extracted Key Phrases

Hyperparameter tuning is required for question generation; the decoding hyperparameters are set in `decode_params` below.

In [None]:
# Decoding options forwarded as keyword arguments to `generator.translate`.
decode_params = dict(
    beam=10,
    sampling=True,
    sampling_topp=0.9,
)

In [None]:
# Upper bound on how many answer phrases per context get a generated question.
MAX_ANSWERS_PER_CONTEXT = 25

# Maps each answer-extraction model name to a list of
# (context index, generated question, answer phrase) tuples.
outputs = {}
for model in answer_phrases:
    model_outputs = []
    for i in range(len(contexts)):
        # Answers are not checked for membership in the context: some extractors
        # (e.g. the lower-cased KeyBert phrases) would not match verbatim.
        for answer in answer_phrases[model][i][:MAX_ANSWERS_PER_CONTEXT]:
            ex_inputs = [convert_example_to_input([answer, contexts[i]])]
            t_0 = time.time()
            output = generator.translate(ex_inputs, **decode_params)
            if isinstance(output, str):
                clean_output = clean_special_tokens(output)
            else:
                clean_output = [clean_special_tokens(q) for q in output]
                # Unwrap single-element results so one question is stored as a string.
                if len(clean_output) == 1:
                    clean_output = clean_output[0]
            model_outputs.append((i, clean_output, answer))
            print(f"Question: {clean_output} | Answer: {answer} | Time: {time.time() - t_0:.1f}s")
    # Store inside the model loop; doing this after the loop would keep only
    # the results of the last model iterated.
    outputs[model] = model_outputs

Question: What was the reason of the break up? | Answer: a lucrative television rights deal | Time: 11.6s
Question: What did BSkyB and BT Group get? | Answer: domestic and international television rights | Time: 14.0s
Question: Who owned the domestic rights? | Answer: BT Group | Time: 10.3s
Question: The FA Premier League was originally? | Answer: the Football League First Division | Time: 8.7s
Question: Of BSkyB and BT Group, which one was the first deal to take place? | Answer: BSkyB | Time: 11.8s
Question: In which years did BSkyB and BT Group earn rights to broadcast the FA Premier League? | Answer: year | Time: 18.0s
Question: Which of the following is not part of the FA Premier League: local rights or domestic rights? | Answer: the domestic rights | Time: 13.5s
Question: Why did clubs break away from the Football League First Division? | Answer: advantage | Time: 13.2s
Question: What was the Premier League originally named? | Answer: the Football League | Time: 9.6s
Question: The

In [None]:
# Display the collected results: for each stored model name, a list of
# (context index, generated question, answer phrase) tuples.
outputs

{'KeyBert': [(0, 'When was the FA Premier League founded?', 'february'),
  (0,
   'What caused teams to break away from the Football League in 1992?',
   'decision'),
  (0, 'What was the first organization mentioned in the passage?', 'bskyb'),
  (0, 'What does the G in "FA Premier League" stand for?', 'group'),
  (0, 'what is the last word in the passage?', 'deal'),
  (0, 'What kind of league is the FA Premier League?', 'games'),
  (0, 'What rights were being broadcast?', 'domestic rights'),
  (0, 'What did clubs want?', 'advantage'),
  (0, 'what group is mentioned last?', 'clubs'),
  (0,
   'Which group secured the domestic rights to broadcast 116 and 38 games respectively?',
   'bt group'),
  (0, "When is the FA Premier League's deal valued at?", 'year'),
  (0, 'The FA Premier League is a?', 'competition'),
  (0, 'Which is not a last name, BSkyB or Teams?', 'teams'),
  (0, 'What did the competition make?', 'revenues'),
  (0, 'Who runs the FA Premier League?', 'league'),
  (0,
   'Wha

In [None]:
import json

# Serialise the generated questions first, then write them to `data.json`.
# ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file.
serialized_outputs = json.dumps(outputs, ensure_ascii=False, indent=4)
with open('data.json', 'w', encoding='utf-8') as f:
    f.write(serialized_outputs)