# Question Generation Using T5

Implements a Synthetic Question Generation Pipeline based on the `valhalla/t5-small-qg-prepend` model.

# Initialize Setup

In [None]:
!pip install -U transformers==3.0.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!python -m nltk.downloader punkt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
!git clone https://github.com/patil-suraj/question_generation.git

fatal: destination path 'question_generation' already exists and is not an empty directory.


# Train Data Loading

In [None]:
import csv
import requests

# load training dataset
def load_data():
    """Download the training CSV, print a short summary, and return its rows.

    Returns:
        list[list[str]]: every parsed CSV row. Row 0 is the header row
        (callers such as ``load_theme_wise_data`` skip it with ``[1:]``).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond in time.
    """
    import io  # local import: only needed for CSV parsing below

    CSV_URL = 'https://drive.google.com/u/0/uc?id=1Z-yb752A3o7b9dqrGt24XU0sl53FVqya&export=download'

    with requests.Session() as s:
        download = s.get(CSV_URL, timeout=60)
        # Fail loudly rather than parsing an HTML error page as CSV.
        download.raise_for_status()
        decoded_content = download.content.decode('utf-8')

    # Parse from a file-like object opened with newline='' so the csv module
    # handles line endings itself; str.splitlines() would also split on
    # characters such as \x0b, \x0c, \x85 or \u2028 inside paragraph text.
    cr = csv.reader(io.StringIO(decoded_content, newline=''), delimiter=',')
    train_data = list(cr)

    # Row 0 is the header, so it must not be counted as an example.
    records = train_data[1:]
    noans = sum(1 for x in records if x[4] == 'False')
    ans = len(records) - noans

    print(f"Number of examples = {len(records)}")
    print(f"\tAnswerable questions = {ans}")
    print(f"\tNon-Answerable questions = {noans}\n")
    print("Examples:")
    for i in [0, 1000, 1300]:
        # Guard the sample indices so a smaller file does not raise IndexError.
        if i >= len(train_data):
            continue
        print(' | '.join(train_data[i][:2]), ' | ', train_data[i][2][:20] + '...', ' | ', ' | '.join(train_data[i][3:]))
    return train_data

In [None]:
def load_theme_wise_data(train_data):
    """Regroup the training rows by theme.

    Args:
        train_data: list of CSV rows; row 0 is treated as a header and
            skipped. Each remaining row is indexed as
            ``[_, theme, paragraph, question, *answer_fields]``.

    Returns:
        dict: ``theme -> {'para': [...], 'ques': [...], 'ans': [...]}`` where
        ``para`` holds each distinct paragraph once (first-seen order),
        ``ques`` holds every question, and each ``ans`` entry is
        ``[para_index, *answer_fields]`` with ``para_index`` pointing into
        that theme's ``para`` list.
    """
    theme_wise_data = {}
    # theme -> {paragraph text -> index in that theme's 'para' list}.
    # Replaces the per-row `in list` / `list.index` scans with O(1) lookups.
    para_positions = {}
    for x in train_data[1:]:
        theme, para = x[1], x[2]
        if theme not in theme_wise_data:
            theme_wise_data[theme] = {
                'para': [],
                'ques': [],
                'ans': []
            }
            para_positions[theme] = {}
        entry = theme_wise_data[theme]
        positions = para_positions[theme]
        if para not in positions:
            positions[para] = len(entry['para'])
            entry['para'].append(para)
        entry['ques'].append(x[3])
        # ans contains a list -> [Para_Number, Answer_possible, Answer_text, Answer_start]
        entry['ans'].append([positions[para]] + x[4:])
    print(f'\nTotal {len(theme_wise_data)} themes present.')
    return theme_wise_data

In [None]:
# Download the training rows, then regroup them by theme; both helpers print a short summary.
train_data = load_data()
theme_wise_data = load_theme_wise_data(train_data)

Number of examples = 75056
	Answerable questions = 50126
	Non-Answerable questions = 24930

Examples:
 | Theme  |  Paragraph...  |  Question | Answer_possible | Answer_text | Answer_start
1430 | Frédéric_Chopin  |  Some modern commenta...  |  Who said Chopin's works were modeled after Bach, Beethoven, Schubert and Field? | True | ['Richard Taruskin'] | [543]
2196 | The_Legend_of_Zelda:_Twilight_Princess  |  Twilight Princess ta...  |  Who releases Bulbins from the Realm of Twilight? | False | [] | []

Total 361 themes present.


# Test Data Loading

In [None]:
!gdown 15WPYOD3ZLShFq_NRtiBHbpz3RTvc8ZWR
!gdown 15yxIF27NvEa3l12yNy6F5h8lGCJ2n7rf
!gdown 1Ilpxyj_0T-1KzQMdVSEbSmc1ybxOv69G
!gdown 1nkEDQZJY6_cAEVw3JlaKCgz0C6mDSYiv

Downloading...
From: https://drive.google.com/uc?id=15WPYOD3ZLShFq_NRtiBHbpz3RTvc8ZWR
To: /content/ground_truth.csv
100% 1.35M/1.35M [00:00<00:00, 136MB/s]
Downloading...
From: https://drive.google.com/uc?id=15yxIF27NvEa3l12yNy6F5h8lGCJ2n7rf
To: /content/input_question.csv
100% 2.89M/2.89M [00:00<00:00, 202MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Ilpxyj_0T-1KzQMdVSEbSmc1ybxOv69G
To: /content/theme_interval.csv
100% 3.21k/3.21k [00:00<00:00, 5.20MB/s]
Downloading...
From: https://drive.google.com/uc?id=1nkEDQZJY6_cAEVw3JlaKCgz0C6mDSYiv
To: /content/input_paragraph.csv
100% 3.45M/3.45M [00:00<00:00, 196MB/s]


In [None]:
import json
import pandas as pd
# Read the test paragraphs CSV and round-trip it through JSON to obtain a plain list of per-row dicts.
paragraphs = json.loads(pd.read_csv("input_paragraph.csv").to_json(orient="records"))

In [None]:
# Preview the first five paragraph records.
paragraphs[:5]

[{'id': 1,
  'paragraph': 'The iPod is a line of portable media players and multi-purpose pocket computers designed and marketed by Apple Inc. The first line was released on October 23, 2001, about 8½ months after iTunes (Macintosh version) was released. The most recent iPod redesigns were announced on July 15, 2015. There are three current versions of the iPod: the ultra-compact iPod Shuffle, the compact iPod Nano and the touchscreen iPod Touch.',
  'theme': 'IPod'},
 {'id': 2,
  'paragraph': 'Like other digital music players, iPods can serve as external data storage devices. Storage capacity varies by model, ranging from 2 GB for the iPod Shuffle to 128 GB for the iPod Touch (previously 160 GB for the iPod Classic, which is now discontinued).',
  'theme': 'IPod'},
 {'id': 3,
  'paragraph': "Apple's iTunes software (and other alternative software) can be used to transfer music, photos, videos, games, contact information, e-mail settings, Web bookmarks, and calendars, to the devices su

## Test Data Preprocessing

In [None]:
def test_to_theme_wise(test_paragraphs):
  theme_wise_para = {}
  for item in test_paragraphs:
    if item['theme'] not in theme_wise_para.keys():
      theme_wise_para[item['theme']] = [[item['id'],item['paragraph']],]
    else:
      theme_wise_para[item['theme']].append([item['id'],item['paragraph']])
  return theme_wise_para

# Group the loaded test paragraphs by theme.
theme_wise_para = test_to_theme_wise(paragraphs)

In [None]:
# List the theme names found in the test paragraphs.
theme_wise_para.keys()

dict_keys(['IPod', '2008_Sichuan_earthquake', 'Wayback_Machine', 'Canadian_Armed_Forces', 'Cardinal_(Catholicism)', 'Human_Development_Index', 'Heresy', 'Warsaw_Pact', 'Materialism', 'Pub', 'Web_browser', 'Catalan_language', 'Paper', 'Adult_contemporary_music', 'Nanjing', 'Dialect', 'Southampton', 'The_Times', 'Immunology', 'Imamah_(Shia_doctrine)', 'Grape', 'United_States_dollar', 'Everton_F.C.', 'Hard_rock', 'Great_Plains', 'Biodiversity', 'Federal_Bureau_of_Investigation', 'Mary_(mother_of_Jesus)', 'Anti-aircraft_warfare', 'Sanskrit', 'Order_of_the_British_Empire', 'Elizabeth_II', 'Capital_punishment_in_the_United_States', 'Age_of_Enlightenment', 'Athanasius_of_Alexandria', 'Freemasonry', 'Montevideo', 'Poultry', 'Clothing', 'Department_store', 'Marvel_Comics', 'Alloy', 'Electric_motor', 'Nutrition', 'Chinese_characters', 'Bermuda', 'Utrecht', 'Capacitor', 'Comcast', 'Tuberculosis', 'North_Carolina', 'Heian_period', 'On_the_Origin_of_Species', 'Political_party', 'Tibet', 'Oklahoma',

# Initialize pipeline

In [None]:
%cd question_generation

/content/question_generation


In [None]:
from pipelines import pipeline

In [None]:
# Build the question-generation pipeline with the `valhalla/t5-small-qg-prepend` checkpoint in "prepend" format.
nlp = pipeline("question-generation", model="valhalla/t5-small-qg-prepend", qg_format="prepend")

# Subset the Data

In [None]:
# Report how many distinct themes the test paragraphs cover.
print(f'Number of themes available: {len(theme_wise_para)}')

Number of themes available: 116


In [None]:
#@title Set Hyperparameters
import random
random.seed(10)  # fixed seed so the sampled themes are reproducible
keys_available = len(theme_wise_para.keys())

num_keys = 3       #@param {type:'number'}
num_paras = 3      #@param {type:'number'}

# random.sample() requires a sequence: passing a dict_keys view is deprecated
# since Python 3.9 and raises TypeError from 3.11. list() keeps insertion
# order, so the same themes are drawn for the same seed.
keys = random.sample(list(theme_wise_para), num_keys)
# Keep only the first `num_paras` paragraphs of each sampled theme.
input_data = {k:theme_wise_para[k][:num_paras] for k in keys}

### Peeking the Context

In [None]:
# Show each sampled theme, followed by the ID and text of its selected paragraphs.
for theme, theme_paras in input_data.items():
  print(f'Theme: {theme}')
  print('Paragraphs:')
  for entry in theme_paras:
    doc_id, text = entry[0], entry[1]
    print(f'Doc ID: {doc_id}')
    print(text)

Theme: Central_Intelligence_Agency
Paragraphs:
Doc ID: 2964
Unlike the Federal Bureau of Investigation (FBI), which is a domestic security service, CIA has no law enforcement function and is mainly focused on overseas intelligence gathering, with only limited domestic collection. Though it is not the only U.S. government agency specializing in HUMINT, CIA serves as the national manager for coordination and deconfliction of HUMINT activities across the entire intelligence community. Moreover, CIA is the only agency authorized by law to carry out and oversee covert action on behalf of the President, unless the President determines that another agency is better suited for carrying out such action. It can, for example, exert foreign political influence through its tactical divisions, such as the Special Activities Division.
Doc ID: 2965
The Executive Office also supports the U.S. military by providing it with information it gathers, receiving information from military intelligence organiza

# Main Loop

In [None]:
def generate_qa(input_data, qg_pipeline=None):
  """Generate question/answer pairs for every paragraph in `input_data`.

  Args:
    input_data: dict mapping a theme to a list of [paragraph_id, paragraph] pairs.
    qg_pipeline: optional callable that takes a paragraph string and returns an
      iterable of dicts with 'question' and 'answer' keys. Defaults to the
      notebook-level `nlp` pipeline.

  Returns:
    (s_ques, s_ans): two parallel lists.
      s_ques entries are [paragraph_id, question, theme].
      s_ans entries are [paragraph_id, 'True', answer, [start_index]], where
      start_index is `para.find(answer)` (-1 when the answer text is not found
      verbatim in the paragraph).
  """
  if qg_pipeline is None:
    qg_pipeline = nlp
  s_ans = []
  s_ques = []
  for theme, theme_paras in input_data.items():
    for doc_id, para in theme_paras:
      try:
        result = qg_pipeline(para)
      except Exception as exc:
        # Skip this paragraph explicitly. The old bare `except: pass` fell
        # through and either hit a NameError (first paragraph) or re-emitted
        # the previous paragraph's pairs under this paragraph's ID.
        print(f'Skipping paragraph {doc_id}: question generation failed ({exc!r})')
        continue
      for pair in result:
        answer = pair['answer'].rstrip()
        s_ques.append([doc_id, pair['question'], theme])
        s_ans.append([doc_id, 'True', answer, [para.find(answer),]])
  return s_ques, s_ans

In [None]:
# Generate question/answer pairs for the sampled paragraphs.
out_ques, out_ans = generate_qa(input_data)

# Output Processing

In [None]:
# Display Pairs:
def display_pairs(s_ques,s_ans):
  """Print a legend, then each question entry with the answer entry at the same index.

  Iterates over `s_ans`; `s_ques` is indexed in step with it.
  """
  legend = (
    'Q: [<Para_ID>,<Question>,<Theme>]',
    "A: [<Para_ID>,'True',<Answer String>, <Start Index>]",
  )
  for line in legend:
    print(line)
  for position, answer in enumerate(s_ans):
    question = s_ques[position]
    print(f'Q: {question}\nA: {answer}\n')

In [None]:
# Print every generated question together with its answer.
display_pairs(out_ques,out_ans)

Q: [<Para_ID>,<Question>,<Theme>]
A: [<Para_ID>,'True',<Answer String>, <Start Index>]
Q: [2964, 'What is the Federal Bureau of Investigation?', 'Central_Intelligence_Agency']
A: [2964, 'True', 'domestic security', [61]]

Q: [2964, 'Who is the only agency authorized by law to carry out and oversee covert action on behalf of the President?', 'Central_Intelligence_Agency']
A: [2964, 'True', 'CIA', [88]]

Q: [2964, 'Who is the only agency authorized by law to carry out and oversee covert action on behalf of the President?', 'Central_Intelligence_Agency']
A: [2964, 'True', 'CIA', [88]]

Q: [2964, 'What division can CIA exert foreign political influence through its tactical divisions?', 'Central_Intelligence_Agency']
A: [2964, 'True', 'Special Activities Division', [743]]

Q: [2965, 'How does the Executive Office support the U.S. military?', 'Central_Intelligence_Agency']
A: [2965, 'True', 'providing it with information it gathers, receiving information from military intelligence organizati