In [1]:
# importing dependencies
import requests # for http request
from bs4 import BeautifulSoup # for web scraping
import re # for regex
import unicodedata # for standardising text
import pandas as pd # for data manipulation
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM # for our NLP pre-trained model

# Why Google's Flan-T5

In this project, I am using Google's Flan-T5 'base' version. Flan stand for Fine-tuned LAnguage Net while T5 stands for Text-to-Text Transfer Transformer. 

Flan-T5 is a pre-trained NLP model that was trained for a variety of language tasks such as translation, summarisation and question answering (QA). Since QA is a task it was trained on, Flan-T5 is appropriate for this task as that's half of the battle done! Many of the tasks used to train Flan-T5 also heavily involve questions such as logical reasoning and verifying scientific facts. Hence, the transfer learning is effective to something as general as generating questions.

I used the base version as it's the standard version and this small-scale task does not benefit from using the Large nor XXL version.

In [2]:
def generate_qa_pairs(url):
    wiki_extract = get_wiki_extract(url) # web scrape and clean the wikipedia page
    
    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base") # here we use google's flan-t5-base model
    model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
    
    qa_pairs = []
    for text in wiki_extract: # go through every paragraph of the page
        # generate question for the current text
        question = generate_question(text, tokenizer, model)
        
        # extract answer for the question using the same tokenizer and model
        answer = extract_answer(text, question, tokenizer, model)
        
        # store the question-answer pairs in a list of dictionaries
        qa_pairs.append({"question": question, "answer": answer})
    
    return qa_pairs

# helpter functions for web scraping
def get_wiki_extract(url):
    soup = get_soup(url) # get request to the wikipedia page
    headers = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] # relevant texts are enclosed within these 6 headers
    wiki_extract = []
    for tag in soup.find_all(): # we first break the text everytime it's a new section
        if tag.name in headers and tag.text != 'Contents': # ignore unnecessary elements (tables, UI elements, etc.) and wikipedia's contents box
            p = '' # we find all paragraphs after each header
            for ne in tag.next_elements: # loop through the next elements after each header
                if ne.name == 'p': # the texts are always enclosed in <p>
                    p += get_paragraph_text(ne) # extract the text
                if ne.name in headers: # stop once we find the next header
                    break
            if p: # ignore empty sections
                wiki_extract.append(clean_wiki_content(p))
    
    separated_wiki_extract = [] # split the texts further by paragraphs
    for item in wiki_extract:
        if '\n' in item: # check for breaking space then split by those
            items = item.split('\n')
            separated_wiki_extract.extend(filter(None, items)) # filter out empty strings
        else: # otherwise keep it as it is
            separated_wiki_extract.append(item)
    
    return separated_wiki_extract

def get_soup(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content , 'html.parser') # parse html content
    return soup

def get_paragraph_text(p):
    paragraph_text = ''
    for tag in p.children: # get all the texts inside the paragraph
        if tag.name == "style" or tag.text.isspace(): # wikipedia sometimes put style tag or empty texts in a <p> (must have been on a tight deadline). We don't want none of those!
            continue
        paragraph_text += tag.text # by only taking the text, we ignore the various HMTL tags suchs as <i>, <b>, <a href>, etc.
    return paragraph_text

def clean_wiki_content(text): # most of the text pre-processing is here
    text = re.sub(r"\[\d+\]|\[[a-zA-Z]\]", "", text) # remove citations
    text = text.replace("[edit]", "") # wikipedia thinks we are smart enough to contribute. have to remove these edit buttons!
    text = unicodedata.normalize("NFKD", text) # standardise the non-breaking spaces and others
    return text

# helper functions for using flan-t5
def generate_question(text, tokenizer, model):
    input_text = "generate question: " + text # this is the input we are going to plug into the model
    input_text = input_text[:512] # 512 is the character limit of this model
    input_ids = tokenizer.encode(input_text, return_tensors="pt") #tokenize first
    outputs = model.generate(input_ids, max_new_tokens = 512)
    question = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0] # decode and return away!
    return question

def extract_answer(text, question, tokenizer, model): # the same exact flow as the question generation part
    input_text = question + " context: " + text # just different prompt
    input_text = input_text[:512] # again, 512 is the character limit of this model
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(input_ids, max_new_tokens = 512)
    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return answer

# Calling the function

In [3]:
url = "https://en.m.wikipedia.org/wiki/The_Elder_Scrolls_V:_Skyrim"
qa_pairs_skyrim = generate_qa_pairs(url)
qa_pairs_skyrim

[{'question': 'What is the name of the game that was released worldwide for Microsoft Windows, PlayStation 3, and Xbox 360?',
  'answer': 'The Elder Scrolls V: Skyrim'},
 {'question': 'What is the name of the northernmost province of Tamriel?',
  'answer': 'Skyrim'},
 {'question': 'What is the name of the game that Skyrim was developed using?',
  'answer': 'Oblivion'},
 {'question': 'What was the name of the game that was released in June 2013?',
  'answer': 'The Elder Scrolls V: Skyrim – Legendary Edition'},
 {'question': 'What is the name of the game that is played from a first- or third-person perspective?',
  'answer': 'The Elder Scrolls V: Skyrim'},
 {'question': 'What is the primary attribute of a character?',
  'answer': 'health'},
 {'question': 'What is the name of the first game in the series?',
  'answer': 'The Elder Scrolls IV: Oblivion'},
 {'question': 'What is the name of the province that the Empire has recently fought a war with?',
  'answer': 'Skyrim'},
 {'question': 'W

# Evaluating the results

RELEVANCE

From the example of Skyrim above, we can see that the question-pairs are pretty good and produces both general ("What is the name of the game world that the team set the game in?") and detailed ("What is the name of the game that Skyrim has been credited with influencing?") questions

However,the question can be too general sometimes. There are questions like:
- "What is the name of the game that was released worldwide for Microsoft Windows, PlayStation 3, and Xbox 360?"
- "What is the name of the game that was released in June 2013?"
The model produces questions like this because it processes the texts by themseleves, in isolation from the other paragraphs and also the broader general knowledge.

CORRECTNESS

Generally the answers are correct as long as the correct answer is found nearby. Otherwise, the model seems to confuse nearby proper nouns as the answer. For example:

Question: "Who is the captured dragon?" 

Context: "The Dragonborn's allies hatch a plan to capture a dragon at Whiterun. The Dragonborn helps negotiate a truce in the civil war to prevent either side from capturing Whiterun during this delicate operation. The captured dragon, Odahviing, questions whether Alduin deserves lordship over dragons."

Since the ground truth, "Odahviing", is found 2 sentences away from the mention of the captured dragon, the model assigns higher probability to "Whiterun", a location, instead.

IMPROVEMENTS

The most immediate improvement to this model is to add the ability to take into account the whole wikipedia page as input at once before generating questions. This will encourage the model to generate more relevant questions instead of looking at each paragraph in isolation. Currently, the limit is at 512 characters and expanding it would require much more computational power.

# Convert to DataFrame for csv conversion

In [4]:
df_skyrim = pd.DataFrame(qa_pairs_skyrim)
df_skyrim

Unnamed: 0,question,answer
0,What is the name of the game that was released...,The Elder Scrolls V: Skyrim
1,What is the name of the northernmost province ...,Skyrim
2,What is the name of the game that Skyrim was d...,Oblivion
3,What was the name of the game that was release...,The Elder Scrolls V: Skyrim – Legendary Edition
4,What is the name of the game that is played fr...,The Elder Scrolls V: Skyrim
5,What is the primary attribute of a character?,health
6,What is the name of the first game in the series?,The Elder Scrolls IV: Oblivion
7,What is the name of the province that the Empi...,Skyrim
8,Who is the jarl of Windhelm?,Ulfric Stormcloak
9,Who was the last dragon?,Paarthurnax


In [5]:
df_skyrim.to_csv('qa_pairs_skyrim.csv', index=False, encoding='utf-8-sig')

# Additional results

In [6]:
url = "https://en.wikipedia.org/wiki/Singapore"
qa_pairs_sg = generate_qa_pairs(url)
qa_pairs_sg

[{'question': 'What is the largest island in Southeast Asia?',
  'answer': 'Johor'},
 {'question': 'What was the name of the British Empire that Singapore came under the direct control of?',
  'answer': 'British Empire'},
 {'question': 'What is the main idea of the passage?',
  'answer': 'globalisation'},
 {'question': 'What is the name of the country that has a Westminster system of unicameral parliamentary government?',
  'answer': 'Singapore'},
 {'question': 'What is the Malay name for the country?',
  'answer': 'Singapura'},
 {'question': 'What was the name of the Kingdom of Singapura?',
  'answer': 'Temasek'},
 {'question': 'Singapore when he arrived in what country?',
  'answer': 'Singapore'},
 {'question': 'Which happened first, the Battle of Singapore or the surrender of the British forces?',
  'answer': 'Battle of Singapore'},
 {'question': 'What happened first, the surrender of Japanese forces in the region or the surrender of General Seishir Itagaki?',
  'answer': 'surrender

In [7]:
df_sg = pd.DataFrame(qa_pairs_sg)
df_sg

Unnamed: 0,question,answer
0,What is the largest island in Southeast Asia?,Johor
1,What was the name of the British Empire that S...,British Empire
2,What is the main idea of the passage?,globalisation
3,What is the name of the country that has a Wes...,Singapore
4,What is the Malay name for the country?,Singapura
5,What was the name of the Kingdom of Singapura?,Temasek
6,Singapore when he arrived in what country?,Singapore
7,"Which happened first, the Battle of Singapore ...",Battle of Singapore
8,"What happened first, the surrender of Japanese...",surrender of Japanese forces in the region
9,What was the name of the ruling party of Malaya?,U


In [8]:
df_sg.to_csv('qa_pairs_sg.csv', index=False, encoding='utf-8-sig')