In [1]:
# Natural-language question that the rest of the notebook tries to answer.
question = 'Who is the founder of Amazon?'

In [2]:
import spacy
#!python -m spacy download en_core_web_sm

# Load the small English spaCy pipeline and run it over the question.
nlp = spacy.load("en_core_web_sm")
doc = nlp(question)

# Build the search query from the tokens whose coarse part-of-speech tag is
# a proper noun, number, verb, noun or adjective; every other token is dropped.
query = ' '.join(
    [tok.text for tok in doc if tok.pos_ in {'PROPN', 'NUM', 'VERB', 'NOUN', 'ADJ'}]
)
query

'is founder Amazon'

In [3]:
import requests

# Ask the MediaWiki search API for pages matching the keyword query.
params = {
    'action': 'query',
    'list': 'search',
    'srsearch': query,
    'format': 'json',
}
# Without a timeout, requests waits forever on an unresponsive server and the
# cell never finishes.
res = requests.get('https://en.wikipedia.org/w/api.php', params=params, timeout=10)
# Fail here with a clear HTTP error rather than with a JSON decoding error or
# a KeyError in a later cell.
res.raise_for_status()
pages = res.json()

In [4]:
import concurrent.futures
import wikipedia
import re

def search_page(page_id):
    """Look up a Wikipedia page by its page id and return its `content` attribute."""
    return wikipedia.page(pageid=page_id).content

def post_process(doc):
    """Truncate an article's text at the first back-matter heading.

    The text is cut just before the earliest occurrence of any of the
    section headings listed below (references, notes, external links, ...).

    Args:
        doc: Full article text as a string.

    Returns:
        The part of ``doc`` that precedes the first matching heading, or
        ``doc`` unchanged when none of the headings occurs.
    """
    pattern = '|'.join([
        '== References ==',
        '== Further reading ==',
        '== External links',
        '== See also ==',
        '== Sources ==',
        '== Notes ==',
        '== Further references ==',
        '== Footnotes ==',
        '=== Notes ===',
        '=== Sources ===',
        '=== Citations ===',
    ])
    # re.search returns the leftmost match, i.e. the smallest start offset,
    # which is exactly the cut point.
    match = re.search(pattern, doc)
    if match is None:
        # No heading present: keep the whole text. The earlier
        # min(*indices, len(doc)) form raised TypeError here, because with an
        # empty list it collapses to min(len(doc)).
        return doc
    return doc[:match.start()]
    
# Fetch every search hit on a worker thread, then trim each article in the
# same order the hits were returned.
with concurrent.futures.ThreadPoolExecutor() as executor:
    process_list = [
        executor.submit(search_page, hit['pageid'])
        for hit in pages['query']['search']
    ]
    # .result() blocks until the corresponding download has finished.
    docs = list(map(post_process, (future.result() for future in process_list)))

In [5]:
import itertools
from gensim.summarization.bm25 import BM25

def preprocess(doc):
    """Split a document into passages, one per kept line.

    The text is split on newlines; empty lines and lines starting with '='
    are discarded.
    """
    passages = []
    for line in doc.split('\n'):
        if not line or line.startswith('='):
            continue
        passages.append(line)
    return passages

# Flatten the per-document passage lists into a single list of passages.
passages = list(itertools.chain.from_iterable(map(preprocess, docs)))
# One list of lemmas per passage; this is the tokenised corpus given to BM25.
corpus = [[tok.lemma_ for tok in parsed] for parsed in map(nlp, passages)]
bm25 = BM25(corpus)

In [6]:
# Number of best-scoring passages to keep.
topn = 10

# Lemmatise the question the same way the corpus passages were lemmatised.
tokens = [tok.lemma_ for tok in nlp(question)]
scores = bm25.get_scores(tokens)
# (score, index) tuples sorted in descending order: highest score first,
# ties broken by the larger index.
pairs = sorted(((score, idx) for idx, score in enumerate(scores)), reverse=True)
# NOTE: this rebinds `passages` to the top-`topn` subset, so re-running this
# cell without first rebuilding the full passage list indexes into the
# already-truncated list.
passages = [passages[idx] for _score, idx in pairs[:topn]]

In [7]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, QuestionAnsweringPipeline
import operator
# Single source of truth for the checkpoint, so the tokenizer and the model
# are always loaded from the same name.
MODEL_NAME = 'distilbert-base-cased-distilled-squad'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
qa_pipeline = QuestionAnsweringPipeline(model=model, tokenizer=tokenizer)

In [8]:
# Run the QA pipeline once per retained passage and remember which passage
# each answer came from.
answers = []
for passage in passages:
    answer = qa_pipeline(question=question, context=passage)
    # Attach the context passage to the pipeline's result record.
    answer.update(text=passage)
    answers.append(answer)
# Best answer first: order by the 'score' field, descending.
answers.sort(key=lambda record: record['score'], reverse=True)

In [9]:
# `answers` is sorted by descending score, so index 0 is the top-scoring record.
print(answers[0])

{'score': 0.9968113408074686, 'start': 576, 'end': 586, 'answer': 'Jeff Bezos', 'text': "Amazon Go is a chain of convenience stores in the United States operated by the online retailer Amazon. It has 26 open and announced store locations in Seattle, Chicago, San Francisco and New York City, as of 2020.The stores are partially automated, with customers able to purchase products without being checked out by a cashier or using a self-checkout station. Amazon Go stores were conceptualized and tested by a team of Amazon executives, who constructed a 15,000-square-foot mock supermarket in a rented warehouse in Seattle, before revealing the work to Amazon founder Jeff Bezos in 2015. The first store, located in the company's Day 1 building, opened to employees on December 5, 2016, and to the public on January 22, 2018. The flagship store has prepared foods, meal kits, limited groceries, and liquor available for purchase. A larger variant, Amazon Go Grocery, opened in Seattle's Capitol Hill nei

In [10]:
# 'score' field of the top-ranked answer record.
answers[0]['score']

0.9968113408074686

In [11]:
# 'answer' field of the top-ranked record (the extracted answer string).
answers[0]['answer']

'Jeff Bezos'

In [12]:
# Passage that served as context for the top-ranked answer (stored under 'text').
answers[0]['text']

"Amazon Go is a chain of convenience stores in the United States operated by the online retailer Amazon. It has 26 open and announced store locations in Seattle, Chicago, San Francisco and New York City, as of 2020.The stores are partially automated, with customers able to purchase products without being checked out by a cashier or using a self-checkout station. Amazon Go stores were conceptualized and tested by a team of Amazon executives, who constructed a 15,000-square-foot mock supermarket in a rented warehouse in Seattle, before revealing the work to Amazon founder Jeff Bezos in 2015. The first store, located in the company's Day 1 building, opened to employees on December 5, 2016, and to the public on January 22, 2018. The flagship store has prepared foods, meal kits, limited groceries, and liquor available for purchase. A larger variant, Amazon Go Grocery, opened in Seattle's Capitol Hill neighborhood on February 25, 2020. In March 2020, a proprietary Just Walk Out system was ad