# Install required packages

In [None]:
!pip install pyserini faiss-cpu transformers

# Download resources

Documents

In [2]:
# Download the document collection (saved as wikipedia_20220620_cleaned.jsonl) from Google Drive.
!gdown 1UWFMJFq_N9GZgYJr_2ErFg6g8Bo9E2VQ

Downloading...
From: https://drive.google.com/uc?id=1UWFMJFq_N9GZgYJr_2ErFg6g8Bo9E2VQ
To: /content/wikipedia_20220620_cleaned.jsonl
100% 1.57G/1.57G [00:17<00:00, 88.3MB/s]


Pretrained model weights

In [3]:
!gdown 1XKcuDTaSWXIXZ_1yqQiowcrcz0QFrKEb
!unzip vi-mrc-base.zip

Downloading...
From: https://drive.google.com/uc?id=1XKcuDTaSWXIXZ_1yqQiowcrcz0QFrKEb
To: /content/vi-mrc-base.zip
100% 879M/879M [00:10<00:00, 85.5MB/s]
Archive:  vi-mrc-base.zip
  inflating: vi-mrc-base/pytorch_model.bin  
  inflating: vi-mrc-base/tokenizer_config.json  
  inflating: vi-mrc-base/config.json  
  inflating: vi-mrc-base/readme.txt  
  inflating: vi-mrc-base/special_tokens_map.json  
  inflating: vi-mrc-base/tokenizer.json  


# Import libraries

In [4]:
import re
from heapq import heappush, heappop
from tqdm import tqdm
import random
import pickle
import json
import os
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# Preprocessing functions

In [5]:
def preprocess_text(text):
  """Strip markup and most punctuation from a text, keeping sentence periods.

  Steps, in order:
    1. drop the literal marker 'BULLET::::-';
    2. drop '<...>' tags and '==...==' heading lines;
    3. replace the listed punctuation characters with a space;
    4. collapse runs of periods to one period and normalize the whitespace
       after a period to a single space;
    5. turn each remaining line break (with any whitespace before it) into '. ';
    6. collapse every remaining whitespace run to a single space.

  Letter case is left untouched.

  Args:
    text: the string to clean.

  Returns:
    The cleaned string.
  """
  # Raw strings keep the regex escapes (\s, \., ...) from being read as
  # (invalid) Python string escapes; the patterns themselves are unchanged.
  text = text.replace('BULLET::::-', '')
  text = re.sub(r'<.+?>', '', text)
  text = re.sub(r'==.+==', '', text)
  text = re.sub(r'''[,;:\?\(\)\[\]\{\}\<>|'"=\-–—…/\+\!\*－_]''', ' ', text)
  text = re.sub(r'\.+', '.', text)
  text = re.sub(r'\.\s+', '. ', text)
  text = re.sub(r'\s*\n', '. ', text)
  text = re.sub(r'\s+', ' ', text)
  return text

In [6]:
def to_passages(text):
  """Split a document into sentence-like passages.

  'BULLET::::' markers (followed by dashes/digits), '<...>' tags and
  '==...==' heading lines are removed first. The text is then split wherever
  a period is followed by whitespace; inside each piece, line breaks become
  '. '. Empty pieces are dropped.

  Args:
    text: the document text.

  Returns:
    A list of non-empty passages, each ending with a period.
  """
  text = re.sub(r'BULLET::::[\-\d]+', '', text)
  text = re.sub(r'<.+?>', '', text)
  text = re.sub(r'==.+==', '', text)
  passages = []
  for piece in re.split(r'\.\s+', text):
    piece = re.sub(r'\n\s*', '. ', piece).strip()
    if not piece:
      continue
    # The split consumes the period of every piece except possibly the last;
    # only add one when it is missing so the final passage does not end in '..'.
    if not piece.endswith('.'):
      piece += '.'
    passages.append(piece)
  return passages

# Open wikipedia data

In [7]:
# Map each document id to (title, text), where the text has bullet markers,
# '<...>' tags and '==...==' heading lines removed.
documents = dict()

# Read explicitly as UTF-8 so decoding does not depend on the platform's
# default locale (the result file is also written with an explicit encoding).
with open('wikipedia_20220620_cleaned.jsonl', 'r', encoding='utf-8') as f:
  for line in f:
    # One JSON object per line with at least 'id', 'title' and 'text' keys.
    data = json.loads(line)
    text = data['text']
    text = text.replace('BULLET::::-', '')
    text = re.sub('<.+?>', '', text)
    text = re.sub('==.+==', '', text)
    documents[data['id']] = (data['title'], text)

# BM25 indexing cache for document ranking

Create necessary directories, dump documents as Lucene input file

In [10]:
# 'input' holds the collection file handed to the indexer; 'index' receives the index.
os.makedirs('index', exist_ok=True)
os.makedirs('input', exist_ok=True)


def reformat_documents():
  """Write every document to input/documents.json as {'id', 'contents'} records.

  'contents' is the text after preprocess_text, the same cleaning that is
  applied to questions before they are sent to the searcher. (Previously the
  cleaned text was computed and then discarded, so the raw text was written.)
  Reads the module-level `documents` mapping of id -> (title, text).
  """
  fdocuments = [
    {'id': idx, 'contents': preprocess_text(text)}
    for idx, (title, text) in tqdm(documents.items())
  ]
  with open('input/documents.json', 'w', encoding='utf-8') as f:
    json.dump(fdocuments, f)


reformat_documents()

100%|██████████| 1273469/1273469 [02:07<00:00, 9959.60it/s]


Generate Lucene index file for caching

In [11]:
# Index the JSON collection found in input/ into index/ (language code 'vi', one thread),
# storing term positions and document vectors alongside the postings.
!python -m pyserini.index.lucene --collection JsonCollection --input input --language vi --index index --generator DefaultLuceneDocumentGenerator --threads 1 --storePositions --storeDocvectors

2023-07-14 01:25:23,533 INFO  [main] index.IndexCollection (IndexCollection.java:380) - Setting log level to INFO
2023-07-14 01:25:23,536 INFO  [main] index.IndexCollection (IndexCollection.java:383) - Starting indexer...
2023-07-14 01:25:23,537 INFO  [main] index.IndexCollection (IndexCollection.java:385) - DocumentCollection path: input
2023-07-14 01:25:23,537 INFO  [main] index.IndexCollection (IndexCollection.java:386) - CollectionClass: JsonCollection
2023-07-14 01:25:23,537 INFO  [main] index.IndexCollection (IndexCollection.java:387) - Generator: DefaultLuceneDocumentGenerator
2023-07-14 01:25:23,538 INFO  [main] index.IndexCollection (IndexCollection.java:388) - Threads: 1
2023-07-14 01:25:23,538 INFO  [main] index.IndexCollection (IndexCollection.java:389) - Language: vi
2023-07-14 01:25:23,538 INFO  [main] index.IndexCollection (IndexCollection.java:390) - Stemmer: porter
2023-07-14 01:25:23,539 INFO  [main] index.IndexCollection (IndexCollection.java:391) - Keep stopwords? f

Initialize searcher

In [12]:
from pyserini.search.lucene import LuceneSearcher

# Open the index directory produced by the indexing step.
searcher = LuceneSearcher('index')
# BM25 parameters; presumably (k1, b) in that order — confirm against the pyserini API.
searcher.set_bm25(0.9, 0.4)
# Same language code ('vi') that was passed to the indexer via --language.
searcher.set_language('vi')

# N-gram processor for passage ranking

In [13]:
class NGram:
  """Uni/bi/tri-gram sets of a text, kept separately for two token groups.

  The text is cleaned with preprocess_text and its sentence separators
  ('. ') are turned into spaces. Each whitespace-separated token is then put
  into one of two lists:

    * tokens containing at least one uppercase character -> Uni / Bi / Tri
    * every other token                                   -> uni / bi / tri

  N-grams are built inside each list independently, so consecutive entries
  of a list need not be adjacent in the original text.
  """

  def __init__(self, text, reformat=True):
    """Build the six n-gram sets for `text`.

    Args:
      text: the string to analyze.
      reformat: accepted for backward compatibility; it is not used.
    """
    text = preprocess_text(text)
    text = text.replace('. ', ' ')
    lTokens = []
    uTokens = []
    for w in text.split():
      if any(c.isupper() for c in w):
        uTokens.append(w)
      else:
        lTokens.append(w)
    self.uni = self.get_ngrams(lTokens, 1)
    self.bi = self.get_ngrams(lTokens, 2)
    self.tri = self.get_ngrams(lTokens, 3)

    self.Uni = self.get_ngrams(uTokens, 1)
    self.Bi = self.get_ngrams(uTokens, 2)
    self.Tri = self.get_ngrams(uTokens, 3)

  def get_ngrams(self, tokens, n=2):
    """Return the set of all length-`n` windows of `tokens`, each as a tuple.

    The result is empty when `tokens` has fewer than `n` items.
    """
    return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

  def score(self, o):
    """Return the overlap score between this NGram and another one.

    The score is the number of shared n-grams over the six sets, with the
    uppercase-token sets (Uni/Bi/Tri) counted twice.

    Args:
      o: the other NGram instance.
    """
    lower_overlap = len(self.uni & o.uni) + len(self.bi & o.bi) + len(self.tri & o.tri)
    # Every term intersects with `o`; the trigram term used to intersect
    # self.Tri with itself, adding a constant unrelated to `o`.
    upper_overlap = len(self.Uni & o.Uni) + len(self.Bi & o.Bi) + len(self.Tri & o.Tri)
    return lower_overlap + 2 * upper_overlap

# Load QA model

In [14]:
# Build a question-answering pipeline from the checkpoint directory 'vi-mrc-base'.
model_path = 'vi-mrc-base'

# Device index 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
  device = 0
else:
  device = -1

qa_tokenizer = AutoTokenizer.from_pretrained(model_path)
qa_model = AutoModelForQuestionAnswering.from_pretrained(model_path)
qa_pipeline = pipeline(
  'question-answering',
  model=qa_model,
  tokenizer=qa_tokenizer,
  device=device,
)

In [None]:
# Smoke-test the QA pipeline on one hand-written question/context pair.
sample_question = 'Sau khi qua đời, vua Lý Nhân Tông truyền ngôi cho ai'
sample_text = 'Năm 1128, Lý Nhân Tông qua đời, hưởng thọ 63 tuổi, Dương Hoán lúc đó mới 11 tuổi lên nối ngôi, tức là Lý Thần Tông'
qa_pipeline({'question': sample_question, 'context': sample_text})

## QA pipeline step-by-step

In [None]:
# Question traced through the step-by-step cells below; swap in one of the
# commented alternatives to try a different question.
sample_question = 'Ai hiện là giám đốc điều hành Xiaomi'
# sample_question = 'Tổng thống đầu tiên của Mỹ là ai?'
# sample_question = 'Trong thần thoại hy lạp vị thần tình yêu có tên là gì?'
# sample_question = 'Đạo diễn phim Titanic là ai?'
# sample_question = 'Tổng thống Hoa Kỳ thứ 45 là ai'
# sample_question = 'Hiện nay ai là tổng bí thư nước Việt Nam'
# sample_question = 'Sau khi qua đời, vua Lý Nhân Tông truyền ngôi cho ai'
# sample_question = 'Thuyết tương đối đặc biệt và thuyết tương đối tổng quát là ai phát minh ra?'
# sample_question = 'Nhà thờ Đức Bà Paris được xây dựng theo lối kiến trúc nào'
# sample_question = 'Ngôi chùa đúc hoàn toàn bằng đồng ở Việt Nam'
# sample_question = 'Trương Quang Nghĩa sinh ngày bao nhiêu'

Create ngram for question

In [None]:
# N-gram sets of the question, scored against passages and titles in the ranking cell.
question_ngram = NGram(sample_question)

Rank documents using BM25

In [None]:
# Query the index with the cleaned question and keep up to 100 hits as
# (score, docid) pairs, in the order the searcher returned them.
hits = searcher.search(preprocess_text(sample_question), k=100)
document_ranks = [(hit.score, hit.docid) for hit in hits]

# Show rank, retrieval score and title of every hit.
for rank, (hit_score, docid) in enumerate(document_ranks):
  print(rank, hit_score, documents[docid][0])

Rank passages in 2 stages.

Stage 1: Rank passages in each document.

Stage 2: Rank passages across all documents.

In [None]:
# Stage 1: score every passage of each retrieved document and keep the document's top 5.
# Stage 2: sort the kept passages of all documents together.
passage_ranks = []
for _, doc in document_ranks:
  doc_title, doc_content = documents[doc]
  # The title score is identical for every passage of a document, so compute it once.
  title_score = question_ngram.score(NGram(doc_title))
  ranks = [
    (question_ngram.score(NGram(p)) + title_score, p)
    for p in to_passages(doc_content)
  ]
  # Tuples sort by score first, then by passage text; highest first.
  ranks.sort(reverse=True)
  top_ranks = ranks[:5]
  passage_ranks += top_ranks
  for pas_score, passage in top_ranks:
    print(pas_score, title_score, doc_title)
    print(passage)
    print('\n')
passage_ranks.sort(reverse=True)

Extract an answer from each passage, keeping only answers above the confidence threshold and skipping answers in the blacklist set

In [None]:
# answers maps answer text -> [number of passages that produced it, highest confidence seen].
answers = {}
blacklist = {'kiêm', 'Quỳnh'}

# Only the 50 best-ranked passages are sent to the QA model.
for p_score, p in passage_ranks[:50]:
  res = qa_pipeline({'question': sample_question, 'context': p})
  score = res['score']
  # Strip a few punctuation characters from the extracted span.
  answer = re.sub('[,\\.\\(\\);:\"]', '', res['answer'])

  # Discard empty, low-confidence and blacklisted answers.
  if answer == '' or score < 0.1 or answer in blacklist:
    continue
  # Questions containing 'bao nhiêu' only accept answers that contain a digit.
  if 'bao nhiêu' in sample_question and not any(c.isdigit() for c in answer):
    continue

  print(score)
  print(answer)
  print(p_score)
  print(p, end='\n\n')

  tally = answers.setdefault(answer, [0, score])
  tally[0] += 1
  if score > tally[1]:
    tally[1] = score

Merge similar answers, then pick the best answer by vote count, breaking ties by confidence

In [None]:
# Fold every answer that is a substring of another surviving answer into that
# longer answer. Candidates are scanned in insertion order (a dict used as an
# ordered set) rather than by iterating a set of strings, whose order depends
# on string hashing and can differ between kernel sessions.
active = dict.fromkeys(answers)
key_resolve = {}
for k in list(active):
  target = next((sk for sk in active if sk != k and k in sk), None)
  if target is not None:
    key_resolve[k] = target
    # Merged answers stop being candidates for later answers.
    del active[k]

# Move the vote count of each merged answer to its target and keep the higher confidence.
for k, rk in key_resolve.items():
  count, confidence = answers.pop(k)
  answers[rk][0] += count
  if confidence > answers[rk][1]:
    answers[rk][1] = confidence

# Best answer = largest (vote count, confidence); the earliest entry wins a tie.
best_answer = None
for k, v in answers.items():
  print(k, v)
  if best_answer is None or tuple(answers[best_answer]) < tuple(v):
    best_answer = k
best_answer

# Single function for QA pipeline

In [18]:
def _merge_contained_answers(answers):
  """Fold each answer that is a substring of another answer into that longer one.

  `answers` maps answer text -> [vote count, best confidence] and is modified
  in place. Candidates are scanned in insertion order (a dict used as an
  ordered set) so the outcome does not depend on string hashing.
  """
  active = dict.fromkeys(answers)
  key_resolve = {}
  for k in list(active):
    target = next((sk for sk in active if sk != k and k in sk), None)
    if target is not None:
      key_resolve[k] = target
      # Merged answers stop being candidates for later answers.
      del active[k]
  for k, rk in key_resolve.items():
    count, confidence = answers.pop(k)
    answers[rk][0] += count
    if confidence > answers[rk][1]:
      answers[rk][1] = confidence
  return answers


def extract_answer(documents, qa_pipeline, question, verbose=False, *,
                   num_docs=100, passages_per_doc=5, num_passages=50,
                   min_confidence=0.1, blacklist=None):
  """Answer `question` by retrieval, n-gram passage ranking and extractive QA.

  Pipeline:
    1. query the module-level `searcher` with the cleaned question;
    2. split each hit into passages, score them by n-gram overlap with the
       question (plus the title's overlap) and keep the best per document;
    3. run `qa_pipeline` on the best passages overall and tally the answers
       that pass the filters;
    4. merge answers contained in longer answers and return the answer with
       the largest (vote count, confidence).

  Args:
    documents: mapping of document id -> (title, text).
    qa_pipeline: callable taking {'question', 'context'} and returning a
      mapping with 'score' and 'answer'.
    question: the question string.
    verbose: print intermediate rankings and answers when True.
    num_docs: number of hits requested from the searcher.
    passages_per_doc: passages kept from each document.
    num_passages: passages sent to the QA model.
    min_confidence: answers scoring below this are discarded.
    blacklist: answers to discard; defaults to {'kiêm', 'Quỳnh'}.

  Returns:
    The best answer string, or None when no answer passes the filters.
  """
  if blacklist is None:
    blacklist = {'kiêm', 'Quỳnh'}

  question_ngram = NGram(question)
  if verbose:
    print(f'Question: {question}')

  # Stage: document retrieval
  hits = searcher.search(preprocess_text(question), k=num_docs)
  document_ranks = [(hit.score, hit.docid) for hit in hits]
  if verbose:
    for idx, (score, docid) in enumerate(document_ranks):
      print(idx, score, documents[docid][0])
    print()

  # Stage: rank passages inside each document, then across all documents
  passage_ranks = []
  for _, doc in document_ranks:
    doc_title, doc_content = documents[doc]
    # The title score is identical for every passage of a document, so compute it once.
    title_score = question_ngram.score(NGram(doc_title))
    ranks = [
      (question_ngram.score(NGram(p)) + title_score, p)
      for p in to_passages(doc_content)
    ]
    ranks.sort(reverse=True)
    top_ranks = ranks[:passages_per_doc]
    passage_ranks += top_ranks
    if verbose:
      for pas_score, passage in top_ranks:
        print(pas_score, title_score, doc_title)
        print(passage)
  if verbose:
    print('\n')
  passage_ranks.sort(reverse=True)

  # Stage: extract answers, omit answers with low confidence, count votes.
  # answers maps answer text -> [vote count, highest confidence seen].
  answers = {}
  for p_score, p in passage_ranks[:num_passages]:
    res = qa_pipeline({'question': question, 'context': p})
    score = res['score']
    answer = re.sub('[,\\.\\(\\);:\"]', '', res['answer'])
    if answer == '' or score < min_confidence or answer in blacklist:
      continue
    # Questions containing 'bao nhiêu' only accept answers that contain a digit.
    if 'bao nhiêu' in question and not any(c.isdigit() for c in answer):
      continue
    if verbose:
      print(score)
      print(answer)
      print(p_score)
      print(p, end='\n\n')
    tally = answers.setdefault(answer, [0, score])
    tally[0] += 1
    if score > tally[1]:
      tally[1] = score

  _merge_contained_answers(answers)

  # Stage: majority vote; the earliest entry wins a tie.
  best_answer = None
  for k, v in answers.items():
    if verbose:
      print(k, v)
    if best_answer is None or tuple(answers[best_answer]) < tuple(v):
      best_answer = k
  return best_answer

In [20]:
# Question passed to extract_answer in the next cell; swap in one of the
# commented alternatives to try a different question.
question = 'Ai hiện là giám đốc điều hành Xiaomi'
# question = 'Tổng thống đầu tiên của Mỹ là ai?'
# question = 'Trong thần thoại hy lạp vị thần tình yêu có tên là gì?'
# question = 'Đạo diễn phim Titanic là ai?'
# question = 'Tổng thống Hoa Kỳ thứ 45 là ai'
# question = 'Hiện nay ai là tổng bí thư nước Việt Nam'
# question = 'Sau khi qua đời, vua Lý Nhân Tông truyền ngôi cho ai'
# question = 'Thuyết tương đối đặc biệt và thuyết tương đối tổng quát là ai phát minh ra?'
# question = 'Nhà thờ Đức Bà Paris được xây dựng theo lối kiến trúc nào'
# question = 'Ngôi chùa đúc hoàn toàn bằng đồng ở Việt Nam'
# question = 'Trụ sở chính của Google tên là gì'
# question = 'Trương Quang Nghĩa sinh ngày bao nhiêu'
# question = 'Huyện Mèo Vạc thuộc tỉnh nào của nước ta'

In [None]:
# Run the whole pipeline on the selected question; verbose=True prints the intermediate rankings and answers.
extract_answer(documents, qa_pipeline, question, verbose=False)

# Test QA system

Load test data

In [None]:
# Load the test cases as (question, answer) pairs from the 'data' list of the JSON file.
# NOTE(review): no earlier cell downloads this file — it has to be placed in the
# working directory beforehand.
# Read explicitly as UTF-8, matching the encoding used when the results are written.
with open('zac2022_testa_sample_submission.json', 'r', encoding='utf-8') as f:
  test_data = json.load(f)
tests = [(tc['question'], tc['answer']) for tc in test_data['data']]

Run test

In [None]:
# Answer every test question and print it next to the answer stored in the test file.
test_answers = []
for test_no, (q, a) in enumerate(tests, start=1):
  p = extract_answer(documents, qa_pipeline, q)
  test_answers.append(p)
  print(
    f'Test {test_no}',
    f'Question: {q}',
    f'Answer:   {a}',
    f'Predict:  {p}',
    sep='\n',
  )
  print()

Save test result into file

In [None]:
# Pair each test case with its prediction and write the records to result.json
# as indented, non-ASCII-escaped JSON.
test_result = [
  {'question': case[0], 'truth': case[1], 'answer': predicted}
  for case, predicted in zip(tests, test_answers)
]
with open('result.json', 'w', encoding='UTF-8') as f:
  json.dump(test_result, f, ensure_ascii=False, indent = 2)