In [None]:
# Mount Google Drive at /content/drive; every data path used later in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import math
import numpy as np
from multiprocessing import Pool, cpu_count

"""
All of these algorithms have been taken from the paper:
Trotman et al., Improvements to BM25 and Language Models Examined
Here we implement all the BM25 variations mentioned. 
"""


class BM25:
    """Common indexing logic shared by the BM25 variants.

    The constructor builds per-document term frequencies, document lengths
    and the average document length, then delegates the idf computation to
    the subclass through ``_calc_idf``. Scoring (``get_scores`` /
    ``get_batch_scores``) must also be provided by the subclass.

    :param corpus: sized collection of documents. Each document is a list of
        tokens, or a raw item accepted by ``tokenizer`` when one is given.
    :param tokenizer: optional callable applied to every document in a
        process pool; it has to be picklable for that to work.
    :raises ZeroDivisionError: if ``corpus`` is empty.
    """

    def __init__(self, corpus, tokenizer=None):
        self.corpus_size = len(corpus)
        self.avgdl = 0          # average document length, in tokens
        self.doc_freqs = []     # per document: token -> occurrences
        self.idf = {}           # token -> idf weight, filled by _calc_idf
        self.doc_len = []       # per document: number of tokens
        self.tokenizer = tokenizer

        if tokenizer:
            corpus = self._tokenize_corpus(corpus)

        nd = self._initialize(corpus)
        self._calc_idf(nd)

    def _initialize(self, corpus):
        """Fill ``doc_freqs``, ``doc_len`` and ``avgdl``.

        :return: dict mapping each token to the number of documents that
            contain it.
        """
        nd = {}  # word -> number of documents with word
        num_doc = 0  # total number of tokens over the whole corpus
        for document in corpus:
            self.doc_len.append(len(document))
            num_doc += len(document)

            frequencies = {}
            for word in document:
                frequencies[word] = frequencies.get(word, 0) + 1
            self.doc_freqs.append(frequencies)

            # Each distinct word of the document counts once.
            for word in frequencies:
                nd[word] = nd.get(word, 0) + 1

        self.avgdl = num_doc / self.corpus_size
        return nd

    def _tokenize_corpus(self, corpus):
        """Tokenize every document of ``corpus`` in parallel worker processes."""
        # The context manager shuts the workers down once map() has returned;
        # without it every index construction leaves a pool of idle processes.
        with Pool(cpu_count()) as pool:
            return pool.map(self.tokenizer, corpus)

    def _calc_idf(self, nd):
        """Compute ``self.idf`` from the document counts ``nd`` (subclass hook)."""
        raise NotImplementedError()

    def get_scores(self, query):
        """Score every indexed document against the tokenized ``query``."""
        raise NotImplementedError()

    def get_batch_scores(self, query, doc_ids):
        """Score only the documents whose indices are in ``doc_ids``."""
        raise NotImplementedError()

    def get_top_n(self, query, documents, n=5):
        """Return the ``n`` entries of ``documents`` with the highest scores.

        :param query: tokenized query.
        :param documents: sequence index-aligned with the indexed corpus; its
            items are returned as-is, best score first.
        :param n: maximum number of items to return.
        """
        assert self.corpus_size == len(documents), "The documents given don't match the index corpus!"

        scores = self.get_scores(query)
        top_n = np.argsort(scores)[::-1][:n]
        return [documents[i] for i in top_n]


class BM25Okapi(BM25):
    """Okapi BM25 with a floor applied to negative idf values.

    :param k1: term-frequency saturation parameter.
    :param b: document-length normalization parameter.
    :param epsilon: negative idf values are replaced by
        ``epsilon * average_idf``.
    """

    def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, epsilon=0.25):
        # The base constructor calls _calc_idf, which reads epsilon, so the
        # parameters have to be stored before it runs.
        self.k1 = k1
        self.b = b
        self.epsilon = epsilon
        super().__init__(corpus, tokenizer)

    def _calc_idf(self, nd):
        """
        Compute the idf of every term from its document count.
        Terms whose idf is negative (they occur in more than half of the
        documents) get the value eps * average_idf instead.
        """
        running_total = 0
        below_zero = []
        for term, doc_count in nd.items():
            value = math.log(self.corpus_size - doc_count + 0.5) - math.log(doc_count + 0.5)
            self.idf[term] = value
            running_total += value
            if value < 0:
                below_zero.append(term)
        # The average is taken over the raw values, before any flooring.
        self.average_idf = running_total / len(self.idf)

        floor = self.epsilon * self.average_idf
        self.idf.update(dict.fromkeys(below_zero, floor))

    def get_scores(self, query):
        """
        Score every indexed document against the tokenized query.
        Query terms missing from the index contribute nothing.
        See [Trotman, A., X. Jia, M. Crane, Towards an Efficient and Effective Search Engine] for more info
        :param query: list of query tokens
        :return: numpy array with one score per indexed document
        """
        totals = np.zeros(self.corpus_size)
        lengths = np.array(self.doc_len)
        for term in query:
            tf = np.array([(freqs.get(term) or 0) for freqs in self.doc_freqs])
            weight = self.idf.get(term) or 0
            numerator = tf * (self.k1 + 1)
            denominator = tf + self.k1 * (1 - self.b + self.b * lengths / self.avgdl)
            totals += weight * (numerator / denominator)
        return totals

    def get_batch_scores(self, query, doc_ids):
        """
        Calculate bm25 scores between query and subset of all docs
        :param doc_ids: list of indices into the indexed corpus
        :return: plain list with one score per entry of doc_ids
        """
        assert all(di < len(self.doc_freqs) for di in doc_ids)
        totals = np.zeros(len(doc_ids))
        lengths = np.array(self.doc_len)[doc_ids]
        for term in query:
            tf = np.array([(self.doc_freqs[di].get(term) or 0) for di in doc_ids])
            weight = self.idf.get(term) or 0
            numerator = tf * (self.k1 + 1)
            denominator = tf + self.k1 * (1 - self.b + self.b * lengths / self.avgdl)
            totals += weight * (numerator / denominator)
        return totals.tolist()


class BM25L(BM25):
    """BM25L: BM25 with a shifted, length-normalized term frequency.

    Per query term t and document d the contribution is

        idf(t) * (k1 + 1) * (ctd + delta) / (k1 + ctd + delta)

    with ``ctd = tf / (1 - b + b * len(d) / avgdl)``, and 0 when the term
    does not occur in the document.

    :param k1: term-frequency saturation parameter.
    :param b: document-length normalization parameter.
    :param delta: shift added to the normalized term frequency.
    """

    def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, delta=0.5):
        # Algorithm specific parameters; stored before the base constructor
        # runs because it calls back into this class.
        self.k1 = k1
        self.b = b
        self.delta = delta
        super().__init__(corpus, tokenizer)

    def _calc_idf(self, nd):
        """Set idf(t) = log(N + 1) - log(df_t + 0.5) for every indexed term."""
        for word, freq in nd.items():
            idf = math.log(self.corpus_size + 1) - math.log(freq + 0.5)
            self.idf[word] = idf

    def _score(self, query, doc_freqs, doc_len):
        """Accumulate BM25L scores of ``query`` over the given documents.

        :param doc_freqs: list of token -> count dicts, one per document.
        :param doc_len: numpy array of document lengths aligned with
            ``doc_freqs``.
        """
        score = np.zeros(len(doc_freqs))
        for q in query:
            q_freq = np.array([(doc.get(q) or 0) for doc in doc_freqs])
            ctd = q_freq / (1 - self.b + self.b * doc_len / self.avgdl)
            # The term frequency is already folded into ctd, so it must not
            # be multiplied in a second time.
            term_score = (self.idf.get(q) or 0) * (self.k1 + 1) * (ctd + self.delta) / \
                         (self.k1 + ctd + self.delta)
            # Only documents that actually contain the term are credited.
            score += np.where(q_freq > 0, term_score, 0.0)
        return score

    def get_scores(self, query):
        """Return a numpy array with the score of every indexed document."""
        return self._score(query, self.doc_freqs, np.array(self.doc_len))

    def get_batch_scores(self, query, doc_ids):
        """
        Calculate bm25 scores between query and subset of all docs

        :param doc_ids: list of indices into the indexed corpus
        :return: plain list with one score per entry of doc_ids
        """
        assert all(di < len(self.doc_freqs) for di in doc_ids)
        doc_len = np.array(self.doc_len)[doc_ids]
        doc_freqs = [self.doc_freqs[di] for di in doc_ids]
        return self._score(query, doc_freqs, doc_len).tolist()


class BM25Plus(BM25):
    """BM25+: BM25 whose per-term component is shifted by ``delta``.

    :param k1: term-frequency saturation parameter.
    :param b: document-length normalization parameter.
    :param delta: constant added to the term-frequency component.
    """

    def __init__(self, corpus, tokenizer=None, k1=1.5, b=0.75, delta=1):
        # Store the variant's parameters first, then let the base class
        # build the index.
        self.k1 = k1
        self.b = b
        self.delta = delta
        super().__init__(corpus, tokenizer)

    def _calc_idf(self, nd):
        """Set idf(t) = log((N + 1) / df_t) for every indexed term."""
        for term, doc_count in nd.items():
            self.idf[term] = math.log((self.corpus_size + 1) / doc_count)

    def get_scores(self, query):
        """Return a numpy array with the score of every indexed document."""
        totals = np.zeros(self.corpus_size)
        lengths = np.array(self.doc_len)
        for term in query:
            tf = np.array([(freqs.get(term) or 0) for freqs in self.doc_freqs])
            weight = self.idf.get(term) or 0
            numerator = tf * (self.k1 + 1)
            denominator = self.k1 * (1 - self.b + self.b * lengths / self.avgdl) + tf
            totals += weight * (self.delta + numerator / denominator)
        return totals

    def get_batch_scores(self, query, doc_ids):
        """
        Calculate bm25 scores between query and subset of all docs

        :param doc_ids: list of indices into the indexed corpus
        :return: plain list with one score per entry of doc_ids
        """
        assert all(di < len(self.doc_freqs) for di in doc_ids)
        totals = np.zeros(len(doc_ids))
        lengths = np.array(self.doc_len)[doc_ids]
        for term in query:
            tf = np.array([(self.doc_freqs[di].get(term) or 0) for di in doc_ids])
            weight = self.idf.get(term) or 0
            numerator = tf * (self.k1 + 1)
            denominator = self.k1 * (1 - self.b + self.b * lengths / self.avgdl) + tf
            totals += weight * (self.delta + numerator / denominator)
        return totals.tolist()

In [None]:
import json

def tokenize(n_gram, sentence):
    """Return the word n-grams of ``sentence``.

    The sentence is split on whitespace and every run of ``n_gram``
    consecutive words is joined with a single space.

    :param n_gram: number of words per n-gram.
    :param sentence: text to tokenize.
    :return: list of n-gram strings in order of appearance; empty when the
        sentence has fewer than ``n_gram`` words.
    """
    words = sentence.split()
    # A sentence of k words has k - n + 1 n-grams; the "+ 1" keeps the one
    # that ends on the last word.
    return [" ".join(words[i:i + n_gram]) for i in range(len(words) - n_gram + 1)]
def multitokenize(n_grams, sentence):
    """Return the word n-grams of ``sentence`` for several n-gram sizes.

    The sentence is split on whitespace; for each size in ``n_grams`` (in the
    given order) every run of that many consecutive words is joined with a
    single space and appended to the result.

    :param n_grams: iterable of n-gram sizes, e.g. ``[1]`` or ``[1, 2]``.
    :param sentence: text to tokenize.
    :return: list of n-gram strings, grouped by size.
    """
    words = sentence.split()
    to_ret = []
    for n_gram in n_grams:
        # k words yield k - n + 1 n-grams; the "+ 1" keeps the one that ends
        # on the last word.
        for i in range(len(words) - n_gram + 1):
            to_ret.append(" ".join(words[i:i + n_gram]))

    return to_ret

docs_list = []      # tokenized articles, index-aligned with corpus_label
keep_track = {}     # "<law_id>###<article_id>" -> raw article dict
corpus_label = []   # "<law_id>###<article_id>" of every indexed article
data_path = "/content/drive/MyDrive/Zalo 2021/legal_corpus.json"
# Read with an explicit encoding (the notebook writes its outputs as utf8)
# instead of relying on the platform default.
with open(data_path, encoding="utf-8") as json_file:
    data = json.load(json_file)


for passage in data:
    for article in passage['articles']:
        article_key = passage['law_id'] + "###" + article['article_id']
        # The separator keeps the last word of the title and the first word
        # of the text from being fused into a single token.
        docs_list.append(multitokenize([1], article['title'] + " " + article['text']))
        keep_track[article_key] = article
        corpus_label.append(article_key)

# Index over all articles; scores come back in the order of corpus_label.
bm25 = BM25Plus(docs_list)


In [None]:
def test_bm25(bm25, corpus, questions, top_k):
    """Measure how often the labelled article is retrieved in the top ``top_k``.

    :param bm25: index exposing ``get_top_n(query_tokens, documents, n=...)``.
    :param corpus: labels index-aligned with the documents of ``bm25``.
    :param questions: sequence of ``[question_text, label]`` pairs.
    :param top_k: number of results retrieved per question.
    :return: number of retrieved labels equal to the expected one, divided by
        the number of questions; ``0.0`` when ``questions`` is empty.
    """
    if len(questions) == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return 0.0

    true_positive = 0
    for count, question in enumerate(questions):
        # Progress report every 50 questions.
        if count % 50 == 0:
            print("{} complete".format(count))
        ques = question[0]
        label = question[1]
        top_k_docs = bm25.get_top_n(multitokenize([1], ques), corpus, n=top_k)
        true_positive += sum(1 for doc in top_k_docs if doc == label)
    return true_positive / len(questions)

In [None]:
questions = []  # [question_text, "<law_id>###<article_id>"] pairs
data_path = "/content/drive/MyDrive/Zalo 2021/train_question_answer.json"
# Explicit encoding: do not depend on the platform default when reading.
with open(data_path, encoding="utf-8") as json_file:
    questions_json = json.load(json_file)
for question in questions_json['items']:
    # Only the first listed relevant article is used as the label.
    first_relevant = question["relevant_articles"][0]
    label = first_relevant['law_id'] + "###" + first_relevant['article_id']
    q = question['question']
    questions.append([q, label])

In [None]:
# Evaluate the index on the training questions with the top 5 results per
# question. Uncomment the next line to run on the first 200 questions only.
# questions = questions[:200]
test_bm25(bm25, corpus_label, questions, 5)

0 complete
50 complete
100 complete
150 complete
200 complete
250 complete
300 complete
350 complete
400 complete
450 complete
500 complete
550 complete
600 complete
650 complete
700 complete
750 complete
800 complete
850 complete
900 complete
950 complete
1000 complete
1050 complete
1100 complete
1150 complete
1200 complete
1250 complete
1300 complete
1350 complete
1400 complete
1450 complete
1500 complete
1550 complete
1600 complete
1650 complete
1700 complete
1750 complete
1800 complete
1850 complete
1900 complete
1950 complete
2000 complete
2050 complete
2100 complete
2150 complete
2200 complete
2250 complete
2300 complete
2350 complete
2400 complete


# **Extracting Training Data**

In [None]:
def extract_top_k_for_training(bm25, corpus, question, top_k):
    """Retrieve the ``top_k`` best-scoring articles for ``question``.

    :param bm25: index exposing ``get_top_n(query_tokens, documents, n=...)``.
    :param corpus: "<law_id>###<article_id>" labels index-aligned with the
        documents of ``bm25``.
    :param question: raw question text.
    :param top_k: number of articles to return.
    :return: list of dicts with the keys ``law_id``, ``article_id``,
        ``title`` and ``text``, best match first.
    """
    ranked_keys = bm25.get_top_n(multitokenize([1], question), corpus, n=top_k)

    results = []
    for key in ranked_keys:
        # The label encodes both identifiers around the "###" separator.
        pieces = key.split("###")
        results.append({
            'law_id': pieces[0],
            'article_id': pieces[1],
            'title': keep_track[key]['title'],
            'text': keep_track[key]['text'],
        })
    return results

In [None]:
import json
data_path = '/content/drive/MyDrive/Zalo 2021/train_question_answer.json'
# Explicit encoding: do not depend on the platform default when reading.
with open(data_path, encoding='utf-8') as json_file:
    questions = json.load(json_file)
# Number of training questions (printed, then also shown as the cell value).
count_ques = len(questions['items'])
print(count_ques)
len(questions['items'])

3196


3196

In [None]:
import random

data_path = '/content/drive/MyDrive/Zalo 2021/legal_corpus.json'
# Explicit encoding: do not depend on the platform default when reading.
with open(data_path, encoding='utf-8') as json_file:
    corpus = json.load(json_file)

# Build (question, article, label) samples: label 1 for the first listed
# relevant article, label 0 for a random ~20% of the BM25 top-10 articles that
# are not relevant. Only articles shorter than 400 words are kept.
# NOTE: extract_top_k_for_testing is defined in the "Public Test" section
# further down; that cell has to be executed before this one.
to_save = []
count = 0
false_ques_count, true_ques_count = 0, 0
for question in questions['items']:
    if count % 50 == 0:
        print("{} completed".format(count))
    count += 1

    gold = question['relevant_articles'][0]
    # Every article listed as relevant; none of them may become a negative.
    relevant_ids = {
        (relevant['law_id'], relevant['article_id'])
        for relevant in question['relevant_articles']
    }

    # Positive sample: look the first relevant article up in the corpus.
    for article in corpus:
        if article['law_id'] != gold['law_id']:
            continue
        for passage in article['articles']:
            if passage['article_id'] != gold['article_id']:
                continue
            if len(passage['text'].split()) < 400:
                true_ques_count += 1
                true_ques = {}
                true_ques['question'] = question['question']
                true_ques['article'] = passage
                true_ques['label'] = 1
                to_save.append(true_ques)
                break

    # Negative samples: highly ranked articles that are not relevant.
    negative_sample_list = extract_top_k_for_testing(bm25, corpus_label, question['question'], 10)
    for article in negative_sample_list:
        if (article['law_id'], article['article_id']) in relevant_ids:
            continue
        if random.random() < 0.2:
            if len(article['text'].split()) < 400:
                false_ques_count += 1
                false_ques = {}
                false_ques['question'] = question['question']
                false_ques['article'] = article
                false_ques['label'] = 0
                to_save.append(false_ques)

print(true_ques_count, false_ques_count)

In [None]:
import codecs
import os
dest_path = '/content/drive/MyDrive/Zalo 2021/Training Data'
today = 'training_draft_bm25_vers_short.json'

# Dump the collected samples as one JSON document; ensure_ascii=False keeps
# non-ASCII characters readable in the file instead of \u escapes.
file_path = os.path.join(dest_path, today)
with open(file_path, "w", encoding='utf8') as outfile:
    json.dump(to_save, outfile, ensure_ascii=False)

# **Public Test**

In [None]:
def extract_top_k_for_testing(bm25, corpus, question, top_k, max_words=400, top_paragraphs=15):
    """Retrieve the ``top_k`` best articles for ``question``, shortening long ones.

    Articles with fewer than ``max_words`` whitespace-separated words are
    returned unchanged. Longer articles are split on newlines, the paragraphs
    are ranked against the question with a throw-away BM25Plus index, and the
    text is replaced by the ``top_paragraphs`` best paragraphs (best first,
    whitespace collapsed to single spaces, joined with newlines).

    :param bm25: index exposing ``get_top_n(query_tokens, documents, n=...)``.
    :param corpus: "<law_id>###<article_id>" labels index-aligned with the
        documents of ``bm25``.
    :param question: raw question text.
    :param top_k: number of articles to return.
    :param max_words: word count from which an article is shortened.
    :param top_paragraphs: number of paragraphs kept for a shortened article.
    :return: list of dicts with the keys ``law_id``, ``article_id``,
        ``title`` and ``text``, best match first.
    """
    # Tokenize the question once; it is reused for the paragraph ranking.
    query_tokens = multitokenize([1], question)
    top_k_docs = bm25.get_top_n(query_tokens, corpus, n=top_k)

    to_ret_list = []
    for score in top_k_docs:
        source = keep_track[score]
        text = source['text']
        if len(text.split()) >= max_words:
            paragraphs = text.split('\n')
            para_list = [multitokenize([1], para) for para in paragraphs]
            smallbm25 = BM25Plus(para_list)
            # Rank the raw paragraphs (index-aligned with para_list) rather
            # than rebuilding text from tokens, so the stored text never
            # depends on what the tokenizer emits or drops.
            top_paras = smallbm25.get_top_n(query_tokens, paragraphs, n=top_paragraphs)
            text = "\n".join(" ".join(para.split()) for para in top_paras)

        article = {}
        article['law_id'] = score.split("###")[0]
        article['article_id'] = score.split("###")[1]
        article['title'] = source['title']
        article['text'] = text
        to_ret_list.append(article)

    return to_ret_list

In [None]:
import json
test = []  # [question_id, question_text] pairs of the public test set
data_path = "/content/drive/MyDrive/Zalo 2021/public_test_question.json"
# Explicit encoding: do not depend on the platform default when reading.
with open(data_path, encoding="utf-8") as json_file:
    questions = json.load(json_file)
for question in questions['items']:
    # Named question_id so the builtin id() is not shadowed at module level.
    question_id = question['question_id']
    q = question['question']
    test.append([question_id, q])

In [None]:
# Retrieve the 10 best articles for every public-test question and collect
# the results for export.
to_save = []
count = 0
for question in test:
    # Progress report every 50 questions.
    if count % 50 == 0:
        print(count)
    count += 1

    # Each entry of `test` is [question_id, question_text].
    k_docs = extract_top_k_for_testing(bm25, corpus_label, question[1], 10)
    question_dict = {
        'question': question[1],
        'id': question[0],
        'relevant_articles': k_docs,
    }
    to_save.append(question_dict)

0
50
100
150
200
250
300
350
400
450
500


In [None]:
import codecs
import os
dest_path = '/content/drive/MyDrive/Zalo 2021/Training Data'
today = 'testing_bm25_top10_vers_short.json'

# Dump the retrieval results as one JSON document; ensure_ascii=False keeps
# non-ASCII characters readable in the file instead of \u escapes.
file_path = os.path.join(dest_path, today)
with open(file_path, "w", encoding='utf8') as outfile:
    json.dump(to_save, outfile, ensure_ascii=False)