In [None]:
# Mount Google Drive at /content/drive; the data files read and written by the
# later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys
import os

import math
class TfIdf:
    """Small in-memory term index with a frequency-ratio similarity score.

    Attributes:
        weighted: flag initialised to ``False``; not read anywhere in this class.
        documents: list of ``[doc_name, {term: normalised_count}]`` pairs, kept
            in insertion order.
        corpus_dict: ``{term: total occurrences}`` accumulated over every
            document added so far.
    """

    def __init__(self):
        self.weighted = False
        self.documents = []
        self.corpus_dict = {}

    def add_document(self, doc_name, list_of_words):
        """Index one document given as a list of terms.

        Each term's raw count is divided by ``log(len(list_of_words) + 1)`` and
        the resulting dictionary is appended to ``self.documents``. The raw
        counts are also added to the corpus-wide totals in ``self.corpus_dict``.
        """
        term_counts = {}
        for term in list_of_words:
            term_counts[term] = term_counts.get(term, 0.) + 1.0
            self.corpus_dict[term] = self.corpus_dict.get(term, 0.0) + 1.0

        # Log-length normaliser; for an empty document it is 0.0, but then there
        # are no terms to divide, so nothing blows up.
        norm = math.log(len(list_of_words) + 1)
        normalised = {term: raw / norm for term, raw in term_counts.items()}

        self.documents.append([doc_name, normalised])

    def similarities(self, list_of_words):
        """Score every indexed document against a query.

        Returns a list of ``[doc_name, similarity_score]`` pairs, one per
        document, in the order the documents were added. For each query term
        that also occurs in the document, the score grows by
        ``query_count / corpus_count + doc_weight / corpus_count``.
        """
        # Raw term counts of the query (the query is not length-normalised).
        query_counts = {}
        for term in list_of_words:
            query_counts[term] = query_counts.get(term, 0.0) + 1.0

        results = []
        for entry in self.documents:
            doc_terms = entry[1]
            total = 0.0
            for term, q_count in query_counts.items():
                if term not in doc_terms:
                    continue
                total += (q_count / self.corpus_dict[term]) + (doc_terms[term] / self.corpus_dict[term])
            results.append([entry[0], total])

        return results

In [None]:
import json

def tokenize(n_gram, sentence):
    """Split ``sentence`` on whitespace and return its word n-grams.

    Parameters:
        n_gram: int, number of consecutive words per token (expected >= 1)
        sentence: str, text to tokenize
    Return:
        List of strings, each made of ``n_gram`` consecutive words joined by a
        single space, in order of appearance. A sentence with fewer than
        ``n_gram`` words yields an empty list.
    """
    words = sentence.split()
    # `+ 1` so that the last window (the one ending on the final word) is kept;
    # without it the final n-gram of every sentence was silently dropped.
    return [" ".join(words[i:i + n_gram]) for i in range(len(words) - n_gram + 1)]
def multitokenize(n_grams, sentence):
    """Return the word n-grams of ``sentence`` for several n-gram sizes.

    Parameters:
        n_grams: iterable of ints, the n-gram sizes to generate (each expected >= 1)
        sentence: str, text to tokenize (split on whitespace)
    Return:
        List of strings: all n-grams of the first size in order of appearance,
        then all n-grams of the second size, and so on. Sizes larger than the
        number of words contribute nothing.
    """
    words = sentence.split()
    tokens = []
    for size in n_grams:
        # `+ 1` so that the window ending on the final word is included;
        # without it the last n-gram of every size was silently dropped.
        for start in range(len(words) - size + 1):
            tokens.append(" ".join(words[start:start + size]))

    return tokens
# Build the n-gram index over every article of the legal corpus.
table = TfIdf()
# Maps "<law_id>###<article_id>" -> the article dict, used later to look up title/text.
keep_track = {}
data_path = "/content/drive/MyDrive/Zalo 2021/legal_corpus.json"
# Decode explicitly as UTF-8 instead of relying on the platform default encoding.
with open(data_path, encoding="utf-8") as json_file:
    data = json.load(json_file)
for passage in data:
    for article in passage['articles']:
        doc_key = passage['law_id'] + "###" + article['article_id']
        # Join title and text with a space so the last word of the title and the
        # first word of the body are not fused into a single token.
        # NOTE(review): `[2+3]` evaluates to `[5]` (5-grams only); if 2- and
        # 3-grams were intended this should be `[2, 3]` — confirm.
        table.add_document(doc_key, multitokenize([2+3], article['title'] + " " + article['text']))
        keep_track[doc_key] = article

In [None]:
# import gensim.downloader as api
# from gensim.models import TfidfModel
# from gensim.corpora import Dictionary
# from gensim import similarities

# dct = Dictionary(dataset)  # fit dictionary
# corpus = [dct.doc2bow(line) for line in dataset]  # convert corpus to BoW format
# model = TfidfModel(corpus)  # fit model
# vector = model[corpus[0]]  # apply model to the first corpus document
# index = similarities.MatrixSimilarity(model[corpus])

In [None]:
sentence = 'Công an xã xử phạt lỗi không mang bằng lái xe có đúng không'
# The query has to be tokenized with the same n-gram size as the indexed
# documents (the index is built with `[2+3]`, i.e. 5-grams). 3-gram query
# tokens can never equal a 5-gram key, which made every score come out as 0.0.
sim_list = table.similarities(multitokenize([2+3], sentence))
sim_list.sort(key=lambda x:x[1], reverse = True)
# Show only the best matches; the full list holds one entry per indexed article.
print(sim_list[:10])

[['01/2009/tt-bnn###1', 0.0], ['01/2009/tt-bnn###2', 0.0], ['01/2009/tt-bnn###3', 0.0], ['01/2009/tt-bnn###4', 0.0], ['01/2009/tt-bnn###5', 0.0], ['01/2009/tt-bnn###6', 0.0], ['01/2009/tt-bnn###7', 0.0], ['01/2009/tt-bnn###8', 0.0], ['01/2009/tt-bnn###9', 0.0], ['01/2009/tt-bnn###10', 0.0], ['01/2009/tt-bnn###11', 0.0], ['01/2009/tt-bnn###12', 0.0], ['01/2009/tt-bnn###13', 0.0], ['01/2009/tt-bnn###14', 0.0], ['01/2009/tt-bnn###15', 0.0], ['01/2009/tt-bnn###16', 0.0], ['01/2009/tt-bnn###17', 0.0], ['01/2010/tt-bng###1', 0.0], ['01/2010/tt-bng###2', 0.0], ['01/2010/tt-bng###3', 0.0], ['01/2010/tt-bng###4', 0.0], ['01/2010/tt-bng###5', 0.0], ['01/2010/tt-bng###6', 0.0], ['01/2010/tt-bng###7', 0.0], ['01/2010/tt-bng###8', 0.0], ['01/2010/tt-bng###9', 0.0], ['01/2010/tt-bng###10', 0.0], ['01/2010/tt-bng###11', 0.0], ['01/2010/tt-bng###12', 0.0], ['01/2010/tt-bng###13', 0.0], ['01/2010/tt-bng###14', 0.0], ['01/2010/tt-bng###15', 0.0], ['01/2010/tt-bng###16', 0.0], ['01/2010/tt-bng###17', 0.0

In [None]:
# Print the entry for article '47/2011/tt-bca###7' together with its 1-based
# position in `sim_list`.
count = 0
for count, arti in enumerate(sim_list, start=1):
    if arti[0] != '47/2011/tt-bca###7':
        continue
    print(arti)
    print(count)

['47/2011/tt-bca###7', 0.0]
45559


In [None]:
def test_tf_idf(table, questions, top_k):
    """Measure top-k retrieval accuracy of ``table`` on labelled questions.

    Parameters:
        table: index exposing ``similarities(list_of_words)``, which returns a
            list of ``[doc_id, score]`` pairs
        questions: sequence of ``[question_text, label]`` pairs, where ``label``
            is the doc id expected to be retrieved
        top_k: int, number of highest-scoring documents inspected per question
    Return:
        float, fraction of questions whose label is among the ``top_k``
        best-scoring documents (0.0 when ``questions`` is empty)
    """
    if not questions:
        # Nothing to evaluate; avoid dividing by zero below.
        return 0.0

    true_positive = 0
    for count, question in enumerate(questions):
        if count % 50 == 0:
            print("{} complete".format(count))
        ques = question[0]
        label = question[1]
        sim_scores = sorted(
            table.similarities(multitokenize([2+3], ques)),
            key=lambda x: x[1],
            reverse=True,
        )
        # Inspect exactly `top_k` candidates; the former `top_k+1` slice looked
        # at one document too many and inflated the score.
        if any(score[0] == label for score in sim_scores[:top_k]):
            true_positive += 1
    return true_positive / len(questions)


In [None]:
# Build the evaluation set: one [question_text, label] pair per training
# question, where label is "<law_id>###<article_id>" of the FIRST listed
# relevant article (any further relevant articles are ignored here).
corpus = []
data_path = "/content/drive/MyDrive/Zalo 2021/train_question_answer.json"
# Decode explicitly as UTF-8 instead of relying on the platform default encoding.
with open(data_path, encoding="utf-8") as json_file:
    questions = json.load(json_file)
for question in questions['items']:
    first_relevant = question["relevant_articles"][0]
    label = first_relevant['law_id'] + "###" + first_relevant['article_id']
    q = question['question']
    corpus.append([q, label])


In [None]:
# corpus = corpus[:200]

In [None]:
# Evaluate retrieval on the labelled [question, label] pairs with top_k=10 and
# print the value returned by test_tf_idf.
print(test_tf_idf(table, corpus, 10))

0 complete
50 complete
100 complete
150 complete
200 complete
250 complete
300 complete
350 complete
400 complete
450 complete
500 complete
550 complete
600 complete
650 complete
700 complete
750 complete
800 complete
850 complete
900 complete
950 complete
1000 complete
1050 complete
1100 complete
1150 complete
1200 complete
1250 complete
1300 complete
1350 complete
1400 complete
1450 complete
1500 complete
1550 complete
1600 complete
1650 complete
1700 complete
1750 complete
1800 complete
1850 complete
1900 complete
1950 complete
2000 complete
2050 complete
2100 complete
2150 complete
2200 complete
2250 complete
2300 complete
2350 complete
2400 complete
2450 complete
2500 complete
2550 complete
2600 complete
2650 complete
2700 complete
2750 complete
2800 complete
2850 complete
2900 complete
2950 complete
3000 complete
3050 complete
3100 complete
3150 complete
0.6670838548185232


In [None]:
def extract_top_k(scores, top_k):
    """
    Return the ids of the highest-scoring passages.

    Parameters:
        scores: List of lists [passage_id, passage_score]
        top_k: int, the number of passages to extract
    Return:
        related_passages: List of the ids of the `top_k` most related passages,
            best first (fewer when `scores` has fewer than `top_k` entries).
            `scores` itself is left unmodified.

    NOTE: later definitions in this notebook reuse the name `extract_top_k`
    and therefore shadow this function.
    """
    # `sorted` is a builtin (lists have no `.sorted` method). It returns a new
    # list and is stable, so equal scores keep their input order.
    ranked = sorted(scores, key=lambda x: x[1], reverse=True)
    # Exactly `top_k` entries (the former `top_k+1` slice returned one extra).
    return [passage_id for passage_id, _ in ranked[:top_k]]

def extract_top_k(scores, top_k):
    """
    Return the ids of the highest-scoring passages by repeated max-selection.

    Parameters:
        scores: List of lists [passage_id, passage_score]
        top_k: int, the number of passages to extract
    Return:
        related_passages: List of the ids of the `top_k` most related passages,
            best first (fewer when `scores` has fewer than `top_k` entries).
            `scores` itself is left unmodified.

    NOTE: a later cell in this notebook redefines `extract_top_k` with a
    different signature and therefore shadows this function.
    """
    # Work on a copy so the caller's list is not emptied as passages are picked.
    remaining = list(scores)
    related_passages = []
    # Never try to pick more passages than there are.
    for _ in range(min(top_k, len(remaining))):
        # `max` returns the first maximal index, so ties go to the earliest
        # entry; comparing real scores also handles values below -1.
        best_index = max(range(len(remaining)), key=lambda i: remaining[i][1])
        related_passages.append(remaining.pop(best_index)[0])
    return related_passages

In [None]:
def extract_top_k(table, questions, top_k, n_grams=(3, 4)):
    """Retrieve the best-matching articles for each question.

    Parameters:
        table: index exposing ``similarities(list_of_words)``, which returns a
            list of ``[doc_id, score]`` pairs with doc ids of the form
            "<law_id>###<article_id>"
        questions: iterable of ``[question_id, question_text]`` pairs
        top_k: int, number of articles kept per question
        n_grams: iterable of ints, word n-gram sizes used to tokenize each
            question (defaults to 3- and 4-grams)
    Return:
        List with one dict per question, holding the keys 'question', 'id' and
        'relevant_articles'; the latter is a list of dicts with 'law_id',
        'article_id', 'title' and 'text', best match first. Titles and texts
        are looked up in the module-level ``keep_track`` mapping.

    NOTE(review): query tokens only match an index built with the same n-gram
    sizes; the indexing cell of this notebook uses `[2+3]` — confirm which
    sizes are intended and pass them via ``n_grams`` if they differ.
    """
    to_ret = []
    for count, question in enumerate(questions):
        if count % 50 == 0:
            print("{} complete".format(count))
        ques = question[1]
        # multitokenize takes (n_grams, sentence); the former call
        # `multitokenize(3, 4, ques)` did not match that signature.
        sim_scores = sorted(
            table.similarities(multitokenize(n_grams, ques)),
            key=lambda x: x[1],
            reverse=True,
        )
        relevant_articles = []
        # Exactly `top_k` articles (the former `top_k+1` slice kept one extra).
        for score in sim_scores[:top_k]:
            doc_id = score[0]
            id_parts = doc_id.split("###")
            relevant_articles.append({
                'law_id': id_parts[0],
                'article_id': id_parts[1],
                'title': keep_track[doc_id]['title'],
                'text': keep_track[doc_id]['text'],
            })
        to_ret.append({
            'question': ques,
            'id': question[0],
            'relevant_articles': relevant_articles,
        })
    return to_ret

In [None]:
# Load the public test questions as [question_id, question_text] pairs.
test = []
data_path = "/content/drive/MyDrive/Zalo 2021/public_test_question.json"
# Decode explicitly as UTF-8 instead of relying on the platform default encoding.
with open(data_path, encoding="utf-8") as json_file:
    questions = json.load(json_file)
for question in questions['items']:
    # No intermediate variable named `id`: at cell level it would shadow the
    # builtin `id()` for the rest of the notebook.
    test.append([question['question_id'], question['question']])

In [None]:
# Retrieve candidate articles for every public-test question (top_k=10); the
# result is dumped to a JSON file by the next cell.
to_save = extract_top_k(table, test, 10)

0 complete
50 complete
100 complete
150 complete
200 complete
250 complete
300 complete
350 complete
400 complete
450 complete
500 complete


In [None]:
import codecs
import os
dest_path = '/content/drive/MyDrive/Zalo 2021/Training Data'
today = 'testing_tf_idf_34_gram_top10.json'  # output file name

file_path = os.path.join(dest_path, today)
# The builtin open() with an explicit encoding replaces the legacy codecs.open();
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
with open(file_path, "w", encoding='utf8') as outfile:
    json.dump(to_save, outfile, ensure_ascii=False)

# **Extract Training Data**

In [None]:
def extract_top_k_for_training(table, question, top_k, n_grams=(3, 4)):
    """Retrieve the best-matching articles for a single question.

    Parameters:
        table: index exposing ``similarities(list_of_words)``, which returns a
            list of ``[doc_id, score]`` pairs with doc ids of the form
            "<law_id>###<article_id>"
        question: str, the question text
        top_k: int, number of articles to return
        n_grams: iterable of ints, word n-gram sizes used to tokenize the
            question (defaults to 3- and 4-grams)
    Return:
        List of at most ``top_k`` dicts with the keys 'law_id', 'article_id',
        'title' and 'text', best match first. Titles and texts are looked up
        in the module-level ``keep_track`` mapping.

    NOTE(review): query tokens only match an index built with the same n-gram
    sizes; the indexing cell of this notebook uses `[2+3]` — confirm which
    sizes are intended and pass them via ``n_grams`` if they differ.
    """
    # multitokenize takes (n_grams, sentence); the former call
    # `multitokenize(3, 4, ques)` did not match that signature.
    sim_scores = sorted(
        table.similarities(multitokenize(n_grams, question)),
        key=lambda x: x[1],
        reverse=True,
    )
    to_ret_list = []
    # Exactly `top_k` articles (the former `top_k+1` slice kept one extra).
    for score in sim_scores[:top_k]:
        doc_id = score[0]
        id_parts = doc_id.split("###")
        to_ret_list.append({
            'law_id': id_parts[0],
            'article_id': id_parts[1],
            'title': keep_track[doc_id]['title'],
            'text': keep_track[doc_id]['text'],
        })

    return to_ret_list

In [None]:
import json
data_path = '/content/drive/MyDrive/Zalo 2021/train_question_answer.json'
# Decode explicitly as UTF-8 instead of relying on the platform default encoding.
with open(data_path, encoding="utf-8") as json_file:
    questions = json.load(json_file)
# Sanity check on the number of training questions: printed once, and shown
# again as the cell's value.
count_ques = len(questions['items'])
print(count_ques)
len(questions['items'])

3196


3196

In [None]:
data_path = '/content/drive/MyDrive/Zalo 2021/legal_corpus.json'
# NOTE: this rebinds `corpus`, which an earlier cell used for the list of
# [question, label] evaluation pairs; from here on it is the parsed legal corpus.
# Decode explicitly as UTF-8 instead of relying on the platform default encoding.
with open(data_path, encoding="utf-8") as json_file:
    corpus = json.load(json_file)

In [None]:
import random
# Build the labelled training examples saved by the next cell:
#   * label 1 - the first listed relevant article of each question;
#   * label 0 - a random subset (probability 0.25 each) of the retrieved
#               candidates that are not relevant to the question.
SEED = 42
rng = random.Random(SEED)  # dedicated seeded generator: re-running gives the same negatives
to_save = []
count = 0
false_ques_count, true_ques_count = 0, 0
for question in questions['items']:
    if count %50 == 0:
        print("{} completed".format(count))
    count += 1
    relevant_articles = question['relevant_articles']
    first_relevant = relevant_articles[0]
    for article in corpus:
        if article['law_id'] == first_relevant['law_id']:
            for passage in article['articles']:
                if passage['article_id'] == first_relevant['article_id']:
                    true_ques_count += 1

                    true_ques = {}
                    true_ques['question'] = question['question']
                    true_ques['article'] = passage
                    true_ques['label'] = 1
                    to_save.append(true_ques)
                    break

    # Exclude EVERY relevant article from the negatives, not only the first
    # one, so that a correct answer is never stored with label 0.
    # Assumes each entry has the same 'law_id'/'article_id' keys as entry 0.
    relevant_keys = {(rel['law_id'], rel['article_id']) for rel in relevant_articles}
    negative_sample_list = extract_top_k_for_training(table, question['question'], 10)
    for article in negative_sample_list:
        if (article['law_id'], article['article_id']) in relevant_keys:
            continue
        if rng.random() < 0.25:
            false_ques_count += 1
            false_ques = {}
            false_ques['question'] = question['question']
            false_ques['article'] = article
            false_ques['label'] = 0
            to_save.append(false_ques)

print(true_ques_count, false_ques_count)

In [None]:
import codecs
import os
dest_path = '/content/drive/MyDrive/Zalo 2021/Training Data'
today = 'training_draft_2.json'  # output file name

file_path = os.path.join(dest_path, today)
# The builtin open() with an explicit encoding replaces the legacy codecs.open();
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
with open(file_path, "w", encoding='utf8') as outfile:
    json.dump(to_save, outfile, ensure_ascii=False)