In [None]:
import MeCab
import json
import sentencepiece as sp
import os
import collections
import unicodedata
from transformers import WordpieceTokenizer
from tqdm.notebook import tqdm

In [None]:
# Project root: two directory levels above the current working directory.
# abspath() resolves a relative path against os.getcwd() and collapses the '..' parts.
root_dir = os.path.abspath(os.path.join(os.pardir, os.pardir))


In [None]:
class MeCabSentenceSplitter(object):
    """Callable wrapper around a MeCab tagger configured with ``-O wakati``.

    Args:
        mecab_dict_path: Dictionary directory handed to MeCab via ``-d``.
            Pass ``None`` to omit the ``-d`` option altogether.
    """

    def __init__(self, mecab_dict_path='/usr/local/lib/mecab/dic/mecab-ipadic-neologd'):
        tagger_args = '-O wakati'
        if mecab_dict_path is not None:
            # Prepend the dictionary option only when a path was supplied.
            tagger_args = '-d {} '.format(mecab_dict_path) + tagger_args
        self.mecab = MeCab.Tagger(tagger_args)

    def __call__(self, text):
        """Return the tagger's output for ``text`` with surrounding whitespace stripped."""
        parsed = self.mecab.parse(text)
        return parsed.strip()

In [None]:
def ddqa_copy_format(ori_dic):
    """Build an empty skeleton mirroring the top-level layout of ``ori_dic``.

    The skeleton keeps ``ori_dic['version']`` and the title of the first
    ``data`` entry, and starts with an empty ``paragraphs`` list.

    Args:
        ori_dic: Dict with a ``'version'`` key and a non-empty ``'data'`` list
            whose first element has a ``'title'`` key.

    Returns:
        A new dict ``{'version': ..., 'data': [{'title': ..., 'paragraphs': []}]}``.
    """
    version = ori_dic['version']
    title = ori_dic['data'][0]['title']
    return {
        'version': version,
        'data': [{'title': title, 'paragraphs': []}],
    }

In [None]:
# Locations of the three DDQA RC-QA splits, resolved under the project root.
ddqa_path = os.path.join(root_dir,'data/ddqa/RC-QA')
train_path = os.path.join(ddqa_path,'DDQA-1.0_RC-QA_train.json')
val_path = os.path.join(ddqa_path,'DDQA-1.0_RC-QA_dev.json')
test_path = os.path.join(ddqa_path,'DDQA-1.0_RC-QA_test.json')


def _load_json(path):
    """Parse the UTF-8 JSON file at ``path`` and close the handle afterwards."""
    with open(path, encoding='utf8') as reader:
        return json.load(reader)


train_ori = _load_json(train_path)
val_ori = _load_json(val_path)
test_ori = _load_json(test_path)

# Empty skeletons (version + title, no paragraphs) for the tokenized outputs ...
train_output = ddqa_copy_format(train_ori)
val_output = ddqa_copy_format(val_ori)
test_output = ddqa_copy_format(test_ori)

# ... and for the normalized (untokenized) outputs.
train_norm = ddqa_copy_format(train_ori)
val_norm = ddqa_copy_format(val_ori)
test_norm = ddqa_copy_format(test_ori)

In [None]:
def load_vocab(vocab_file):
    """Read a one-token-per-line vocabulary file into an ordered mapping.

    Args:
        vocab_file: Path to a UTF-8 text file.

    Returns:
        ``collections.OrderedDict`` mapping each stripped line to its
        zero-based line number. Blank lines are kept as the empty-string
        token, and a repeated token ends up with its last line number.
    """
    vocab = collections.OrderedDict()
    with open(vocab_file, encoding='utf8') as reader:
        for index, raw_line in enumerate(reader):
            vocab[raw_line.strip()] = index
    return vocab

# WordPiece tokenizers built so far, keyed by vocabulary path. Re-running this
# cell resets the cache, which is how an edited vocabulary file gets picked up.
_WORDPIECE_TOKENIZERS = {}


def _get_wordpiece_tokenizer(vocab_file):
    """Return the WordpieceTokenizer for ``vocab_file``, building it on first use."""
    tokenizer = _WORDPIECE_TOKENIZERS.get(vocab_file)
    if tokenizer is None:
        # Reading the vocabulary is the expensive part; do it once per path
        # instead of once per context/question/answer.
        tokenizer = WordpieceTokenizer(vocab=load_vocab(vocab_file), unk_token='[UNK]')
        _WORDPIECE_TOKENIZERS[vocab_file] = tokenizer
    return tokenizer


def _wordpiece_tokens(sent_splitter, line, vocab_file):
    """Split ``line`` into lower-cased words and break each one into wordpieces."""
    words = sent_splitter(line).lower().split()
    tokenizer = _get_wordpiece_tokenizer(vocab_file)
    return [piece for word in words for piece in tokenizer.tokenize(word)]


def pre_processing_context(sent_splitter,line,vocab_file):
    """Tokenize ``line`` into space-joined wordpieces without any length limit.

    Args:
        sent_splitter: Callable taking a string and returning a
            whitespace-separated string of words.
        line: Text to tokenize.
        vocab_file: Path of the vocabulary file used to build the
            WordpieceTokenizer (loaded once per path and then reused).

    Returns:
        The wordpieces joined with single spaces.
    """
    return ' '.join(_wordpiece_tokens(sent_splitter, line, vocab_file))


def pre_processing(sent_splitter,line,vocab_file,max_seq_len=512):
    """Tokenize ``line`` into space-joined wordpieces, truncated to fit ``max_seq_len``.

    Args:
        sent_splitter: Callable taking a string and returning a
            whitespace-separated string of words.
        line: Text to tokenize.
        vocab_file: Path of the vocabulary file used to build the
            WordpieceTokenizer (loaded once per path and then reused).
        max_seq_len: Sequence budget; at most ``max_seq_len - 2`` wordpieces
            are kept.

    Returns:
        The kept wordpieces joined with single spaces.
    """
    tokens = _wordpiece_tokens(sent_splitter, line, vocab_file)
    # Slicing is a no-op when the sequence already fits, so no length check is needed.
    return ' '.join(tokens[:max_seq_len-2])

In [None]:
def read_data(ori_dic):
    """Yield every paragraph of the first ``data`` entry in ``ori_dic``, in order."""
    yield from ori_dic['data'][0]['paragraphs']
        
def read_data_test(ori_dic):
    """Yield only the first paragraph (if any) of the first ``data`` entry."""
    first_only = ori_dic['data'][0]['paragraphs'][:1]
    yield from first_only

def tokenize_para(ori_para,vocab_file,sent_splitter):
    """Return a copy of ``ori_para`` whose text fields are wordpiece-tokenized.

    The context goes through ``pre_processing_context``; every question and
    answer text goes through ``pre_processing``. ``id``, ``is_impossible`` and
    ``answer_start`` are copied over unchanged.

    Args:
        ori_para: Dict with ``'context'`` and ``'qas'``; each qa has ``'id'``,
            ``'question'``, ``'is_impossible'`` and ``'answers'``, and each
            answer has ``'text'`` and ``'answer_start'``.
        vocab_file: Vocabulary path forwarded to the pre-processing helpers.
        sent_splitter: Word splitter forwarded to the pre-processing helpers.

    Returns:
        A new dict ``{'context': ..., 'qas': [...]}``; the input is not modified.
    """
    tokenized_context = pre_processing_context(sent_splitter, ori_para['context'], vocab_file)

    tokenized_qas = []
    for qa in ori_para['qas']:
        qa_id = qa['id']
        impossible = qa['is_impossible']
        tokenized_question = pre_processing(sent_splitter, qa['question'], vocab_file)

        tokenized_answers = []
        for ans in qa['answers']:
            start = ans['answer_start']
            tokenized_answers.append({
                'text': pre_processing(sent_splitter, ans['text'], vocab_file),
                'answer_start': start,
            })

        # Key order matters for the serialized JSON: id, question, answers, is_impossible.
        tokenized_qas.append({
            'id': qa_id,
            'question': tokenized_question,
            'answers': tokenized_answers,
            'is_impossible': impossible,
        })

    return {'context': tokenized_context, 'qas': tokenized_qas}

def normalize_para(ori_para):
    """Return a copy of ``ori_para`` with context, questions and answer texts normalized.

    Each text has its spaces, ``…`` and ``‥`` replaced by ``.``, is then
    NFKC-normalized, and finally has any remaining spaces removed.
    ``id``, ``is_impossible`` and ``answer_start`` are copied over unchanged.

    Args:
        ori_para: Dict with ``'context'`` and ``'qas'``; each qa has ``'id'``,
            ``'question'``, ``'is_impossible'`` and ``'answers'``, and each
            answer has ``'text'`` and ``'answer_start'``.

    Returns:
        A new dict ``{'context': ..., 'qas': [...]}``; the input is not modified.
    """
    def _clean(raw):
        # Substitute the dot-like characters first, normalize, then drop the
        # spaces that are still present after NFKC.
        dotted = raw.replace(" ", ".").replace("…",".").replace('‥','.')
        return unicodedata.normalize('NFKC', dotted).replace(' ','')

    cleaned_context = _clean(ori_para['context'])

    cleaned_qas = []
    for qa in ori_para['qas']:
        qa_id = qa['id']
        impossible = qa['is_impossible']
        cleaned_question = _clean(qa['question'])

        cleaned_answers = []
        for ans in qa['answers']:
            # NOTE(review): answer_start is carried over as-is although the
            # normalization above can change the text length - confirm that
            # downstream code does not use it as an offset into the new context.
            start = ans['answer_start']
            cleaned_answers.append({'text': _clean(ans['text']), 'answer_start': start})

        cleaned_qas.append({
            'id': qa_id,
            'question': cleaned_question,
            'answers': cleaned_answers,
            'is_impossible': impossible,
        })

    return {'context': cleaned_context, 'qas': cleaned_qas}
    



In [None]:
vocab_file = os.path.join(root_dir,'model/namco_distilbert/vocab-lower.txt')
sent_splitter = MeCabSentenceSplitter()

# Start from empty skeletons so re-running this cell never appends duplicates.
train_output = ddqa_copy_format(train_ori)
train_norm = ddqa_copy_format(train_ori)

# read_data() is a generator, so tqdm needs the total to show a progress fraction.
for para in tqdm(read_data(train_ori), total=len(train_ori['data'][0]['paragraphs'])):
    norm_para = normalize_para(para)
    train_norm['data'][0]['paragraphs'].append(norm_para)

    # Tokenization runs on the normalized paragraph, not on the raw one.
    output_para = tokenize_para(norm_para,vocab_file,sent_splitter)
    train_output['data'][0]['paragraphs'].append(output_para)

# Write through context managers so both files are flushed and closed.
train_norm_path = os.path.join(ddqa_path,'namco_normalized_DDQA-1.0_RC-QA_train.json')
with open(train_norm_path,'w',encoding='utf8') as writer:
    json.dump(train_norm,writer,ensure_ascii=False)

train_output_path = os.path.join(ddqa_path,'namco_tokenized_DDQA-1.0_RC-QA_train.json')
with open(train_output_path,'w',encoding='utf8') as writer:
    json.dump(train_output,writer,ensure_ascii=False)


In [None]:
vocab_file = os.path.join(root_dir,'model/namco_distilbert/vocab-lower.txt')
sent_splitter = MeCabSentenceSplitter()

# Start from empty skeletons so re-running this cell never appends duplicates.
val_output = ddqa_copy_format(val_ori)
val_norm = ddqa_copy_format(val_ori)

# read_data() is a generator, so tqdm needs the total to show a progress fraction.
for para in tqdm(read_data(val_ori), total=len(val_ori['data'][0]['paragraphs'])):
    norm_para = normalize_para(para)
    val_norm['data'][0]['paragraphs'].append(norm_para)

    # Tokenization runs on the normalized paragraph, not on the raw one.
    output_para = tokenize_para(norm_para,vocab_file,sent_splitter)
    val_output['data'][0]['paragraphs'].append(output_para)

# Write through context managers so both files are flushed and closed.
val_norm_path = os.path.join(ddqa_path,'namco_normalized_DDQA-1.0_RC-QA_dev.json')
with open(val_norm_path,'w',encoding='utf8') as writer:
    json.dump(val_norm,writer,ensure_ascii=False)

val_output_path = os.path.join(ddqa_path,'namco_tokenized_DDQA-1.0_RC-QA_dev.json')
with open(val_output_path,'w',encoding='utf8') as writer:
    json.dump(val_output,writer,ensure_ascii=False)


In [None]:
vocab_file = os.path.join(root_dir,'model/namco_distilbert/vocab-lower.txt')
sent_splitter = MeCabSentenceSplitter()

# Start from empty skeletons so re-running this cell never appends duplicates.
test_output = ddqa_copy_format(test_ori)
test_norm = ddqa_copy_format(test_ori)

# read_data() is a generator, so tqdm needs the total to show a progress fraction.
for para in tqdm(read_data(test_ori), total=len(test_ori['data'][0]['paragraphs'])):
    norm_para = normalize_para(para)
    test_norm['data'][0]['paragraphs'].append(norm_para)

    # Tokenization runs on the normalized paragraph, not on the raw one.
    output_para = tokenize_para(norm_para,vocab_file,sent_splitter)
    test_output['data'][0]['paragraphs'].append(output_para)

# File name follows the train/dev pattern 'namco_normalized_...'; the previous
# literal had a stray space after 'namco_'.
test_norm_path = os.path.join(ddqa_path,'namco_normalized_DDQA-1.0_RC-QA_test.json')
with open(test_norm_path,'w',encoding='utf8') as writer:
    json.dump(test_norm,writer,ensure_ascii=False)

test_output_path = os.path.join(ddqa_path,'namco_tokenized_DDQA-1.0_RC-QA_test.json')
with open(test_output_path,'w',encoding='utf8') as writer:
    json.dump(test_output,writer,ensure_ascii=False)
