In [None]:
import MeCab
import csv
import sentencepiece as sp
import os
import collections
import unicodedata
from transformers import WordpieceTokenizer
from tqdm.notebook import tqdm

In [None]:
# Project root is taken to be the directory two levels above the current
# working directory at the time this cell runs.
_two_levels_above_cwd = os.path.join(os.getcwd(), os.pardir, os.pardir)
root_dir = os.path.abspath(_two_levels_above_cwd)


In [None]:
class MeCabSentenceSplitter(object):
    """Callable that runs text through a MeCab tagger created with ``-O wakati``.

    A dictionary directory is passed to MeCab through ``-d`` unless
    ``mecab_dict_path`` is ``None``.
    """

    def __init__(self, mecab_dict_path='/usr/local/lib/mecab/dic/mecab-ipadic-neologd'):
        """Create the underlying ``MeCab.Tagger``.

        Args:
            mecab_dict_path: Directory given to MeCab's ``-d`` option. Pass
                ``None`` to omit ``-d`` entirely (presumably MeCab then falls
                back to its configured default dictionary -- confirm against
                the local MeCab installation).
        """
        if mecab_dict_path is None:
            tagger_options = '-O wakati'
        else:
            tagger_options = '-d {} -O wakati'.format(mecab_dict_path)
        self.mecab = MeCab.Tagger(tagger_options)

    def __call__(self, text):
        """Return the tagger's output for ``text`` with surrounding whitespace removed."""
        parsed = self.mecab.parse(text)
        return parsed.strip()

In [None]:
def read_livedoor(path):
    """Read a livedoor TSV split into parallel lists of texts and label ids.

    The first row is treated as a header and skipped. For each remaining
    row, column 0 is taken as the text and column 1 as the category name,
    which is converted to its position in ``all_labels``.

    Args:
        path: Path to a UTF-8 encoded, tab-separated file.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists: the text strings
        and their integer label ids.

    Raises:
        ValueError: If a row's category name is not in ``all_labels``.
        IndexError: If a non-blank row has fewer than two columns.
    """
    all_labels = ['dokujo-tsushin', 'it-life-hack', 'kaden-channel', 'livedoor-homme', 'movie-enter', 'peachy', 'smax', 'sports-watch', 'topic-news']
    texts = []
    labels = []
    # The context manager closes the file deterministically; newline='' is
    # what the csv module asks for so it can handle line endings itself.
    with open(path, encoding='utf8', newline='') as tsv_file:
        # quotechar=None turns quote processing off, so a '"' inside a field
        # is kept as an ordinary character.
        reader = csv.reader(tsv_file, delimiter='\t', quotechar=None)
        next(reader, None)  # skip the header row; an empty file yields no rows
        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; there is nothing to index.
                continue
            texts.append(row[0])
            labels.append(all_labels.index(row[1]))
    return texts, labels


In [None]:
# Directory holding the livedoor splits, located under the project root.
livedoor_path = os.path.join(root_dir,'data/livedoor')
train_path, val_path, test_path = [
    os.path.join(livedoor_path, split_file)
    for split_file in ('train.tsv', 'dev.tsv', 'test.tsv')
]

# Each split is loaded as a pair of parallel lists: texts and integer label ids.
train_texts, train_labels = read_livedoor(train_path)
val_texts, val_labels = read_livedoor(val_path)
test_texts, test_labels = read_livedoor(test_path)

In [None]:
def load_vocab(vocab_file):
    """Build an ordered token -> line-index mapping from a vocabulary file.

    Each line of the UTF-8 file contributes one entry: the line with
    surrounding whitespace stripped is the key and its zero-based line
    number is the value. If a token occurs more than once, the last line
    number wins while the key keeps its first position in the ordering.

    Args:
        vocab_file: Path to the vocabulary file, one token per line.

    Returns:
        A ``collections.OrderedDict`` mapping token strings to integer ids.
    """
    token_to_id = collections.OrderedDict()
    with open(vocab_file, encoding='utf8') as reader:
        for line_number, raw_line in enumerate(reader):
            token_to_id[raw_line.strip()] = line_number
    return token_to_id

# WordpieceTokenizer instances already built, keyed by vocabulary path, so the
# vocabulary file is read once per path rather than once per document.
_WORDPIECE_TOKENIZERS = {}


def _wordpiece_tokenizer_for(vocab_file):
    """Return the WordpieceTokenizer for ``vocab_file``, building it on first use."""
    tokenizer = _WORDPIECE_TOKENIZERS.get(vocab_file)
    if tokenizer is None:
        tokenizer = WordpieceTokenizer(vocab=load_vocab(vocab_file), unk_token='[UNK]')
        _WORDPIECE_TOKENIZERS[vocab_file] = tokenizer
    return tokenizer


def pre_processing(sent_splitter,line,vocab_file,max_seq_len=512):
    """Normalize, word-split and WordPiece-tokenize one line of text.

    Steps: NFKC-normalize ``line`` and delete ASCII spaces, pass the result
    to ``sent_splitter``, lower-case and whitespace-split its output, split
    every word into sub-tokens with a ``WordpieceTokenizer`` built from
    ``vocab_file``, then truncate and join the sub-tokens with single spaces.

    Args:
        sent_splitter: Callable taking a string and returning a string of
            whitespace-separated words.
        line: Raw input text.
        vocab_file: Path to the vocabulary file. The tokenizer built from it
            is cached per path, so later changes to the file on disk are not
            picked up until this definition is re-executed.
        max_seq_len: Sequence budget; at most ``max_seq_len - 2`` sub-tokens
            are kept (presumably leaving room for two special tokens such as
            [CLS]/[SEP] added later -- confirm against the consumer).

    Returns:
        The kept sub-tokens joined by single spaces; ``''`` when none remain.
    """
    line = unicodedata.normalize('NFKC', line).replace(' ','')

    words = sent_splitter(line).lower().split()

    tokenizer = _wordpiece_tokenizer_for(vocab_file)
    tokens = [sub_token for word in words for sub_token in tokenizer.tokenize(word)]

    # Clamp at zero: a negative bound would slice from the end and silently
    # drop trailing tokens when max_seq_len < 2.
    limit = max(max_seq_len - 2, 0)
    return ' '.join(tokens[:limit])

In [None]:
# Tokenize the training texts and write one "<tokens>\t<label id>" line per example.
vocab_file = os.path.join(root_dir,'model/namco_distilbert/vocab-lower.txt')
sent_splitter = MeCabSentenceSplitter()

tokenized_train_texts = [
    pre_processing(sent_splitter, raw_text, vocab_file)
    for raw_text in tqdm(train_texts)
]
assert len(tokenized_train_texts)==len(train_texts)

train_output_path = os.path.join(livedoor_path,'namco_train_tokenized.tsv')
with open(train_output_path,'w',encoding='utf8') as train_output:
    for tokenized_text, label_id in zip(tokenized_train_texts, train_labels):
        train_output.write(f'{tokenized_text}\t{label_id}\n')

In [None]:
# Tokenize the validation texts and write one "<tokens>\t<label id>" line per example.
vocab_file = os.path.join(root_dir,'model/namco_distilbert/vocab-lower.txt')
sent_splitter = MeCabSentenceSplitter()

tokenized_val_texts = [
    pre_processing(sent_splitter, raw_text, vocab_file)
    for raw_text in tqdm(val_texts)
]
assert len(tokenized_val_texts)==len(val_texts)

val_output_path = os.path.join(livedoor_path,'namco_dev_tokenized.tsv')
with open(val_output_path,'w',encoding='utf8') as val_output:
    for tokenized_text, label_id in zip(tokenized_val_texts, val_labels):
        val_output.write(f'{tokenized_text}\t{label_id}\n')

In [None]:
# Tokenize the test texts and write one "<tokens>\t<label id>" line per example.
vocab_file = os.path.join(root_dir,'model/namco_distilbert/vocab-lower.txt')
sent_splitter = MeCabSentenceSplitter()

tokenized_test_texts = [
    pre_processing(sent_splitter, raw_text, vocab_file)
    for raw_text in tqdm(test_texts)
]
assert len(tokenized_test_texts)==len(test_texts)

test_output_path = os.path.join(livedoor_path,'namco_test_tokenized.tsv')
with open(test_output_path,'w',encoding='utf8') as test_output:
    for tokenized_text, label_id in zip(tokenized_test_texts, test_labels):
        test_output.write(f'{tokenized_text}\t{label_id}\n')