In [None]:
import sentencepiece as sp
import os
import csv

In [None]:
# Project root, resolved as two directory levels above the current working
# directory. This depends on where the kernel was started from.
root_dir = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir)
)


In [None]:
# Location of the SentencePiece model file, relative to the project root.
model_file = os.path.join(
    root_dir,
    'model/laboro_distilbert/tokenizer/ccc_13g_unigram.model',
)
# Load the tokenizer once; later cells reuse this instance for every split.
tokenizer_sp = sp.SentencePieceProcessor(model_file=model_file)


In [None]:
def read_livedoor(path):
    """Load one livedoor TSV split into parallel text and label-index lists.

    The first row of the file is treated as a header and skipped. For each
    remaining non-empty row, column 0 is taken as the text and column 1 as the
    category name, which is converted to its position in ``all_labels``.

    Args:
        path: Path to a tab-separated, UTF-8 encoded file.

    Returns:
        A ``(texts, labels)`` tuple where ``texts`` is a list of strings and
        ``labels`` is a list of ints of the same length.

    Raises:
        ValueError: If a row's category is not one of the known labels.
        IndexError: If a non-empty row has fewer than two columns.
    """
    all_labels = ['dokujo-tsushin', 'it-life-hack', 'kaden-channel', 'livedoor-homme', 'movie-enter', 'peachy', 'smax', 'sports-watch', 'topic-news']
    texts = []
    labels = []
    # The context manager guarantees the handle is closed even if parsing
    # fails; newline='' is the mode the csv module documents for its readers.
    with open(path, encoding='utf8', newline='') as tsv_file:
        # quotechar=None turns quote handling off, so literal double quotes
        # inside the text column are kept as-is.
        reader = csv.reader(tsv_file, delimiter='\t', quotechar=None)
        next(reader, None)  # drop the header row (no-op on an empty file)
        for row in reader:
            if not row:
                # Blank lines (e.g. a trailing newline) carry no example.
                continue
            texts.append(row[0])
            labels.append(all_labels.index(row[1]))
    return texts, labels

In [None]:
# Directory holding the livedoor splits, plus the path of each split file.
livedoor_path = os.path.join(root_dir, 'data/livedoor')
train_path, val_path, test_path = (
    os.path.join(livedoor_path, split_file)
    for split_file in ('train.tsv', 'dev.tsv', 'test.tsv')
)

# Each split is loaded as a pair of parallel lists: raw texts and label indices.
train_texts, train_labels = read_livedoor(train_path)
val_texts, val_labels = read_livedoor(val_path)
test_texts, test_labels = read_livedoor(test_path)

In [None]:
def pre_processing(tokenizer_sp, line, max_seq_len=512):
    """Tokenize ``line`` and return its pieces as one space-separated string.

    Args:
        tokenizer_sp: Tokenizer exposing ``encode(text, out_type=int)`` and
            ``id_to_piece(ids)`` (a ``SentencePieceProcessor`` in this
            notebook).
        line: The raw text to tokenize.
        max_seq_len: Sequence-length limit. At most ``max_seq_len - 2`` pieces
            are kept; the two reserved slots are presumably for special tokens
            added downstream (e.g. [CLS]/[SEP]) -- TODO confirm against the
            consumer of these files.

    Returns:
        The kept pieces joined by single spaces; an empty string when no
        pieces are kept.
    """
    # Clamp at zero: a negative slice bound would trim pieces from the END of
    # the sequence instead of capping its length.
    piece_budget = max(max_seq_len - 2, 0)
    # Slicing alone is enough; a sequence shorter than the budget is unchanged.
    ids = tokenizer_sp.encode(line, out_type=int)[:piece_budget]
    tokens = tokenizer_sp.id_to_piece(ids)
    return ' '.join(tokens)

In [None]:
# Tokenize every training text; results stay aligned with train_labels by position.
tokenized_train_texts = [pre_processing(tokenizer_sp, text) for text in train_texts]
assert len(tokenized_train_texts) == len(train_texts)

# Write one example per line: space-joined pieces, a tab, then the label index.
train_output_path = os.path.join(livedoor_path, 'train_tokenized.txt')
with open(train_output_path, 'w', encoding='utf8') as train_output:
    train_output.writelines(
        f'{pieces}\t{train_labels[row]}\n'
        for row, pieces in enumerate(tokenized_train_texts)
    )

In [None]:
# Tokenize every validation text; results stay aligned with val_labels by position.
tokenized_val_texts = [pre_processing(tokenizer_sp, text) for text in val_texts]
assert len(tokenized_val_texts) == len(val_texts)

# Write one example per line: space-joined pieces, a tab, then the label index.
val_output_path = os.path.join(livedoor_path, 'dev_tokenized.txt')
with open(val_output_path, 'w', encoding='utf8') as val_output:
    val_output.writelines(
        f'{pieces}\t{val_labels[row]}\n'
        for row, pieces in enumerate(tokenized_val_texts)
    )

In [None]:
# Tokenize every test text; results stay aligned with test_labels by position.
tokenized_test_texts = [pre_processing(tokenizer_sp, text) for text in test_texts]
assert len(tokenized_test_texts) == len(test_texts)

# Write one example per line: space-joined pieces, a tab, then the label index.
test_output_path = os.path.join(livedoor_path, 'test_tokenized.txt')
with open(test_output_path, 'w', encoding='utf8') as test_output:
    test_output.writelines(
        f'{pieces}\t{test_labels[row]}\n'
        for row, pieces in enumerate(tokenized_test_texts)
    )