In [16]:
import os

import pandas as pd

from sklearn.model_selection import train_test_split

# Turn off pandas' chained-assignment check for the whole session.
pd.options.mode.chained_assignment = None

In [17]:
# Load the dialogue data; the first CSV column becomes the index.
source_csv = 'output/JP_Final_Aug_df.csv'
r_food_d = pd.read_csv(source_csv, index_col=0)
r_food_d.head()

Unnamed: 0,SPEAKER,SENTENCE,개체명,지식베이스,intent_cat
0,고객,ソンベクトゥェジクッパプですよね？ 十人様程度のご予約を予定しております。事前にいつ連絡すれ...,"SongbaekdwaejiGukbap, 열 명, 예약, 언제, 전화","ソンベクトゥエジクッパプ|상호, 十人|인원",예약_문의
0,고객,ニワベーカリーですよね？ 3月25日頃、ミーティングがあるので、中で予約したいんですが、いつ...,"NiwaBakery, 삼월 이십오일, 모임, 안쪽, 예약, 언제","ニワベーカリー|상호, 3月25日|예약일, 中|위치",예약_문의
0,고객,ﾎﾟﾛﾄｺｯｶﾙですよね？ 五人様の予約は必要ですか?,"CheonhaDonkkaseu, 다섯 명, 예약","ﾎﾟﾛﾄｺｯｶﾙ|상호, 五人|인원",예약_문의
0,고객,ウンジヤクグですよね？およそ十人の会社の夕食を食べる予定ですが、事前に連絡する必要があります...,"Eunjiyakguk, 열 명, 회식, 미리, 연락","ウンジヤクグ|상호, 会社の夕食|행사, 十人|인원",예약_문의
0,고객,ハノイマエクジュバムゲオリグウォルジョムですよね？ 同じ日にできますか?,"Hanoimaekjubamgeoriguwoljeom, 당일 낮","ハノイマエクジュバムゲオリグウォルジョム|상호, 同じ日|시간",예약_문의


In [19]:
from transformers import AutoTokenizer

In [20]:
# Load the tokenizer published under the checkpoint name below.
pretrained_name = "cl-tohoku/bert-base-japanese"
tokenizer = AutoTokenizer.from_pretrained(pretrained_name)

In [21]:
# For every sentence keep both the token strings and the ids returned by
# `tokenizer.encode`.
sentences = r_food_d['SENTENCE']
r_food_d['TOKENIZED_SENTENCE'] = sentences.map(tokenizer.tokenize)
r_food_d['ENCODED_SENTENCE'] = sentences.map(tokenizer.encode)
r_food_d.head()

Unnamed: 0,SPEAKER,SENTENCE,개체명,지식베이스,intent_cat,TOKENIZED_SENTENCE,ENCODED_SENTENCE
0,고객,ソンベクトゥェジクッパプですよね？ 十人様程度のご予約を予定しております。事前にいつ連絡すれ...,"SongbaekdwaejiGukbap, 열 명, 예약, 언제, 전화","ソンベクトゥエジクッパプ|상호, 十人|인원",예약_문의,"[ソン, ##ベク, ##トゥ, ##ェ, ##ジ, ##ク, ##ッパ, ##プ, です,...","[2, 20317, 24734, 2783, 28592, 28510, 28488, 1..."
0,고객,ニワベーカリーですよね？ 3月25日頃、ミーティングがあるので、中で予約したいんですが、いつ...,"NiwaBakery, 삼월 이십오일, 모임, 안쪽, 예약, 언제","ニワベーカリー|상호, 3月25日|예약일, 中|위치",예약_문의,"[ニ, ##ワ, ##ベー, ##カリ, ##ー, です, よ, ね, ?, 3, 月, 2...","[2, 353, 28712, 15078, 7856, 28451, 2992, 54, ..."
0,고객,ﾎﾟﾛﾄｺｯｶﾙですよね？ 五人様の予約は必要ですか?,"CheonhaDonkkaseu, 다섯 명, 예약","ﾎﾟﾛﾄｺｯｶﾙ|상호, 五人|인원",예약_문의,"[ポロ, ##ト, ##コ, ##ッカ, ##ル, です, よ, ね, ?, 五, 人, 様...","[2, 18702, 28476, 28539, 3259, 28467, 2992, 54..."
0,고객,ウンジヤクグですよね？およそ十人の会社の夕食を食べる予定ですが、事前に連絡する必要があります...,"Eunjiyakguk, 열 명, 회식, 미리, 연락","ウンジヤクグ|상호, 会社の夕食|행사, 十人|인원",예약_문의,"[ウン, ##ジ, ##ヤ, ##ク, ##グ, です, よ, ね, ?, およそ, 十, ...","[2, 15085, 28510, 28842, 28488, 28530, 2992, 5..."
0,고객,ハノイマエクジュバムゲオリグウォルジョムですよね？ 同じ日にできますか?,"Hanoimaekjubamgeoriguwoljeom, 당일 낮","ハノイマエクジュバムゲオリグウォルジョム|상호, 同じ日|시간",예약_문의,"[ハノイ, ##マ, ##エク, ##ジュ, ##バム, ##ゲ, ##オリ, ##グ, #...","[2, 27619, 28523, 19783, 2126, 6738, 28788, 10..."


In [22]:
def _parse_entities(raw_entity):
    """Split a knowledge-base cell into ``(name, label)`` pairs.

    The cell looks like ``'name|label, name|label'``: items are separated by
    ``', '`` and, inside an item, the pieces around ``'|'`` alternate between
    entity name and entity label. A name that has no label after it is
    dropped instead of raising ``IndexError``.
    """
    pairs = []
    for item in raw_entity.split(', '):
        pieces = item.split('|')
        # zip() silently drops a trailing name that has no label.
        pairs.extend(zip(pieces[0::2], pieces[1::2]))
    return pairs


def _tag_entities(token_ids, entities):
    """Return one BIO tag per id in ``token_ids``.

    ``entities`` is a list of ``(entity_token_ids, label)``. Every
    non-overlapping occurrence of ``entity_token_ids`` inside ``token_ids``
    is tagged ``B-label`` followed by ``I-label``; everything else stays
    ``'O'``. Entities are applied in order, so a later entity overwrites the
    tags written by an earlier one where their matches overlap.
    """
    tags = ['O'] * len(token_ids)

    for ent_ids, label in entities:
        span = len(ent_ids)
        if span == 0:
            # Nothing to search for (and a zero-length match would never advance).
            continue

        bio = ['B-' + label] + ['I-' + label] * (span - 1)

        pos = 0
        while pos < len(token_ids):
            if token_ids[pos:pos + span] == ent_ids:
                tags[pos:pos + span] = bio
                pos += span  # continue after the match
            else:
                pos += 1

    return tags


entity_slot = []
slot_vocab = set()

# Columns are read by name rather than by position so that the cell still
# works when it is re-run after later cells have appended more columns.
for raw_entity, encoded in zip(r_food_d['지식베이스'], r_food_d['ENCODED_SENTENCE']):

    # Drop the first and last id so the tags line up with TOKENIZED_SENTENCE.
    encoded_sentence = list(encoded[1:-1])

    if not isinstance(raw_entity, str):  # no entity annotation (e.g. NaN)
        entity_slot.append(['O'] * len(encoded_sentence))
        continue

    entities = []
    for entity_name, entity_label in _parse_entities(raw_entity):
        entity_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(entity_name))
        if not entity_ids:
            # Entity text produced no tokens; it cannot be located.
            continue

        # Labels enter the vocabulary even when the entity is not found in
        # the sentence.
        slot_vocab.add('B-' + entity_label)
        slot_vocab.add('I-' + entity_label)
        entities.append((entity_ids, entity_label))

    entity_slot.append(_tag_entities(encoded_sentence, entities))

# Sort by label name first, then by prefix, so 'B-x' is directly followed by 'I-x'.
slot_vocab = sorted(slot_vocab, key=lambda x: (x[2:], x[:2]))

r_food_d['ENTITY_SLOT'] = entity_slot

In [23]:
# Token count per sentence; the cell displays the largest one.
token_counts = r_food_d['TOKENIZED_SENTENCE'].map(len)
r_food_d['length'] = token_counts
r_food_d['length'].max()

100

In [24]:
# Number of slot tags per sentence.
r_food_d['ES_LEN'] = r_food_d['ENTITY_SLOT'].map(len)

In [25]:
def _drop_newlines(text):
    """Return `text` with every newline character removed."""
    return text.replace('\n', '')


# The export cell writes one sentence per line, so sentences must not
# contain line breaks themselves.
r_food_d['SENTENCE'] = r_food_d['SENTENCE'].map(_drop_newlines)

In [26]:
# Preview the frame, now including ENTITY_SLOT and the two length columns
# (`length` from the tokens, `ES_LEN` from the slot tags).
r_food_d.head()

Unnamed: 0,SPEAKER,SENTENCE,개체명,지식베이스,intent_cat,TOKENIZED_SENTENCE,ENCODED_SENTENCE,ENTITY_SLOT,length,ES_LEN
0,고객,ソンベクトゥェジクッパプですよね？ 十人様程度のご予約を予定しております。事前にいつ連絡すれ...,"SongbaekdwaejiGukbap, 열 명, 예약, 언제, 전화","ソンベクトゥエジクッパプ|상호, 十人|인원",예약_문의,"[ソン, ##ベク, ##トゥ, ##ェ, ##ジ, ##ク, ##ッパ, ##プ, です,...","[2, 20317, 24734, 2783, 28592, 28510, 28488, 1...","[O, O, O, O, O, O, O, O, O, O, O, O, B-인원, I-인...",37,37
0,고객,ニワベーカリーですよね？ 3月25日頃、ミーティングがあるので、中で予約したいんですが、いつ...,"NiwaBakery, 삼월 이십오일, 모임, 안쪽, 예약, 언제","ニワベーカリー|상호, 3月25日|예약일, 中|위치",예약_문의,"[ニ, ##ワ, ##ベー, ##カリ, ##ー, です, よ, ね, ?, 3, 月, 2...","[2, 353, 28712, 15078, 7856, 28451, 2992, 54, ...","[B-상호, I-상호, I-상호, I-상호, I-상호, O, O, O, O, B-예...",36,36
0,고객,ﾎﾟﾛﾄｺｯｶﾙですよね？ 五人様の予約は必要ですか?,"CheonhaDonkkaseu, 다섯 명, 예약","ﾎﾟﾛﾄｺｯｶﾙ|상호, 五人|인원",예약_문의,"[ポロ, ##ト, ##コ, ##ッカ, ##ル, です, よ, ね, ?, 五, 人, 様...","[2, 18702, 28476, 28539, 3259, 28467, 2992, 54...","[B-상호, I-상호, I-상호, I-상호, I-상호, O, O, O, O, B-인...",19,19
0,고객,ウンジヤクグですよね？およそ十人の会社の夕食を食べる予定ですが、事前に連絡する必要があります...,"Eunjiyakguk, 열 명, 회식, 미리, 연락","ウンジヤクグ|상호, 会社の夕食|행사, 十人|인원",예약_문의,"[ウン, ##ジ, ##ヤ, ##ク, ##グ, です, よ, ね, ?, およそ, 十, ...","[2, 15085, 28510, 28842, 28488, 28530, 2992, 5...","[B-상호, I-상호, I-상호, I-상호, I-상호, O, O, O, O, O, ...",34,34
0,고객,ハノイマエクジュバムゲオリグウォルジョムですよね？ 同じ日にできますか?,"Hanoimaekjubamgeoriguwoljeom, 당일 낮","ハノイマエクジュバムゲオリグウォルジョム|상호, 同じ日|시간",예약_문의,"[ハノイ, ##マ, ##エク, ##ジュ, ##バム, ##ゲ, ##オリ, ##グ, #...","[2, 27619, 28523, 19783, 2126, 6738, 28788, 10...","[B-상호, I-상호, I-상호, I-상호, I-상호, I-상호, I-상호, I-상...",23,23


In [27]:
# Name of the output directory: the export cells write to './{date}/...'.
# Previously used value, kept for reference:
# date = '0502_BERT' # 66
date = 'TOHOKU' # 66  NOTE(review): the meaning of "66" is not evident from this notebook

In [28]:
def _write_label_file(path, labels):
    """Write `labels` to `path`, one label per line.

    The file is written as UTF-8 regardless of the platform's default
    encoding (the labels contain non-ASCII text), and the parent directory
    is created when it does not exist yet.
    """
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f_w:
        for label in labels:
            f_w.write(label + '\n')


# Slot labels: the reserved entries first, then the sorted B-/I- labels.
file_name = './{}/slot_label.txt'.format(date)
additional_tokens = ["PAD", "UNK", "O"]
_write_label_file(file_name, additional_tokens + list(slot_vocab))

# Intent labels: "UNK" first, then every distinct intent in sorted order.
intent_cat_list = sorted(r_food_d['intent_cat'].unique().tolist())

file_name = './{}/intent_label.txt'.format(date)
additional_tokens = ["UNK"]
_write_label_file(file_name, additional_tokens + intent_cat_list)

In [29]:
# Columns kept for the split datasets.
export_columns = ['SENTENCE', 'intent_cat', 'TOKENIZED_SENTENCE', 'ENCODED_SENTENCE', 'ENTITY_SLOT']
total_dataset = r_food_d[export_columns]

# First cut: 80% train+dev vs. 20% test, stratified by intent.
train_dev_dataset, test_dataset = train_test_split(
    total_dataset,
    train_size=0.8,
    random_state=42,
    stratify=total_dataset['intent_cat'],
)

# Second cut: 75% train vs. 25% dev of the remaining rows, again stratified.
train_dataset, dev_dataset = train_test_split(
    train_dev_dataset,
    train_size=0.75,
    random_state=42,
    stratify=train_dev_dataset['intent_cat'],
)

In [30]:
def _write_lines(path, lines):
    """Write `lines` to `path`, newline-separated with no trailing newline.

    The file is written as UTF-8 regardless of the platform's default
    encoding (sentences and labels are non-ASCII), and the parent directory
    is created when it does not exist yet.
    """
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w', encoding='utf-8') as file:
        file.write('\n'.join(lines))


def _export_split(split_name, dataset):
    """Write the three files of one split under './{date}/{split_name}/'.

    Files written:
        seq.in  -- one sentence per line (column 'SENTENCE')
        label   -- one intent per line (column 'intent_cat')
        seq.out -- the slot tags of one sentence per line, space-separated
                   (column 'ENTITY_SLOT')

    Returns:
        (seq_in, label, seq_out): the three lists of lines that were written.
    """
    seq_in = dataset['SENTENCE'].tolist()
    label = dataset['intent_cat'].tolist()
    seq_out = [' '.join(tags) for tags in dataset['ENTITY_SLOT']]

    split_dir = './{}/{}'.format(date, split_name)
    _write_lines('{}/seq.in'.format(split_dir), seq_in)
    _write_lines('{}/label'.format(split_dir), label)
    _write_lines('{}/seq.out'.format(split_dir), seq_out)

    return seq_in, label, seq_out


# The per-split lists stay available under their original names.
train_seq_in, train_label, train_seq_out = _export_split('train', train_dataset)
test_seq_in, test_label, test_seq_out = _export_split('test', test_dataset)
dev_seq_in, dev_label, dev_seq_out = _export_split('dev', dev_dataset)