### 0. Import Libraries

In [1]:
import pandas as pd, pymysql

pd.set_option('mode.chained_assignment',  None)

from transformers import ElectraTokenizer
from tokenization_kobert import KoBertTokenizer

from sklearn.model_selection import train_test_split

def live_db_conn():
    """Open a pymysql connection to the database and return it.

    The connection is created with autocommit enabled and with
    ``pymysql.cursors.DictCursor`` as its cursor class. The host, user,
    password and db values below look like placeholders -- presumably they
    are replaced with real settings before use (TODO confirm).

    Returns:
        The connection object returned by ``pymysql.connect``.
    """
    connect_kwargs = dict(
        host='host',
        user='user',
        password='password',
        autocommit=True,
        cursorclass=pymysql.cursors.DictCursor,
        db="db",
    )
    return pymysql.connect(**connect_kwargs)

### 1. Load Data

In [2]:
# Directory that contains the input CSV file.
data_path = "data_path"

# Load the full dataset; the first CSV column is used as the row index.
total_data_csv = data_path + '/total_data.csv'
r_food_d = pd.read_csv(total_data_csv, index_col=0)

### 2. Tokenize SENTENCE & Encode SENTENCE

In [4]:
# Tokenizer loaded from the pretrained KoELECTRA v3 discriminator checkpoint.
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
# tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

# Derive two columns from every sentence: its token strings (tokenize) and
# its token ids as produced by encode.
sentences = r_food_d['SENTENCE']
r_food_d['TOKENIZED_SENTENCE'] = sentences.apply(tokenizer.tokenize)
r_food_d['ENCODED_SENTENCE'] = sentences.apply(tokenizer.encode)

### 3. Entity Slot

In [5]:
def _split_raw_entity(raw_entity):
    """Split a raw entity annotation string into (name, label) pairs.

    Two layouts are accepted:
      * several entities separated by ', ', each written as 'name|label';
      * a single '|'-separated string whose fields alternate
        name, label, name, label, ... (an unpaired trailing name is dropped).

    Raises:
        IndexError: if a ', '-separated entry contains no '|'.
    """
    entries = raw_entity.split(', ')

    if len(entries) > 1:
        pairs = []
        for entry in entries:
            fields = entry.split('|')
            pairs.append((fields[0], fields[1]))
        return pairs

    fields = raw_entity.split('|')
    # Even positions hold names, odd positions hold labels.
    return list(zip(fields[0::2], fields[1::2]))


def _tag_entities(token_ids, entities):
    """Return one slot tag per token id, 'O' wherever no entity matches.

    Args:
        token_ids: list of token ids of the sentence.
        entities: list of (entity_token_ids, entity_tags) pairs; both lists
            in a pair have the same, non-zero length.

    Every non-overlapping left-to-right occurrence of an entity's id sequence
    in ``token_ids`` is overwritten with that entity's tags. Entities are
    applied in the given order, so a later entity wins on shared positions.
    """
    tags = ['O'] * len(token_ids)

    for entity_ids, entity_tags in entities:
        span = len(entity_ids)
        pos = 0

        while pos < len(token_ids):
            if token_ids[pos:pos + span] == entity_ids:
                tags[pos:pos + span] = entity_tags
                pos += span
            else:
                pos += 1

    return tags


entity_slot = []
slot_vocab = set()

# The encoded sentence is read by column name rather than as "the last
# column", so re-running this cell after ENTITY_SLOT has been added still
# reads the token ids.
# NOTE(review): the entity annotation column is addressed by position (3);
# its name is not visible in this file.
for raw_entity, encoded in zip(r_food_d.iloc[:, 3], r_food_d['ENCODED_SENTENCE']):

    # A float here is taken to be a missing annotation (presumably NaN from
    # read_csv). NOTE(review): such rows get '' instead of a list of 'O'
    # tags, which later yields an empty line in seq.out -- confirm intended.
    if isinstance(raw_entity, float):
        entity_slot.append('')
        continue

    # Drop the first and last ids added by tokenizer.encode so positions
    # line up with the ids of the tokenized entity names.
    encoded_sentence = encoded[1:-1]

    entities = []

    for entity_name, entity_label in _split_raw_entity(raw_entity):

        begin_tag = 'B-' + entity_label
        inside_tag = 'I-' + entity_label

        slot_vocab.add(begin_tag)
        slot_vocab.add(inside_tag)

        entity_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(entity_name))

        # An entity name that produces no tokens cannot be located in the
        # sentence; skip it instead of failing on an empty id sequence.
        if not entity_ids:
            continue

        entity_tags = [begin_tag] + [inside_tag] * (len(entity_ids) - 1)
        entities.append((entity_ids, entity_tags))

    entity_slot.append(_tag_entities(encoded_sentence, entities))

# Order by label name first, then by prefix, so 'B-X' precedes 'I-X'.
slot_vocab = sorted(slot_vocab, key=lambda slot: (slot[2:], slot[:2]))

r_food_d['ENTITY_SLOT'] = entity_slot

### 4. SLOT LABEL & INTENT

In [6]:
# Root directory for the generated files. The following cells write
# slot_label.txt and intent_label.txt directly into it, and seq.in / label /
# seq.out into its train/, test/ and dev/ subdirectories.
output_path = 'output_path'

In [7]:
# Write the slot label vocabulary, one label per line: the special tokens
# first, then every slot in slot_vocab in its existing order.
file_name = '{}/slot_label.txt'.format(output_path)
additional_tokens = ["PAD", "UNK", "O"]

# The file is only written, so plain 'w' is enough; the encoding is fixed to
# UTF-8 so the output does not depend on the platform's default encoding.
with open(file_name, 'w', encoding='utf-8') as f_w:
    f_w.writelines(label + '\n' for label in [*additional_tokens, *slot_vocab])

# Distinct intent categories in sorted order.
intent_cat_list = sorted(r_food_d['intent_cat'].unique().tolist())

# Write the intent label vocabulary, one label per line: "UNK" first, then
# the sorted intent categories.
file_name = '{}/intent_label.txt'.format(output_path)
additional_tokens = ["UNK"]

# The file is only written, so plain 'w' is enough; the encoding is fixed to
# UTF-8 so non-ASCII intent names do not depend on the platform default.
with open(file_name, 'w', encoding='utf-8') as f_w:
    f_w.writelines(label + '\n' for label in [*additional_tokens, *intent_cat_list])

### 5. Split Dataset (Train/Test/Dev)

In [8]:
# Keep only the columns that are exported further down.
dataset_columns = ['SENTENCE', 'intent_cat', 'TOKENIZED_SENTENCE', 'ENCODED_SENTENCE', 'ENTITY_SLOT']
total_dataset = r_food_d[dataset_columns]

# First split: 80% train+dev, 20% test, stratified by intent.
train_dataset, test_dataset = train_test_split(
    total_dataset,
    train_size=0.8,
    random_state=42,
    stratify=total_dataset['intent_cat'],
)

# Second split: 75% of the remainder for train, 25% for dev (overall
# 60/20/20), again stratified by intent.
train_dataset, dev_dataset = train_test_split(
    train_dataset,
    train_size=0.75,
    random_state=42,
    stratify=train_dataset['intent_cat'],
)

### 6. SEQUENCE IN 

In [9]:
# Raw sentences of each split, in dataset row order.
train_seq_in = list(train_dataset['SENTENCE'])
test_seq_in = list(test_dataset['SENTENCE'])
dev_seq_in = list(dev_dataset['SENTENCE'])

# Write <output_path>/<split>/seq.in with one sentence per line and no
# trailing newline. The split directories must already exist. The encoding is
# fixed to UTF-8 so the text does not depend on the platform default.
for split_name, split_seq_in in (('train', train_seq_in),
                                 ('test', test_seq_in),
                                 ('dev', dev_seq_in)):

    file_name = '{}/{}/seq.in'.format(output_path, split_name)

    with open(file_name, 'w', encoding='utf-8') as file:
        file.write('\n'.join(split_seq_in))

### 7. INTENT_LABEL

In [10]:
# Intent category of every row of each split, in dataset row order.
train_label = list(train_dataset['intent_cat'])
test_label = list(test_dataset['intent_cat'])
dev_label = list(dev_dataset['intent_cat'])

# Write <output_path>/<split>/label with one intent per line and no trailing
# newline. The split directories must already exist. The encoding is fixed to
# UTF-8 so the text does not depend on the platform default.
for split_name, split_label in (('train', train_label),
                                ('test', test_label),
                                ('dev', dev_label)):

    file_name = '{}/{}/label'.format(output_path, split_name)

    with open(file_name, 'w', encoding='utf-8') as file:
        file.write('\n'.join(split_label))

### 8. SEQUENCE OUT

In [11]:
# Slot tags of every row of each split, joined with single spaces. A row
# whose ENTITY_SLOT is the empty string produces an empty line.
train_seq_out = [' '.join(slots) for slots in train_dataset['ENTITY_SLOT']]
test_seq_out = [' '.join(slots) for slots in test_dataset['ENTITY_SLOT']]
dev_seq_out = [' '.join(slots) for slots in dev_dataset['ENTITY_SLOT']]

# Write <output_path>/<split>/seq.out with one tag sequence per line and no
# trailing newline. The split directories must already exist. The encoding is
# fixed to UTF-8 so the text does not depend on the platform default.
for split_name, split_seq_out in (('train', train_seq_out),
                                  ('test', test_seq_out),
                                  ('dev', dev_seq_out)):

    file_name = '{}/{}/seq.out'.format(output_path, split_name)

    with open(file_name, 'w', encoding='utf-8') as file:
        file.write('\n'.join(split_seq_out))