In [1]:
import torch
import torch.nn as nn
from transformers import ElectraModel, ElectraTokenizer
from torchcrf import CRF
import copy, json, os, logging
import pandas as pd
import numpy as np

from torch.utils.data import TensorDataset

In [2]:
class IntentClassifier(nn.Module):
    """Classification head: dropout followed by one linear projection.

    Args:
        input_dim: size of the last dimension of the input tensor.
        num_intent_labels: number of output logits.
        dropout_rate: dropout probability applied before the projection.
    """

    def __init__(self, input_dim, num_intent_labels, dropout_rate=0.):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout_rate)
        self.linear = nn.Linear(in_features=input_dim, out_features=num_intent_labels)

    def forward(self, x):
        """Return the logits for ``x`` (dropout is applied first)."""
        return self.linear(self.dropout(x))


class SlotClassifier(nn.Module):
    """Slot-tagging head: dropout followed by one linear projection.

    Args:
        input_dim: size of the last dimension of the input tensor.
        num_slot_labels: number of output logits.
        dropout_rate: dropout probability applied before the projection.
    """

    def __init__(self, input_dim, num_slot_labels, dropout_rate=0.):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout_rate)
        self.linear = nn.Linear(in_features=input_dim, out_features=num_slot_labels)

    def forward(self, x):
        """Return the logits for ``x`` (dropout is applied first)."""
        return self.linear(self.dropout(x))

In [3]:
class InputExample(object):
    """
    One raw example (training or test) for joint intent / slot prediction.

    Args:
        guid: Unique id for the example.
        words: list. The words of the sequence.
        intent_label: (Optional) string. The intent label of the example.
        slot_labels: (Optional) list. The slot labels of the example.
    """

    def __init__(self, guid, words, intent_label=None, slot_labels=None):
        self.guid, self.words = guid, words
        self.intent_label, self.slot_labels = intent_label, slot_labels

    def __repr__(self):
        # The JSON dump doubles as the printable representation.
        return str(self.to_json_string())

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dictionary."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON ending in a newline."""
        payload = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return payload + "\n"

In [4]:
class InputFeatures(object):
    """Model-ready features for one example (ids, masks and label ids)."""

    def __init__(self, input_ids, attention_mask, token_type_ids, intent_label_id, slot_labels_ids):
        self.input_ids, self.attention_mask = input_ids, attention_mask
        self.token_type_ids = token_type_ids
        self.intent_label_id, self.slot_labels_ids = intent_label_id, slot_labels_ids

    def __repr__(self):
        # The JSON dump doubles as the printable representation.
        return str(self.to_json_string())

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dictionary."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON ending in a newline."""
        payload = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return payload + "\n"

In [5]:
# Name of the ELECTRA checkpoint; the same string is reused later when the
# model config is loaded. The tokenizer is built from it here so the data
# preparation cells can tokenize / encode sentences.
electra_path = "monologg/koelectra-base-v3-discriminator"
tokenizer = ElectraTokenizer.from_pretrained(electra_path)

In [6]:
def _tag_entity_spans(sentence_ids, entities):
    """Return one BIO tag per id in ``sentence_ids``.

    Args:
        sentence_ids: list of token ids of the sentence (no special tokens).
        entities: list of ``(entity_token_ids, label)`` pairs.

    Returns:
        A list with exactly ``len(sentence_ids)`` tags: ``'B-<label>'`` on the
        first token of every matched entity, ``'I-<label>'`` on the following
        tokens of that entity and ``'O'`` everywhere else.

    Matching is done on whole token ids. (Matching on a comma-joined string of
    ids lets e.g. id ``12`` match inside ``312`` and can yield a tag list whose
    length differs from the token count.) Entities are applied in the given
    order; tokens already claimed by an earlier entity are not re-tagged.
    """
    tags = ['O'] * len(sentence_ids)
    for entity_ids, label in entities:
        width = len(entity_ids)
        if width == 0:
            # An entity that tokenizes to nothing cannot be located.
            continue
        start = 0
        while start + width <= len(sentence_ids):
            window = slice(start, start + width)
            is_free = all(tag == 'O' for tag in tags[window])
            if is_free and sentence_ids[window] == entity_ids:
                tags[start] = 'B-' + label
                for offset in range(1, width):
                    tags[start + offset] = 'I-' + label
                start += width
            else:
                start += 1
    return tags


# Load the sheet and keep only the customer ('고객') utterances. The .copy()
# makes the filtered frame independent so the column assignments below do not
# write into a slice of the original frame (pandas SettingWithCopyWarning).
r_food_d = pd.read_excel('data/FOOD_DATA.xlsx')
r_food_d = r_food_d[r_food_d['SPEAKER']=='고객'].copy()
r_food_d['TOKENIZED_SENTENCE'] = r_food_d['SENTENCE'].apply(lambda x: tokenizer.tokenize(x))
r_food_d['ENCODED_SENTENCE'] = r_food_d['SENTENCE'].apply(lambda x: tokenizer.encode(x))

entity_slot = []

# The raw entity annotation is read positionally (5th column from the end,
# counted after the two columns added above).
# NOTE(review): presumably this is the '지식베이스' column -- confirm against the sheet.
for raw_entity, encoded in zip(r_food_d.iloc[:, -5], r_food_d['ENCODED_SENTENCE']):

    if not isinstance(raw_entity, str):  # no annotation for this row (NaN)
        entity_slot.append('')
        continue

    # Drop the first and last ids that tokenizer.encode() adds around the sentence.
    encoded_sentence = list(encoded[1:-1])

    # Annotations look like "name/label" or "name/label, name/label, ...".
    # Every item is read as alternating name/label fields; a dangling name
    # without a label is ignored.
    entities = []
    for item in raw_entity.split(', '):
        fields = item.split('/')
        for entity_name, entity_label in zip(fields[0::2], fields[1::2]):
            entity_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(entity_name))
            entities.append((entity_ids, entity_label))

    entity_slot.append(_tag_entity_spans(encoded_sentence, entities))

r_food_d['ENTITY_SLOT'] = entity_slot

In [7]:
# Slot label vocabulary: every tag that occurs in ENTITY_SLOT, followed by the
# two special labels. Rows without annotation hold '' and contribute nothing.
# The tags are sorted because iterating a set of strings gives a different
# order on every interpreter run, which would silently change the label ids
# between runs (and invalidate previously saved checkpoints).
slot_label_list = sorted({tag for tags in r_food_d['ENTITY_SLOT'] for tag in tags})

append_list = ['PAD','UNK']
slot_label_list.extend(append_list)

# The label files are written under ./data/ because that is where the cells
# that read them look ('./data/rt_slot_label.txt', './data/rt_intent_label.txt').
# They are written as UTF-8 to match the encoding used when reading.
os.makedirs('./data', exist_ok=True)

file_name = './data/rt_slot_label.txt'

with open(file_name, 'w', encoding='utf-8') as file:
    file.write('\n'.join(slot_label_list))  # one label per line

# Intent label vocabulary: intents in order of first appearance, plus 'UNK'.
intent_cat_list = r_food_d['intent_cat'].unique().tolist()
intent_cat_list.append('UNK')

file_name = './data/rt_intent_label.txt'

with open(file_name, 'w', encoding='utf-8') as file:
    file.write('\n'.join(intent_cat_list))  # one label per line


In [8]:
# Keep only the columns used downstream, then drop the rows that have no
# entity annotation (those rows were given '' as their ENTITY_SLOT value).
total_dataset = r_food_d[['SENTENCE','intent_cat','TOKENIZED_SENTENCE','ENCODED_SENTENCE','ENTITY_SLOT']]
total_dataset = total_dataset[total_dataset['ENTITY_SLOT']!='']
from sklearn.model_selection import train_test_split

# First split: 80% (train + dev) / 20% test, with a fixed random_state.
train_dataset, test_dataset = train_test_split(total_dataset, train_size= 0.8, random_state=42)

# Second split: 75% of the remaining 80% becomes train and the rest dev,
# i.e. roughly 60% / 20% / 20% of the annotated rows overall.
train_dataset, dev_dataset = train_test_split(train_dataset, train_size= 0.75, random_state=42)

In [9]:
# Input sentences of each split, one sentence per line in
# ./data/<split>/seq.in.txt.
train_seq_in = train_dataset['SENTENCE'].tolist()
test_seq_in = test_dataset['SENTENCE'].tolist()
dev_seq_in = dev_dataset['SENTENCE'].tolist()

for split_name, split_sentences in (('train', train_seq_in),
                                    ('test', test_seq_in),
                                    ('dev', dev_seq_in)):
    file_name = './data/' + split_name + '/seq.in.txt'
    # Create the split directory on a fresh checkout instead of failing.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    # Explicit UTF-8: these files are read back with encoding='utf-8', and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write('\n'.join(split_sentences))

In [10]:
# Intent label of each example, one label per line in ./data/<split>/label.txt.
train_label = train_dataset['intent_cat'].tolist()
test_label = test_dataset['intent_cat'].tolist()
dev_label = dev_dataset['intent_cat'].tolist()

for split_name, split_labels in (('train', train_label),
                                 ('test', test_label),
                                 ('dev', dev_label)):
    file_name = './data/' + split_name + '/label.txt'
    # Create the split directory on a fresh checkout instead of failing.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    # Explicit UTF-8: these files are read back with encoding='utf-8', and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write('\n'.join(split_labels))

In [11]:
# Slot tags of each example: the tags of one sentence are joined with single
# spaces, one sentence per line in ./data/<split>/seq.out.txt.
train_seq_out = [' '.join(seq_out_list) for seq_out_list in train_dataset['ENTITY_SLOT']]
test_seq_out = [' '.join(seq_out_list) for seq_out_list in test_dataset['ENTITY_SLOT']]
dev_seq_out = [' '.join(seq_out_list) for seq_out_list in dev_dataset['ENTITY_SLOT']]

for split_name, split_tag_lines in (('train', train_seq_out),
                                    ('test', test_seq_out),
                                    ('dev', dev_seq_out)):
    file_name = './data/' + split_name + '/seq.out.txt'
    # Create the split directory on a fresh checkout instead of failing.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    # Explicit UTF-8: these files are read back with encoding='utf-8', and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write('\n'.join(split_tag_lines))

In [12]:
# Ids (not token strings) of the tokenizer's special tokens.
cls_token = tokenizer.cls_token_id
sep_token = tokenizer.sep_token_id
unk_token = tokenizer.unk_token_id
pad_token_id = tokenizer.pad_token_id
# Number of positions reserved per sequence for [CLS] and [SEP].
special_tokens_count = 2

# Show the four ids for a quick sanity check.
print(cls_token, sep_token, unk_token, pad_token_id)

In [14]:
from transformers import ElectraConfig,BertConfig

In [15]:
# Hyper-parameters and paths shared by the training and prediction cells.
# This is a plain dict: read values with args['key'].
args = dict(
    # reproducibility / batching
    seed=1234,
    train_batch_size=10,
    eval_batch_size=10,
    max_seq_len=55,
    # optimisation
    learning_rate=5e-5,
    num_train_epochs=5,
    weight_decay=0.0,
    gradient_accumulation_steps=1,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    max_steps=-1,
    warmup_steps=0,
    dropout_rate=0.1,
    # logging / checkpointing cadence (in optimisation steps)
    logging_steps=200,
    save_steps=200,
    # slot head
    use_crf=True,
    slot_loss_coef=1.0,
    ignore_index=0,
    # model selection and output location
    model_type='electra',
    model_name_or_path='monologg/koelectra-base-v3-discriminator',
    pretrained_model_name_or_path='monologg/koelectra-base-v3-discriminator',
    model_dir='./data/model_save/',
)

In [16]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertConfig, AdamW, get_linear_schedule_with_warmup,ElectraConfig
from tqdm import tqdm, trange
from transformers import ElectraModel, ElectraTokenizer

class ElectraPooler(nn.Module):
    """Pool a sequence by transforming the hidden state of its first token.

    Args:
        config: object exposing ``hidden_size``; the dense layer maps
            ``hidden_size`` to ``hidden_size``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return ``tanh(dense(hidden_states[:, 0]))``."""
        # Index 0 along the second dimension selects the first token of
        # every sequence in the batch.
        leading_token = hidden_states[:, 0]
        return self.activation(self.dense(leading_token))

class JointElectra(ElectraModel):
    """ELECTRA encoder with a joint intent-classification and slot-tagging head.

    Args:
        electra_path: name or path of the pretrained ELECTRA checkpoint whose
            weights are loaded into the encoder. When falsy, the encoder is
            built from ``config`` with freshly initialised weights.
        config: ``ElectraConfig`` describing the encoder.
        args: dict of hyper-parameters; this class reads ``dropout_rate``,
            ``use_crf``, ``ignore_index`` and ``slot_loss_coef``.
        intent_label_lst: list of intent labels (only its length is used).
        slot_label_lst: list of slot labels (only its length is used).

    NOTE(review): the parent ``ElectraModel.__init__`` is invoked as well, so
    the instance appears to carry the parent's own encoder modules in addition
    to ``self.electra``; only ``self.electra`` is used in ``forward``. Confirm
    whether that duplication is intended.
    """

    def __init__(self,electra_path, config, args, intent_label_lst, slot_label_lst):
        super(JointElectra,self).__init__(config)
        self.args = args
        self.num_intent_labels = len(intent_label_lst)
        self.num_slot_labels = len(slot_label_lst)

        # Load the pretrained encoder weights. Building ElectraModel from the
        # config alone only creates the architecture with random weights, so
        # fine-tuning would effectively start from scratch.
        if electra_path:
            self.electra = ElectraModel.from_pretrained(electra_path, config=config)
        else:
            self.electra = ElectraModel(config=self.config)
        self.pooler = ElectraPooler(config)

        self.intent_classifier = IntentClassifier(config.hidden_size, self.num_intent_labels, args['dropout_rate'])
        self.slot_classifier = SlotClassifier(config.hidden_size, self.num_slot_labels, args['dropout_rate'])

        if args['use_crf']:
            self.crf = CRF(num_tags=self.num_slot_labels, batch_first=True)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
                intent_label_ids=None, slot_labels_ids=None):
        """Run the encoder and both heads.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: (Optional) mask passed to the encoder; also used to
                restrict the slot loss to attended positions.
            token_type_ids: (Optional) segment ids passed to the encoder.
            intent_label_ids: (Optional) gold intent ids; adds the intent loss.
            slot_labels_ids: (Optional) gold slot ids; adds the slot loss.

        Returns:
            ``(total_loss, (intent_logits, slot_logits), *extras)`` where
            ``extras`` are whatever the encoder returned after its sequence
            output (hidden states / attentions when enabled). ``total_loss``
            is the int ``0`` when no labels are given.
        """
        outputs = self.electra(input_ids, attention_mask=attention_mask,
                            token_type_ids=token_type_ids)  # sequence_output, (hidden_states), (attentions)
        sequence_output = outputs[0]
        pooled_output = self.pooler(sequence_output)  # [CLS]

        intent_logits = self.intent_classifier(pooled_output)
        slot_logits = self.slot_classifier(sequence_output)

        total_loss = 0
        # 1. Intent Softmax
        if intent_label_ids is not None:
            if self.num_intent_labels == 1:
                # Single output -> regression; MSELoss needs float targets.
                intent_loss_fct = nn.MSELoss()
                intent_loss = intent_loss_fct(intent_logits.view(-1), intent_label_ids.view(-1).float())
            else:
                intent_loss_fct = nn.CrossEntropyLoss()
                intent_loss = intent_loss_fct(intent_logits.view(-1, self.num_intent_labels), intent_label_ids.view(-1))
            total_loss += intent_loss

        # 2. Slot Softmax
        if slot_labels_ids is not None:
            if self.args['use_crf']:
                crf_mask = attention_mask.byte() if attention_mask is not None else None
                slot_loss = self.crf(slot_logits, slot_labels_ids, mask=crf_mask, reduction='mean')
                slot_loss = -1 * slot_loss  # negative log-likelihood
            else:
                slot_loss_fct = nn.CrossEntropyLoss(ignore_index=self.args['ignore_index'])
                # Only keep active parts of the loss
                if attention_mask is not None:
                    active_loss = attention_mask.view(-1) == 1
                    active_logits = slot_logits.view(-1, self.num_slot_labels)[active_loss]
                    active_labels = slot_labels_ids.view(-1)[active_loss]
                    slot_loss = slot_loss_fct(active_logits, active_labels)
                else:
                    slot_loss = slot_loss_fct(slot_logits.view(-1, self.num_slot_labels), slot_labels_ids.view(-1))
            total_loss += self.args['slot_loss_coef'] * slot_loss

        # ElectraModel has no pooled output, so everything after index 0 is
        # already hidden states / attentions. (Slicing from index 2, as done
        # for BERT, would drop the hidden states.)
        outputs = ((intent_logits, slot_logits),) + tuple(outputs[1:])

        outputs = (total_loss,) + outputs

        return outputs  # (loss), logits, (hidden_states), (attentions) # Logits is a tuple of intent and slot logits

In [17]:
from transformers import ElectraConfig, ElectraModel, ElectraTokenizer

# Maps a model-type key to its (config class, model class, tokenizer class).
MODEL_CLASSES = {
    'electra': (ElectraConfig, JointElectra, ElectraTokenizer)
    }

# Maps a model-type key to the checkpoint name used for that model type.
MODEL_PATH_MAP = {
    'electra': 'monologg/koelectra-base-v3-discriminator'
}

In [18]:
def get_intent_labels(args):
    """Read the intent label vocabulary, one stripped label per line.

    Args:
        args: configuration. If it provides ``intent_label_file`` (as a dict
            key or as an attribute) that path is read; otherwise the default
            ``data/snips/intent_label.txt`` is used.

    Returns:
        list of label strings in file order.
    """
    default_path = os.path.join('data', 'snips', 'intent_label.txt')
    if isinstance(args, dict):
        label_path = args.get('intent_label_file', default_path)
    else:
        label_path = getattr(args, 'intent_label_file', default_path)
    # The context manager closes the file handle deterministically.
    with open(label_path, 'r', encoding='utf-8') as label_file:
        return [label.strip() for label in label_file]


def get_slot_labels(args):
    """Read the slot label vocabulary, one stripped label per line.

    Args:
        args: configuration. If it provides ``slot_label_file`` (as a dict key
            or as an attribute) that path is read; otherwise the default
            ``data/snips/slot_label.txt`` is used.

    Returns:
        list of label strings in file order.
    """
    default_path = os.path.join('data', 'snips', 'slot_label.txt')
    if isinstance(args, dict):
        label_path = args.get('slot_label_file', default_path)
    else:
        label_path = getattr(args, 'slot_label_file', default_path)
    # The context manager closes the file handle deterministically.
    with open(label_path, 'r', encoding='utf-8') as label_file:
        return [label.strip() for label in label_file]

In [19]:
class Trainer(object):
    """Fine-tunes a ``JointElectra`` model on a tensor dataset.

    Args:
        args: dict of hyper-parameters (read with ``args['key']``).
        train_dataset / dev_dataset / test_dataset: datasets whose items are
            ``(input_ids, attention_mask, token_type_ids, intent_label_ids,
            slot_labels_ids)`` tensors, in that order.
    """

    def __init__(self, args, train_dataset=None, dev_dataset=None, test_dataset=None):
        self.args = args
        self.train_dataset = train_dataset
        self.dev_dataset = dev_dataset
        self.test_dataset = test_dataset

        self.intent_label_lst = get_intent_labels(args)
        self.slot_label_lst = get_slot_labels(args)
        # Padding label id: the index of 'PAD' in the slot vocabulary when it
        # is present; otherwise the previously hard-coded value 74.
        # NOTE(review): 74 is kept only as a legacy fallback -- confirm it.
        if 'PAD' in self.slot_label_lst:
            self.pad_token_label_id = self.slot_label_lst.index('PAD')
        else:
            self.pad_token_label_id = 74
        # args is a dict, so the optional task name is read with .get()
        # (attribute access would raise AttributeError).
        self.config = ElectraConfig.from_pretrained(args['model_name_or_path'], finetuning_task=args.get('task'))
        self.model = JointElectra(args['model_name_or_path'],
                                  config=self.config,
                                  args=args,
                                  intent_label_lst=self.intent_label_lst,
                                  slot_label_lst=self.slot_label_lst)

        # GPU or CPU. The second GPU is used when there is one; on single-GPU
        # machines "cuda:1" would be an invalid device, so fall back to cuda:0.
        if torch.cuda.is_available():
            self.device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
        else:
            self.device = "cpu"
        self.model.to(self.device)

    def train(self):
        """Run the training loop.

        Returns:
            ``(global_step, mean_loss)``; ``mean_loss`` is ``0.0`` when no
            optimisation step was taken.
        """
        train_sampler = RandomSampler(self.train_dataset)
        train_dataloader = DataLoader(self.train_dataset, sampler=train_sampler, batch_size=self.args['train_batch_size'])

        # At least 1 so that the epoch computation cannot divide by zero when
        # an epoch has fewer batches than the accumulation factor.
        steps_per_epoch = max(1, len(train_dataloader) // self.args['gradient_accumulation_steps'])
        if self.args['max_steps'] > 0:
            t_total = self.args['max_steps']
            # Kept local so the shared args dict is not modified by training.
            num_train_epochs = self.args['max_steps'] // steps_per_epoch + 1
        else:
            num_train_epochs = int(self.args['num_train_epochs'])
            t_total = len(train_dataloader) // self.args['gradient_accumulation_steps'] * self.args['num_train_epochs']

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': self.args['weight_decay']},
            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.args['learning_rate'], eps=self.args['adam_epsilon'])
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.args['warmup_steps'], num_training_steps=t_total)

        # Train!
        global_step = 0
        tr_loss = 0.0
        self.model.zero_grad()

        train_iterator = trange(int(num_train_epochs), desc="Epoch")

        for _ in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batchs in enumerate(epoch_iterator):
                # Re-enable training mode every step: evaluate() switches the
                # model to eval mode.
                self.model.train()
                batch = tuple(t.to(self.device) for t in batchs)  # GPU or CPU

                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'intent_label_ids': batch[3],
                          'slot_labels_ids': batch[4]}
                if self.args['model_type'] != 'distilbert':
                    inputs['token_type_ids'] = batch[2]
                outputs = self.model(**inputs)
                loss = outputs[0]

                if self.args['gradient_accumulation_steps'] > 1:
                    loss = loss / self.args['gradient_accumulation_steps']

                loss.backward()

                tr_loss += loss.item()
                if (step + 1) % self.args['gradient_accumulation_steps'] == 0:
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args['max_grad_norm'])

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    self.model.zero_grad()
                    global_step += 1

                    if self.args['logging_steps'] > 0 and global_step % self.args['logging_steps'] == 0:
                        self.evaluate("dev")

                    if self.args['save_steps'] > 0 and global_step % self.args['save_steps'] == 0:
                        self.save_model()

                if 0 < self.args['max_steps'] < global_step:
                    epoch_iterator.close()
                    break

            if 0 < self.args['max_steps'] < global_step:
                train_iterator.close()
                break

        mean_loss = tr_loss / global_step if global_step else 0.0
        return global_step, mean_loss

    def evaluate(self, mode):
        """Compute the mean loss on the 'dev' or 'test' dataset.

        Args:
            mode: ``'dev'`` or ``'test'``.

        Returns:
            ``{'loss': mean_loss}``, or ``None`` when that dataset was not
            provided.

        Raises:
            ValueError: if ``mode`` is neither ``'dev'`` nor ``'test'``.
        """
        if mode == 'dev':
            dataset = self.dev_dataset
        elif mode == 'test':
            dataset = self.test_dataset
        else:
            raise ValueError("Only dev and test dataset available")
        if dataset is None:
            return None

        eval_dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset),
                                     batch_size=self.args['eval_batch_size'])
        self.model.eval()
        eval_loss = 0.0
        nb_eval_steps = 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(self.device) for t in batch)
            with torch.no_grad():
                outputs = self.model(input_ids=batch[0],
                                     attention_mask=batch[1],
                                     token_type_ids=batch[2],
                                     intent_label_ids=batch[3],
                                     slot_labels_ids=batch[4])
            eval_loss += outputs[0].item()
            nb_eval_steps += 1

        results = {'loss': eval_loss / nb_eval_steps if nb_eval_steps else 0.0}
        tqdm.write("[%s] loss = %.4f" % (mode, results['loss']))
        return results

    def save_model(self):
        """Save the model and the training arguments to ``args['model_dir']`` (overwrite)."""
        os.makedirs(self.args['model_dir'], exist_ok=True)
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
        model_to_save.save_pretrained(self.args['model_dir'])

        # Save training arguments together with the trained model
        torch.save(self.args, os.path.join(self.args['model_dir'], 'training_args.bin'))
        tqdm.write("Saving model checkpoint to %s" % self.args['model_dir'])

In [20]:
def _read_stripped_lines(path):
    """Return the lines of a UTF-8 text file with surrounding whitespace stripped."""
    # The context manager closes the handle; a bare open() inside a
    # comprehension leaves that to the garbage collector.
    with open(path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]


input_text_file = './data/train/seq.in.txt'
intent_label_file = './data/train/label.txt'
slot_labels_file = './data/train/seq.out.txt'

# Label vocabularies (one label per line).
intent_label_lst = _read_stripped_lines('./data/rt_intent_label.txt')

slot_label_lst = _read_stripped_lines('./data/rt_slot_label.txt')

# Line-aligned training files: sentence, intent label, space-separated slot tags.
texts = _read_stripped_lines(input_text_file)

intents = _read_stripped_lines(intent_label_file)

slots = _read_stripped_lines(slot_labels_file)

# The three files are consumed with zip(), which silently truncates to the
# shortest one; a length mismatch means sentences and labels are misaligned.
assert len(texts) == len(intents) == len(slots), \
    "Misaligned training files: {} sentences, {} intents, {} slot lines".format(len(texts), len(intents), len(slots))

In [21]:
def _lookup_label_id(label, vocabulary):
    """Return the position of ``label`` in ``vocabulary``, or that of "UNK" if absent."""
    return vocabulary.index(label) if label in vocabulary else vocabulary.index("UNK")


set_type = 'train'

# One InputExample per aligned (sentence, intent, slot-tags) line:
#   words        -> token ids of the sentence
#   intent_label -> index of the intent in intent_label_lst
#   slot_labels  -> index of every whitespace-separated tag in slot_label_lst
examples = []

for position, (sentence, intent_name, slot_line) in enumerate(zip(texts, intents, slots)):
    examples.append(
        InputExample(
            guid="%s-%s" % (set_type, position),
            words=tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence)),
            intent_label=_lookup_label_id(intent_name, intent_label_lst),
            slot_labels=[_lookup_label_id(tag, slot_label_lst) for tag in slot_line.split()],
        )
    )

In [22]:
# Rebuild the tokenizer from the checkpoint name stored in args; from here on
# `tokenizer` refers to this instance.
tokenizer = ElectraTokenizer.from_pretrained(args['model_name_or_path'])

In [23]:
# Settings used when converting examples into fixed-length features.
# NOTE(review): the training features use a length of 40 here, while
# args['max_seq_len'] (used by convert_input_file_to_tensor_dataset at
# prediction time) is 55 -- confirm whether the difference is intended.
max_seq_len = 40
# Slot label id used for [CLS], [SEP] and padding positions.
pad_token_label_id=slot_label_lst.index("PAD")
# Segment ids: every position (CLS, sentence tokens, padding) uses segment 0.
cls_token_segment_id=0
pad_token_segment_id=0
sequence_a_segment_id=0
# When True the attention mask is 1 for real tokens and 0 for padding.
mask_padding_with_zero=True

# Ids (not token strings) of the tokenizer's special tokens.
cls_token = tokenizer.cls_token_id
sep_token = tokenizer.sep_token_id
unk_token = tokenizer.unk_token_id
pad_token_id = tokenizer.pad_token_id

In [24]:
# Convert every example into fixed-length model inputs:
#   [CLS] tokens... [SEP] <padding up to max_seq_len>
features = []

# Account for [CLS] and [SEP]
special_tokens_count = 2
max_content_len = max_seq_len - special_tokens_count

for (ex_index, example) in enumerate(examples):

    # Work on copies (slicing creates new lists). Appending [SEP] to
    # example.words / example.slot_labels in place would modify the examples,
    # so re-running this cell would add another [SEP] each time.
    tokens = example.words[:max_content_len]
    slot_labels_ids = example.slot_labels[:max_content_len]

    # Add [SEP] token
    tokens = tokens + [sep_token]
    slot_labels_ids = slot_labels_ids + [pad_token_label_id]
    token_type_ids = [sequence_a_segment_id] * len(tokens)

    # Add [CLS] token
    tokens = [cls_token] + tokens
    slot_labels_ids = [pad_token_label_id] + slot_labels_ids
    token_type_ids = [cls_token_segment_id] + token_type_ids

    input_ids = tokens

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding_length = max_seq_len - len(input_ids)
    input_ids = input_ids + ([pad_token_id] * padding_length)
    attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
    token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
    slot_labels_ids = slot_labels_ids + ([pad_token_label_id] * padding_length)

    # The slot-label check also catches examples whose number of slot labels
    # differs from their number of tokens.
    assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(len(input_ids), max_seq_len)
    assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
    assert len(token_type_ids) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_ids), max_seq_len)
    assert len(slot_labels_ids) == max_seq_len, "Error with slot labels length {} vs {}".format(len(slot_labels_ids), max_seq_len)

    intent_label_id = int(example.intent_label)

    features.append(
        InputFeatures(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids,
                        intent_label_id=intent_label_id,
                        slot_labels_ids=slot_labels_ids
                        ))

In [25]:
# Build the joint intent / slot model from the ELECTRA config and the two
# label vocabularies.
model_config = ElectraConfig.from_pretrained(electra_path, finetuning_task='train')
model = JointElectra(
    electra_path,
    config=model_config,
    args=args,
    intent_label_lst=intent_label_lst,
    slot_label_lst=slot_label_lst,
)


In [26]:
# Use the GPU when one is available unless args contains a truthy 'no_cuda'.
# args is a dict, so the flag is read with .get(); attribute access
# (args.no_cuda) raises AttributeError on a dict.
device = "cuda" if torch.cuda.is_available() and not args.get('no_cuda', False) else "cpu"
model.to(device)

In [27]:
def save_model():
    """Save the global ``model`` and ``args`` to ``args['model_dir']`` (overwrite).

    Writes the model with ``save_pretrained`` and the argument dict to
    ``training_args.bin`` in the same directory.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(args['model_dir'], exist_ok=True)
    # Unwrap a wrapper exposing `.module` (e.g. DataParallel) before saving.
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(args['model_dir'])

    # Save training arguments together with the trained model
    torch.save(args, os.path.join(args['model_dir'], 'training_args.bin'))
    # print() does not interpolate logger-style arguments, so format explicitly
    # (otherwise the literal "%s" is printed).
    print("Saving model checkpoint to %s" % args['model_dir'])

In [28]:
def _feature_column_as_long(attribute):
    """Stack one attribute of every entry in ``features`` into a long tensor."""
    return torch.tensor([getattr(feature, attribute) for feature in features], dtype=torch.long)


# One tensor per feature field, each with one row per example.
all_input_ids = _feature_column_as_long('input_ids')
all_attention_mask = _feature_column_as_long('attention_mask')
all_token_type_ids = _feature_column_as_long('token_type_ids')
all_intent_label_ids = _feature_column_as_long('intent_label_id')
all_slot_labels_ids = _feature_column_as_long('slot_labels_ids')

# Field order matters: the training loop reads the batch by position.
torch_train_dataset = TensorDataset(
    all_input_ids,
    all_attention_mask,
    all_token_type_ids,
    all_intent_label_ids,
    all_slot_labels_ids,
)

In [29]:
# Module-level logger used by the training cell for its progress messages.
logger = logging.getLogger(__name__)

In [30]:
train_sampler = RandomSampler(torch_train_dataset)
train_dataloader = DataLoader(torch_train_dataset, sampler=train_sampler, batch_size=args['train_batch_size'])

# Number of optimisation steps per epoch; at least 1 so the epoch computation
# below cannot divide by zero when an epoch has fewer batches than the
# accumulation factor.
steps_per_epoch = max(1, len(train_dataloader) // args['gradient_accumulation_steps'])

if args['max_steps'] > 0:
    t_total = args['max_steps']
    num_train_epochs = args['max_steps'] // steps_per_epoch + 1
else:
    num_train_epochs = int(args['num_train_epochs'])
    t_total = len(train_dataloader) // args['gradient_accumulation_steps'] * args['num_train_epochs']

# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        'weight_decay': args['weight_decay']},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args['warmup_steps'], num_training_steps=t_total)

# Train!
logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(texts))
logger.info("  Num Epochs = %d", num_train_epochs)
logger.info("  Total train batch size = %d", args['train_batch_size'])
logger.info("  Gradient Accumulation steps = %d", args['gradient_accumulation_steps'])
logger.info("  Total optimization steps = %d", t_total)
logger.info("  Logging steps = %d", args['logging_steps'])
logger.info("  Save steps = %d", args['save_steps'])

global_step = 0
tr_loss = 0.0
model.zero_grad()

# Iterate over the computed number of epochs, so that a positive
# args['max_steps'] is honoured even when it needs more epochs than
# args['num_train_epochs'].
train_iterator = trange(int(num_train_epochs), desc="Epoch")

for _ in train_iterator:
    epoch_iterator = tqdm(train_dataloader, desc="Iteration")
    for step, batch in enumerate(epoch_iterator):
        model.train()
        batch = tuple(t.to(device) for t in batch)  # GPU or CPU

        # Batch layout: input_ids, attention_mask, token_type_ids,
        # intent_label_ids, slot_labels_ids.
        inputs = {'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'intent_label_ids': batch[3],
                    'slot_labels_ids': batch[4]}
        if args['model_type'] != 'distilbert':
            inputs['token_type_ids'] = batch[2]
        outputs = model(**inputs)
        loss = outputs[0]

        if args['gradient_accumulation_steps'] > 1:
            loss = loss / args['gradient_accumulation_steps']

        loss.backward()

        tr_loss += loss.item()
        if (step + 1) % args['gradient_accumulation_steps'] == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm'])

            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

            # NOTE: no dev-set evaluation is run here; only checkpoints are
            # written every args['save_steps'] optimisation steps.
            if args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
                save_model()

        if 0 < args['max_steps'] < global_step:
            epoch_iterator.close()
            break

    if 0 < args['max_steps'] < global_step:
        train_iterator.close()
        break

# After the loop, the mean training loss is tr_loss / global_step
# (only defined when global_step > 0).

In [31]:
def read_input_file(pred_file_dir):
    """Read a UTF-8 text file and return its lines, each stripped of surrounding whitespace.

    Args:
        pred_file_dir: path of the text file to read.

    Returns:
        list of str, one entry per line of the file (blank lines become '').
    """
    with open(pred_file_dir, "r", encoding="utf-8") as f:
        return [raw_line.strip() for raw_line in f]

In [32]:
def convert_input_file_to_tensor_dataset(lines,
                                         args,
                                         tokenizer,
                                         pad_token_label_id=73,
                                         cls_token_segment_id=0,
                                         pad_token_segment_id=0,
                                         sequence_a_segment_id=0,
                                         mask_padding_with_zero=True):
    """Turn raw sentences into a padded ``TensorDataset`` for prediction.

    Every sentence becomes ``[CLS] tokens... [SEP]`` padded (or truncated) to
    ``args['max_seq_len']`` positions.

    Args:
        lines: iterable of sentences (str).
        args: mapping providing ``'max_seq_len'``.
        tokenizer: object providing ``tokenize``, ``convert_tokens_to_ids`` and
            the ``cls_token_id`` / ``sep_token_id`` / ``pad_token_id`` attributes.
        pad_token_label_id: value written to the slot-label mask at [CLS],
            [SEP] and padding positions; real tokens get ``pad_token_label_id + 1``.
        cls_token_segment_id, pad_token_segment_id, sequence_a_segment_id:
            segment ids for the [CLS] token, padding and sentence tokens.
        mask_padding_with_zero: when True the attention mask is 1 for real
            tokens and 0 for padding; when False the values are swapped.

    Returns:
        ``TensorDataset(input_ids, attention_mask, token_type_ids,
        slot_label_mask)``, all ``torch.long``.
    """
    # Setting based on the current model type
    cls_token = tokenizer.cls_token_id
    sep_token = tokenizer.sep_token_id
    pad_token_id = tokenizer.pad_token_id

    # Account for [CLS] and [SEP]
    special_tokens_count = 2
    max_content_len = args['max_seq_len'] - special_tokens_count

    all_input_ids = []
    all_attention_mask = []
    all_token_type_ids = []
    all_slot_label_mask = []

    # No per-sentence printing here: echoing every input line floods the
    # notebook output on a full test file.
    for words in lines:
        split_words = tokenizer.tokenize(words)
        tokens = tokenizer.convert_tokens_to_ids(split_words)
        # Real tokens are marked with a value different from the pad label id
        # so they can be told apart from special / padding positions later.
        slot_label_mask = [pad_token_label_id + 1] * len(tokens)

        if len(tokens) > max_content_len:
            tokens = tokens[:max_content_len]
            slot_label_mask = slot_label_mask[:max_content_len]

        # Add [SEP] token
        tokens += [sep_token]
        token_type_ids = [sequence_a_segment_id] * len(tokens)
        slot_label_mask += [pad_token_label_id]

        # Add [CLS] token
        tokens = [cls_token] + tokens
        token_type_ids = [cls_token_segment_id] + token_type_ids
        slot_label_mask = [pad_token_label_id] + slot_label_mask

        input_ids = tokens

        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = args['max_seq_len'] - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
        slot_label_mask = slot_label_mask + ([pad_token_label_id] * padding_length)

        all_input_ids.append(input_ids)
        all_attention_mask.append(attention_mask)
        all_token_type_ids.append(token_type_ids)
        all_slot_label_mask.append(slot_label_mask)

    # Change to Tensor
    all_input_ids = torch.tensor(all_input_ids, dtype=torch.long)
    all_attention_mask = torch.tensor(all_attention_mask, dtype=torch.long)
    all_token_type_ids = torch.tensor(all_token_type_ids, dtype=torch.long)
    all_slot_label_mask = torch.tensor(all_slot_label_mask, dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_slot_label_mask)

    return dataset

In [33]:
# Line-aligned test files: sentence, intent label, space-separated slot tags.
test_input_text_file = './data/test/seq.in.txt'
test_intent_label_file = './data/test/label.txt'
test_slot_labels_file = './data/test/seq.out.txt'

# Label vocabularies (one label per line). The context managers close the
# file handles; a bare open() inside a comprehension leaves that to the
# garbage collector.
with open('./data/rt_intent_label.txt', 'r', encoding='utf-8') as label_file:
    intent_label_lst = [label.strip() for label in label_file]

with open('./data/rt_slot_label.txt', 'r', encoding='utf-8') as label_file:
    slot_label_lst = [label.strip() for label in label_file]

In [35]:
lines = read_input_file(test_input_text_file)
dataset = convert_input_file_to_tensor_dataset(lines, args, tokenizer, pad_token_label_id)

# Predict
sampler = SequentialSampler(dataset)
data_loader = DataLoader(dataset, sampler=sampler, batch_size=args['eval_batch_size'])

# Switch to inference mode. The training loop leaves the model in train mode,
# in which dropout stays active and makes the predictions noisy.
model.eval()

# Per-batch results are collected in lists and concatenated once at the end
# (np.append copies the whole accumulated array on every call).
intent_logit_batches = []
slot_pred_batches = []
slot_mask_batches = []

for batch in tqdm(data_loader, desc="Predicting"):
    batch = tuple(t.to(device) for t in batch)
    with torch.no_grad():
        inputs = {"input_ids": batch[0],
                    "attention_mask": batch[1],
                    "intent_label_ids": None,
                    "slot_labels_ids": None}
        if args['model_type'] != "distilbert":
            inputs["token_type_ids"] = batch[2]
        outputs = model(**inputs)
        _, (intent_logits, slot_logits) = outputs[:2]

        # Intent Prediction
        intent_logit_batches.append(intent_logits.detach().cpu().numpy())

        # Slot prediction
        if args['use_crf']:
            # decode() in `torchcrf` returns list with best index directly
            slot_pred_batches.append(np.array(model.crf.decode(slot_logits)))
        else:
            slot_pred_batches.append(slot_logits.detach().cpu().numpy())
        slot_mask_batches.append(batch[3].detach().cpu().numpy())

if not intent_logit_batches:
    raise ValueError("No input lines to predict on: {}".format(test_input_text_file))

intent_preds = np.argmax(np.concatenate(intent_logit_batches, axis=0), axis=1)
slot_preds = np.concatenate(slot_pred_batches, axis=0)
all_slot_label_mask = np.concatenate(slot_mask_batches, axis=0)

if not args['use_crf']:
    slot_preds = np.argmax(slot_preds, axis=2)

slot_label_map = {i: label for i, label in enumerate(slot_label_lst)}
slot_preds_list = [[] for _ in range(slot_preds.shape[0])]

# Keep only the predictions at real-token positions: the mask holds
# pad_token_label_id at [CLS], [SEP] and padding positions.
for i in range(slot_preds.shape[0]):
    for j in range(slot_preds.shape[1]):
        if all_slot_label_mask[i, j] != pad_token_label_id:
            slot_preds_list[i].append(slot_label_map[slot_preds[i][j]])

In [36]:
# Write to output file: one line per sentence in the form
#   <intent> -> tok [tok:SLOT] tok ...
# Tokens predicted as 'O' are written bare; the others as [token:label].
# The loop variables are deliberately not named `slot_preds` / `words`, so the
# global prediction array from the previous cell is not overwritten and the
# cells stay re-runnable.
with open('./result.txt', "w", encoding="utf-8") as f:
    for sentence, sentence_slot_preds, intent_pred in zip(lines, slot_preds_list, intent_preds):
        tagged_tokens = []
        # zip() stops at the shorter sequence, so tokens beyond the truncated
        # prediction length are not written.
        for word, pred in zip(tokenizer.tokenize(sentence), sentence_slot_preds):
            if pred == 'O':
                tagged_tokens.append(word)
            else:
                tagged_tokens.append("[{}:{}]".format(word, pred))
        f.write("<{}> -> {}\n".format(intent_label_lst[intent_pred], " ".join(tagged_tokens)))