# INSTALL PACKAGES

In [None]:
!pip install transformers
!pip install pytorch-crf
!pip install seqeval
!pip install easydict 



# MODELS

In [None]:

import torch
import torch
from transformers import XLMRobertaConfig, XLMRobertaModel, XLMRobertaTokenizer
from torchcrf import CRF
import torch.nn as nn
class IntentClassifier(nn.Module):
    """Intent head: dropout followed by one linear projection to intent logits.

    Args:
        input_dim: size of the last dimension of the incoming features.
        num_intent_labels: number of logits produced per input vector.
        dropout_rate: probability handed to ``nn.Dropout`` (default ``0.``).
    """

    def __init__(self, input_dim, num_intent_labels, dropout_rate=0.):
        super().__init__()
        # The attribute name ``linear`` determines the parameter keys in the
        # state dict, so both attribute names are kept unchanged.
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(input_dim, num_intent_labels)

    def forward(self, x):
        """Apply dropout to ``x`` and project it to intent logits."""
        return self.linear(self.dropout(x))


class SlotClassifier(nn.Module):
    """Slot head: dropout followed by one linear projection to slot logits.

    Args:
        input_dim: size of the last dimension of the incoming features.
        num_slot_labels: number of logits produced per input vector.
        dropout_rate: probability handed to ``nn.Dropout`` (default ``0.``).
    """

    def __init__(self, input_dim, num_slot_labels, dropout_rate=0.):
        super().__init__()
        # The attribute name ``linear`` determines the parameter keys in the
        # state dict, so both attribute names are kept unchanged.
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(input_dim, num_slot_labels)

    def forward(self, x):
        """Apply dropout to ``x`` and project it to slot logits."""
        return self.linear(self.dropout(x))


class JointXLMRoberta(XLMRobertaModel):
    """Joint intent-classification and slot-filling model on an XLM-RoBERTa encoder.

    Args:
        config: XLM-RoBERTa configuration; ``config.hidden_size`` sizes both heads.
        args: settings object; ``dropout_rate``, ``ignore_index`` and
            ``slot_loss_coef`` are read from it.
        intent_label_lst: list of intent labels (only its length is used).
        slot_label_lst: list of slot labels (only its length is used).
    """

    def __init__(self, config, args, intent_label_lst, slot_label_lst):
        super(JointXLMRoberta, self).__init__(config)
        self.args = args
        self.num_intent_labels = len(intent_label_lst)
        self.num_slot_labels = len(slot_label_lst)
        # NOTE(review): the parent constructor above already builds an encoder, and a
        # second XLMRobertaModel is created here; forward() only uses this second one.
        # Confirm that from_pretrained() loads the pretrained weights into
        # ``self.XLMRoberta`` and not only into the inherited modules. The attribute is
        # kept as-is because renaming/removing it would change the checkpoint keys.
        self.XLMRoberta = XLMRobertaModel(config=config)
        self.intent_classifier = IntentClassifier(input_dim=config.hidden_size,
                                                  num_intent_labels=self.num_intent_labels,
                                                  dropout_rate=args.dropout_rate)
        self.slot_classifier = SlotClassifier(input_dim=config.hidden_size,
                                              num_slot_labels=self.num_slot_labels,
                                              dropout_rate=args.dropout_rate)
        # No CRF layer is attached to this model.

    def forward(self, input_ids, attention_mask=None, intent_label_ids=None, slot_labels_ids=None):
        """Run the encoder and both heads, accumulating a loss for any labels given.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: optional mask; positions equal to 1 are the only ones
                that contribute to the slot loss.
            intent_label_ids: optional intent targets; when ``None`` no intent loss
                is added.
            slot_labels_ids: optional slot targets; when ``None`` no slot loss is
                added.

        Returns:
            ``(total_loss, (intent_logits, slot_logits))``. ``total_loss`` is the
            plain integer ``0`` when neither label tensor is supplied.
        """
        outputs = self.XLMRoberta(input_ids, attention_mask=attention_mask)
        # outputs[0] is the last hidden state. The sentence representation is the
        # hidden state at position 0 of that sequence (not the pooler output).
        sequence_output = outputs[0]
        pooled_output = sequence_output[:, 0]

        intent_logits = self.intent_classifier(pooled_output)
        slot_logits = self.slot_classifier(sequence_output)

        # Total loss = intent loss + slot_loss_coef * slot loss.
        total_loss = 0

        # 1. Intent loss
        if intent_label_ids is not None:
            intent_loss_fct = nn.CrossEntropyLoss()
            intent_loss = intent_loss_fct(intent_logits.view(-1, self.num_intent_labels),
                                          intent_label_ids.view(-1))
            total_loss += intent_loss

        # 2. Slot loss
        if slot_labels_ids is not None:
            slot_loss_fct = nn.CrossEntropyLoss(ignore_index=self.args.ignore_index)
            if attention_mask is not None:
                # Keep only the positions the attention mask marks as active.
                active_loss = attention_mask.view(-1) == 1
                active_logits = slot_logits.view(-1, self.num_slot_labels)[active_loss]
                active_labels = slot_labels_ids.view(-1)[active_loss]
                slot_loss = slot_loss_fct(active_logits, active_labels)
            else:
                slot_loss = slot_loss_fct(slot_logits.view(-1, self.num_slot_labels),
                                          slot_labels_ids.view(-1))

            total_loss += self.args.slot_loss_coef * slot_loss

        return (total_loss, (intent_logits, slot_logits))






# UTILS


In [None]:
import os
import random
import logging

import torch
import numpy as np
from seqeval.metrics import precision_score, recall_score, f1_score

from transformers import BertConfig, DistilBertConfig, AlbertConfig, XLMRobertaConfig
from transformers import BertTokenizer, DistilBertTokenizer, AlbertTokenizer, XLMRobertaTokenizer



# Registry of supported architectures:
# model-type key -> (config class, model class, tokenizer class).
# load_tokenizer() reads index 2; Trainer.__init__ unpacks the first two entries.
MODEL_CLASSES = {

    'xlmroberta': (XLMRobertaConfig, JointXLMRoberta, XLMRobertaTokenizer)
}

# Pretrained checkpoint identifiers keyed by model type.
# NOTE(review): only 'xlmroberta' has a matching MODEL_CLASSES entry; the other keys
# look like leftovers from a multi-model setup -- confirm before relying on them.
MODEL_PATH_MAP = {
    'bert': 'bert-base-uncased',
    'distilbert': 'distilbert-base-uncased',
    'albert': 'albert-xxlarge-v1', 
    'xlmroberta': 'xlm-roberta-base'
}


def get_intent_labels(args):
    """Read the intent label vocabulary, one label per line.

    The file is ``<args.data_dir>/<args.task>/<args.intent_label_file>``.

    Returns:
        list of str: each line with surrounding whitespace stripped, in file order.
    """
    label_path = os.path.join(args.data_dir, args.task, args.intent_label_file)
    # Context manager so the handle is closed deterministically, not leaked.
    with open(label_path, 'r', encoding='utf-8') as label_file:
        return [label.strip() for label in label_file]


def get_slot_labels(args):
    """Read the slot label vocabulary, one label per line.

    The file is ``<args.data_dir>/<args.task>/<args.slot_label_file>``.

    Returns:
        list of str: each line with surrounding whitespace stripped, in file order.
    """
    label_path = os.path.join(args.data_dir, args.task, args.slot_label_file)
    # Context manager so the handle is closed deterministically, not leaked.
    with open(label_path, 'r', encoding='utf-8') as label_file:
        return [label.strip() for label in label_file]


def load_tokenizer(args):
    """Instantiate the tokenizer registered for ``args.model_type``.

    The tokenizer class is the third entry of the ``MODEL_CLASSES`` tuple and is
    loaded from ``args.model_name_or_path``.
    """
    registry_entry = MODEL_CLASSES[args.model_type]
    tokenizer_class = registry_entry[2]
    return tokenizer_class.from_pretrained(args.model_name_or_path)


def init_logger():
    """Configure root logging at INFO level with a timestamped message format."""
    log_settings = {
        'format': '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        'datefmt': '%m/%d/%Y %H:%M:%S',
        'level': logging.INFO,
    }
    logging.basicConfig(**log_settings)


def set_seed(args):
    """Seed Python's ``random``, NumPy and PyTorch with ``args.seed``.

    CUDA generators are seeded too, unless ``args.no_cuda`` is truthy or CUDA is
    unavailable.
    """
    seed = args.seed
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if args.no_cuda:
        return
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def compute_metrics(intent_preds, intent_labels, slot_preds, slot_labels):
    """Merge intent accuracy, slot metrics and sentence-frame accuracy into one dict.

    All four inputs must have the same length (checked with an assertion).
    """
    assert len(intent_preds) == len(intent_labels) == len(slot_preds) == len(slot_labels)

    # All three partial results are computed first, then merged in this order.
    partial_results = (
        get_intent_acc(intent_preds, intent_labels),
        get_slot_metrics(slot_preds, slot_labels),
        get_sentence_frame_acc(intent_preds, intent_labels, slot_preds, slot_labels),
    )

    merged = {}
    for partial in partial_results:
        merged.update(partial)
    return merged


def get_slot_metrics(preds, labels):
    """Return slot precision, recall and F1 computed by the imported seqeval scorers.

    Each scorer is called as ``scorer(labels, preds)``.
    """
    assert len(preds) == len(labels)
    scorers = (
        ("slot_precision", precision_score),
        ("slot_recall", recall_score),
        ("slot_f1", f1_score),
    )
    return {metric_name: scorer(labels, preds) for metric_name, scorer in scorers}


def get_intent_acc(preds, labels):
    """Return ``{"intent_acc": ...}``, the mean of the element-wise ``preds == labels``.

    The comparison result must provide ``.mean()`` (e.g. NumPy arrays).
    """
    matches = preds == labels
    return {"intent_acc": matches.mean()}


def read_prediction_text(args):
    """Read the prediction input file, one text per line.

    The file is ``<args.pred_dir>/<args.pred_input_file>``.

    Returns:
        list of str: each line with surrounding whitespace stripped, in file order.
    """
    text_path = os.path.join(args.pred_dir, args.pred_input_file)
    # Context manager so the handle is closed deterministically, not leaked.
    with open(text_path, 'r', encoding='utf-8') as text_file:
        return [text.strip() for text in text_file]


def get_sentence_frame_acc(intent_preds, intent_labels, slot_preds, slot_labels):
    """Fraction of sentences whose intent and every slot tag are all predicted correctly.

    A sentence counts as correct only if its intent prediction equals the gold intent
    and each predicted slot tag equals the gold tag at the same position.
    """
    # Element-wise intent comparison.
    intent_hits = (intent_preds == intent_labels)

    def _all_slots_match(pred_seq, gold_seq):
        # True when no position disagrees; the sequences must be equally long.
        assert len(pred_seq) == len(gold_seq)
        return not any(p != g for p, g in zip(pred_seq, gold_seq))

    slot_hits = np.array([_all_slots_match(pred_seq, gold_seq)
                          for pred_seq, gold_seq in zip(slot_preds, slot_labels)])

    frame_acc = np.multiply(intent_hits, slot_hits).mean()
    return {"sementic_frame_acc": frame_acc}


# DATA LOADER

In [None]:
import os
import copy
import json
import logging

import torch
from torch.utils.data import TensorDataset

# from utils import get_intent_labels, get_slot_labels

logger = logging.getLogger(__name__)


class InputExample(object):
    """One raw training/test example for joint intent and slot prediction.

    Args:
        guid: Unique id for the example.
        words: list. The words of the sequence.
        intent_label: (Optional) The intent label of the example.
        slot_labels: (Optional) list. The slot labels of the example.
    """

    def __init__(self, guid, words, intent_label=None, slot_labels=None):
        self.guid, self.words = guid, words
        self.intent_label, self.slot_labels = intent_label, slot_labels

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dictionary."""
        return copy.deepcopy(self.__dict__)

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON ending in a newline."""
        serialized = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return serialized + "\n"


class InputFeatures(object):
    """Model-ready features for one example.

    Stores the token ids, attention mask, token type ids, intent label id and
    slot label ids exactly as passed in.
    """

    def __init__(self, input_ids, attention_mask, token_type_ids, intent_label_id, slot_labels_ids):
        self.input_ids, self.attention_mask = input_ids, attention_mask
        self.token_type_ids = token_type_ids
        self.intent_label_id, self.slot_labels_ids = intent_label_id, slot_labels_ids

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Return a deep copy of the instance attributes as a dictionary."""
        return copy.deepcopy(self.__dict__)

    def to_json_string(self):
        """Return the attributes as indented, key-sorted JSON ending in a newline."""
        serialized = json.dumps(self.to_dict(), indent=2, sort_keys=True)
        return serialized + "\n"


class JointProcessor(object):
    """Reads a joint intent/slot data set from disk and turns it into InputExample objects."""

    def __init__(self, args):
        self.args = args
        self.intent_labels = get_intent_labels(args)
        self.slot_labels = get_slot_labels(args)

        # File names looked up inside <data_dir>/<task>/<mode>/ by get_examples().
        self.input_text_file = 'seq.in'
        self.intent_label_file = 'label'
        self.slot_labels_file = 'seq.out'

    @classmethod
    def _read_file(cls, input_file, quotechar=None):
        """Return every line of ``input_file`` with surrounding whitespace stripped.

        ``quotechar`` is accepted but not used.
        """
        with open(input_file, "r", encoding="utf-8") as handle:
            return [raw_line.strip() for raw_line in handle]

    def _create_examples(self, texts, intents, slots, set_type):
        """Build one InputExample per aligned (text, intent, slot-sequence) triple.

        Labels are converted to their index in the corresponding label list; a
        label missing from the list maps to the index of the literal "UNK" entry.
        """

        def _label_id(label, vocabulary):
            if label in vocabulary:
                return vocabulary.index(label)
            return vocabulary.index("UNK")

        examples = []
        for i, (text, intent, slot) in enumerate(zip(texts, intents, slots)):
            # split() without arguments also collapses repeated spaces.
            words = text.split()
            intent_label = _label_id(intent, self.intent_labels)
            slot_labels = [_label_id(tag, self.slot_labels) for tag in slot.split()]

            # Every word needs exactly one slot tag.
            assert len(words) == len(slot_labels)
            examples.append(InputExample(guid="%s-%s" % (set_type, i),
                                         words=words,
                                         intent_label=intent_label,
                                         slot_labels=slot_labels))
        # Each entry carries the words, the intent id, the slot ids and a guid such as 'train-0'.
        return examples

    def get_examples(self, mode):
        """
        Args:
            mode: train, dev, test
        """
        data_path = os.path.join(self.args.data_dir, self.args.task, mode)
        logger.info("LOOKING AT {}".format(data_path))

        texts = self._read_file(os.path.join(data_path, self.input_text_file))
        intents = self._read_file(os.path.join(data_path, self.intent_label_file))
        slots = self._read_file(os.path.join(data_path, self.slot_labels_file))
        return self._create_examples(texts=texts, intents=intents, slots=slots, set_type=mode)


# Task name -> processor class. load_and_cache_examples() looks the class up with
# args.task and instantiates it with args; both tasks share the same processor.
processors = {
    "atis": JointProcessor,
    "snips": JointProcessor
}


def convert_examples_to_features(examples, max_seq_len, tokenizer,
                                 pad_token_label_id=-100,
                                 cls_token_segment_id=0,
                                 pad_token_segment_id=0,
                                 sequence_a_segment_id=0,
                                 mask_padding_with_zero=True):
    """Tokenize examples word by word and pad them to ``max_seq_len``.

    Args:
        examples: objects exposing ``words``, ``slot_labels``, ``intent_label`` and ``guid``.
        max_seq_len: final length of every feature sequence, special tokens included.
        tokenizer: provides ``cls_token``, ``sep_token``, ``unk_token``,
            ``pad_token_id``, ``tokenize`` and ``convert_tokens_to_ids``.
        pad_token_label_id: slot label id for non-first sub-tokens, special tokens
            and padding.
        cls_token_segment_id / sequence_a_segment_id / pad_token_segment_id:
            token type ids for the first token, the remaining tokens and padding.
        mask_padding_with_zero: when True real tokens get mask 1 and padding 0;
            when False the two values are swapped.

    Returns:
        list of InputFeatures, one per example.
    """
    # Tokenizer-specific special symbols.
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    real_mask, pad_mask = (1, 0) if mask_padding_with_zero else (0, 1)
    # Two positions are reserved for the leading and trailing special tokens.
    special_tokens_count = 2
    max_word_pieces = max_seq_len - special_tokens_count

    def _joined(values):
        return " ".join([str(x) for x in values])

    features = []
    for ex_index, example in enumerate(examples):
        if ex_index % 5000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

        # Tokenize each word separately so labels stay aligned with words.
        tokens, slot_labels_ids = [], []
        for word, slot_label in zip(example.words, example.slot_labels):
            # A word that yields no sub-tokens is represented by the unknown token.
            word_tokens = tokenizer.tokenize(word) or [unk_token]
            tokens.extend(word_tokens)
            # Real label on the first sub-token, padding label on the others.
            slot_labels_ids.append(int(slot_label))
            slot_labels_ids.extend([pad_token_label_id] * (len(word_tokens) - 1))

        # Truncate so the special tokens still fit (a no-op for short inputs).
        tokens = tokens[:max_word_pieces]
        slot_labels_ids = slot_labels_ids[:max_word_pieces]

        # Wrap with the leading and trailing special tokens.
        tokens = [cls_token] + tokens + [sep_token]
        slot_labels_ids = [pad_token_label_id] + slot_labels_ids + [pad_token_label_id]
        token_type_ids = [cls_token_segment_id] + [sequence_a_segment_id] * (len(tokens) - 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [real_mask] * len(input_ids)

        # Pad every sequence up to max_seq_len.
        padding_length = max_seq_len - len(input_ids)
        input_ids = input_ids + ([pad_token_id] * padding_length)
        attention_mask += [pad_mask] * padding_length
        token_type_ids += [pad_token_segment_id] * padding_length
        slot_labels_ids += [pad_token_label_id] * padding_length

        assert len(input_ids) == max_seq_len, "Error with input length {} vs {}".format(len(input_ids), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with attention mask length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_ids) == max_seq_len, "Error with token type length {} vs {}".format(len(token_type_ids), max_seq_len)
        assert len(slot_labels_ids) == max_seq_len, "Error with slot labels length {} vs {}".format(len(slot_labels_ids), max_seq_len)

        intent_label_id = int(example.intent_label)

        # Log the first few examples for a visual sanity check.
        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % example.guid)
            logger.info("tokens: %s" % _joined(tokens))
            logger.info("input_ids: %s" % _joined(input_ids))
            logger.info("attention_mask: %s" % _joined(attention_mask))
            logger.info("token_type_ids: %s" % _joined(token_type_ids))
            logger.info("intent_label: %s (id = %d)" % (example.intent_label, intent_label_id))
            logger.info("slot_labels: %s" % _joined(slot_labels_ids))

        features.append(InputFeatures(input_ids=input_ids,
                                      attention_mask=attention_mask,
                                      token_type_ids=token_type_ids,
                                      intent_label_id=intent_label_id,
                                      slot_labels_ids=slot_labels_ids))

    return features


def load_and_cache_examples(args, tokenizer, mode):
    """Return a TensorDataset for ``mode``, reusing a cached feature file when present.

    The cache file lives in ``args.data_dir`` and is named after the mode, the task,
    the last non-empty path component of ``args.model_name_or_path`` and
    ``args.max_seq_len``. When it does not exist the raw data is read, converted to
    features and the features are saved to that file.

    Raises:
        Exception: if features must be built and ``mode`` is not train, dev or test.
    """
    processor = processors[args.task](args)

    model_tag = list(filter(None, args.model_name_or_path.split("/"))).pop()
    cache_name = 'cached_{}_{}_{}_{}'.format(mode, args.task, model_tag, args.max_seq_len)
    cached_features_file = os.path.join(args.data_dir, cache_name)

    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        # NOTE(review): torch.load unpickles the file, so only trusted caches
        # should ever be placed in data_dir.
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        if mode not in ("train", "dev", "test"):
            raise Exception("For mode, Only train, dev, test is available")
        examples = processor.get_examples(mode)

        # The cross-entropy ignore index doubles as the padding label id, so only
        # real label ids contribute to the loss later.
        features = convert_examples_to_features(examples, args.max_seq_len, tokenizer,
                                                pad_token_label_id=args.ignore_index)
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    def _long_tensor(field_name):
        # Stack one attribute of every feature into a single long tensor.
        return torch.tensor([getattr(f, field_name) for f in features], dtype=torch.long)

    return TensorDataset(_long_tensor("input_ids"),
                         _long_tensor("attention_mask"),
                         _long_tensor("token_type_ids"),
                         _long_tensor("intent_label_id"),
                         _long_tensor("slot_labels_ids"))


# TRAINER

In [None]:
import os
import logging
from tqdm import tqdm, trange, notebook

import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertConfig, AdamW, get_linear_schedule_with_warmup

# from utils import MODEL_CLASSES, compute_metrics, get_intent_labels, get_slot_labels

logger = logging.getLogger(__name__)


class Trainer(object):
    """Training and evaluation driver for the joint intent + slot model.

    Builds the config and model classes registered under ``args.model_type`` in
    ``MODEL_CLASSES``, moves the model to GPU when available (unless
    ``args.no_cuda``), and exposes ``train``, ``evaluate``, ``save_model`` and
    ``load_model``.
    """

    def __init__(self, args, train_dataset=None, dev_dataset=None, test_dataset=None):
        self.args = args
        self.train_dataset = train_dataset
        self.dev_dataset = dev_dataset
        self.test_dataset = test_dataset

        self.intent_label_lst = get_intent_labels(args)
        self.slot_label_lst = get_slot_labels(args)

        # The cross-entropy ignore index doubles as the padding label id, so only
        # real label ids contribute to the loss and to the metrics.
        self.pad_token_label_id = args.ignore_index

        self.config_class, self.model_class, _ = MODEL_CLASSES[args.model_type]
        self.config = self.config_class.from_pretrained(args.model_name_or_path, finetuning_task=args.task)
        self.model = self.model_class.from_pretrained(args.model_name_or_path,
                                                      config=self.config,
                                                      args=args,
                                                      intent_label_lst=self.intent_label_lst,
                                                      slot_label_lst=self.slot_label_lst)

        # GPU or CPU
        self.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
        self.model.to(self.device)

    def train(self):
        """Run the training loop.

        Returns:
            tuple: ``(global_step, average_loss)`` where ``average_loss`` is the
            accumulated loss divided by the number of optimizer steps, or ``0.0``
            when no optimizer step was taken.
        """
        train_sampler = RandomSampler(self.train_dataset)
        train_dataloader = DataLoader(self.train_dataset, sampler=train_sampler, batch_size=self.args.train_batch_size)

        if self.args.max_steps > 0:
            t_total = self.args.max_steps
            # Epochs needed to reach max_steps given the optimizer steps per epoch.
            self.args.num_train_epochs = self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs

        # Optimizer and schedule (linear warmup and decay). Weight decay is not
        # applied to bias and LayerNorm weights.
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': self.args.weight_decay},
            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=t_total)

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(self.train_dataset))
        logger.info("  Num Epochs = %d", self.args.num_train_epochs)
        logger.info("  Total train batch size = %d", self.args.train_batch_size)
        logger.info("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)
        logger.info("  Logging steps = %d", self.args.logging_steps)
        logger.info("  Save steps = %d", self.args.save_steps)

        global_step = 0     # number of optimizer steps taken so far
        tr_loss = 0.0
        self.model.zero_grad()

        train_iterator = trange(int(self.args.num_train_epochs), desc="Epoch")
        for _ in train_iterator:
            epoch_iterator = notebook.tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(epoch_iterator):
                # evaluate() switches to eval mode, so training mode is re-set every step.
                self.model.train()
                batch = tuple(t.to(self.device) for t in batch)  # GPU or CPU

                # batch[2] (token type ids) is not passed to the model.
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'intent_label_ids': batch[3],
                          'slot_labels_ids': batch[4]}

                # The model returns (loss, (intent_logits, slot_logits)).
                outputs = self.model(**inputs)
                loss = outputs[0]

                if self.args.gradient_accumulation_steps > 1:
                    loss = loss / self.args.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                if (step + 1) % self.args.gradient_accumulation_steps == 0:
                    # Rescale gradients whose total norm exceeds max_grad_norm.
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)

                    optimizer.step()
                    scheduler.step()  # update the learning rate schedule
                    self.model.zero_grad()
                    global_step += 1

                    if self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0:
                        self.evaluate("dev")

                    if self.args.save_steps > 0 and global_step % self.args.save_steps == 0:
                        self.save_model()

                if 0 < self.args.max_steps < global_step:
                    epoch_iterator.close()
                    break

            if 0 < self.args.max_steps < global_step:
                train_iterator.close()
                break

        # Avoid a ZeroDivisionError when no optimizer step ran (e.g. empty dataset).
        average_loss = tr_loss / global_step if global_step > 0 else 0.0
        return global_step, average_loss

    def evaluate(self, mode):
        """Evaluate on the dev or test set and return a dict of loss and metrics.

        Args:
            mode: ``'dev'`` or ``'test'``.

        Raises:
            Exception: for any other ``mode``.
        """
        if mode == 'test':
            dataset = self.test_dataset
        elif mode == 'dev':
            dataset = self.dev_dataset
        else:
            raise Exception("Only dev and test dataset available")

        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=self.args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation on %s dataset *****", mode)
        logger.info("  Num examples = %d", len(dataset))
        logger.info("  Batch size = %d", self.args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0

        # Settings without a ``use_crf`` attribute are treated as "no CRF".
        use_crf = getattr(self.args, 'use_crf', False)

        # Per-batch arrays are collected and concatenated once after the loop,
        # which avoids re-copying the accumulated array on every batch.
        intent_logit_batches = []
        intent_label_batches = []
        slot_pred_batches = []
        slot_label_batches = []

        self.model.eval()

        for batch in notebook.tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(self.device) for t in batch)
            with torch.no_grad():
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'intent_label_ids': batch[3],
                          'slot_labels_ids': batch[4]}

                outputs = self.model(**inputs)
                tmp_eval_loss, (intent_logits, slot_logits) = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1

            # Intent prediction
            intent_logit_batches.append(intent_logits.detach().cpu().numpy())
            intent_label_batches.append(inputs['intent_label_ids'].detach().cpu().numpy())

            # Slot prediction
            if use_crf:
                # decode() in `torchcrf` returns the best label indices directly.
                slot_pred_batches.append(np.array(self.model.crf.decode(slot_logits)))
            else:
                slot_pred_batches.append(slot_logits.detach().cpu().numpy())
            slot_label_batches.append(inputs["slot_labels_ids"].detach().cpu().numpy())

        eval_loss = eval_loss / nb_eval_steps
        results = {
            "loss": eval_loss
        }

        intent_preds = np.concatenate(intent_logit_batches, axis=0)
        out_intent_label_ids = np.concatenate(intent_label_batches, axis=0)
        slot_preds = np.concatenate(slot_pred_batches, axis=0)
        out_slot_labels_ids = np.concatenate(slot_label_batches, axis=0)

        # Intent result
        intent_preds = np.argmax(intent_preds, axis=1)

        # Slot result (CRF output already holds label indices)
        if not use_crf:
            slot_preds = np.argmax(slot_preds, axis=2)
        slot_label_map = {i: label for i, label in enumerate(self.slot_label_lst)}
        out_slot_label_list = [[] for _ in range(out_slot_labels_ids.shape[0])]
        slot_preds_list = [[] for _ in range(out_slot_labels_ids.shape[0])]

        # Keep only positions carrying a real label (not the padding label id).
        for i in range(out_slot_labels_ids.shape[0]):
            for j in range(out_slot_labels_ids.shape[1]):
                if out_slot_labels_ids[i, j] != self.pad_token_label_id:
                    out_slot_label_list[i].append(slot_label_map[out_slot_labels_ids[i][j]])
                    slot_preds_list[i].append(slot_label_map[slot_preds[i][j]])

        total_result = compute_metrics(intent_preds, out_intent_label_ids, slot_preds_list, out_slot_label_list)
        results.update(total_result)

        logger.info("***** Eval results *****")
        for key in sorted(results.keys()):
            logger.info("  %s = %s", key, str(results[key]))

        return results

    def save_model(self):
        """Save the model and the training arguments to ``args.model_dir`` (overwriting)."""
        if not os.path.exists(self.args.model_dir):
            os.makedirs(self.args.model_dir)
        # Unwrap a parallel wrapper, if any, before saving.
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
        model_to_save.save_pretrained(self.args.model_dir)

        # Save training arguments together with the trained model
        torch.save(self.args, os.path.join(self.args.model_dir, 'training_args.bin'))
        logger.info("Saving model checkpoint to %s", self.args.model_dir)

    def load_model(self):
        """Reload the model from ``args.model_dir`` and move it to the device.

        Raises:
            Exception: if the directory does not exist, or if loading fails; in
                the latter case the original error is attached as ``__cause__``.
        """
        if not os.path.exists(self.args.model_dir):
            raise Exception("Model doesn't exists! Train first!")

        try:
            self.model = self.model_class.from_pretrained(self.args.model_dir,
                                                          args=self.args,
                                                          intent_label_lst=self.intent_label_lst,
                                                          slot_label_lst=self.slot_label_lst)
            self.model.to(self.device)
            logger.info("***** Model Loaded *****")
        except Exception as err:
            # Chain the root cause so it is not lost; KeyboardInterrupt and
            # SystemExit are no longer intercepted.
            raise Exception("Some model files might be missing...") from err


# PREDICT

In [None]:
import os
import logging
import argparse
from tqdm.notebook import tqdm, trange

import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from easydict import EasyDict as edict

# from utils import init_logger, load_tokenizer, get_intent_labels, get_slot_labels, MODEL_CLASSES

logger = logging.getLogger(__name__)


def get_device(pred_config):
    """Return ``"cuda"`` when CUDA is available and ``pred_config.no_cuda`` is falsy, else ``"cpu"``."""
    if torch.cuda.is_available() and not pred_config.no_cuda:
        return "cuda"
    return "cpu"


def get_args(pred_config):
    """Load the training arguments stored as ``training_args.bin`` in ``pred_config.model_dir``."""
    args_path = os.path.join(pred_config.model_dir, 'training_args.bin')
    # NOTE(review): torch.load unpickles the file; only load model directories you trust.
    return torch.load(args_path)


def load_model(pred_config, args, device):
    """Load the trained model for prediction, move it to ``device`` and set eval mode.

    Args:
        pred_config: prediction settings; ``model_dir`` must exist.
        args: training arguments; ``model_type`` selects the model class and
            ``model_dir`` is the directory the weights are loaded from.
        device: target device passed to ``model.to``.

    Raises:
        Exception: if ``pred_config.model_dir`` does not exist, or if loading
            fails; in the latter case the original error is attached as ``__cause__``.
    """
    # Check whether model exists
    if not os.path.exists(pred_config.model_dir):
        raise Exception("Model doesn't exists! Train first!")

    try:
        # NOTE(review): existence is checked on pred_config.model_dir but the weights
        # are read from args.model_dir -- confirm both always point to the same place.
        model = MODEL_CLASSES[args.model_type][1].from_pretrained(args.model_dir,
                                                                  args=args,
                                                                  intent_label_lst=get_intent_labels(args),
                                                                  slot_label_lst=get_slot_labels(args))
        model.to(device)
        model.eval()
        logger.info("***** Model Loaded *****")
    except Exception as err:
        # Chain the root cause so it is not lost; KeyboardInterrupt and
        # SystemExit are no longer intercepted.
        raise Exception("Some model files might be missing...") from err

    return model


def read_input_file(pred_config):
    """Read ``pred_config.input_file`` and return one list of words per line.

    Each line is stripped and split on whitespace; an empty line yields ``[]``.
    """
    with open(pred_config.input_file, "r", encoding="utf-8") as handle:
        return [raw_line.strip().split() for raw_line in handle]


def convert_input_file_to_tensor_dataset(lines,
                                         pred_config,
                                         args,
                                         tokenizer,
                                         pad_token_label_id,
                                         cls_token_segment_id=0,
                                         pad_token_segment_id=0,
                                         sequence_a_segment_id=0,
                                         mask_padding_with_zero=True):
    """Encode tokenized sentences into fixed-length tensors for the model.

    Every sentence is split into sub-tokens, truncated so that it fits in
    ``args.max_seq_len`` together with the CLS and SEP tokens, converted to ids
    and padded on the right up to ``args.max_seq_len``.

    Args:
        lines: list of sentences, each a list of word strings.
        pred_config: not used by this function; kept for interface compatibility.
        args: object exposing ``max_seq_len``.
        tokenizer: object exposing ``cls_token``, ``sep_token``, ``unk_token``,
            ``pad_token_id``, ``tokenize(word)`` and ``convert_tokens_to_ids(tokens)``.
        pad_token_label_id: value written into the slot mask at every position
            that must be ignored (special tokens, non-initial sub-tokens, padding).
        cls_token_segment_id: segment id for the CLS position.
        pad_token_segment_id: segment id for padding positions.
        sequence_a_segment_id: segment id for the sentence tokens and SEP.
        mask_padding_with_zero: when true the attention mask is 1 on real tokens
            and 0 on padding; when false the two values are swapped.

    Returns:
        A ``TensorDataset`` of four ``torch.long`` tensors, in this order:
        input ids, attention mask, token type ids, slot label mask. In the slot
        label mask the first sub-token of each word holds
        ``pad_token_label_id + 1``; every other position holds ``pad_token_label_id``.
    """
    # Special symbols come from the tokenizer, so they follow the current model type.
    cls_token, sep_token = tokenizer.cls_token, tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    attended_value = 1 if mask_padding_with_zero else 0
    padded_value = 0 if mask_padding_with_zero else 1

    id_rows, attention_rows, segment_rows, slot_mask_rows = [], [], [], []

    for sentence in lines:
        pieces, piece_flags = [], []
        for word in sentence:
            # A word the tokenizer cannot split (bad encoding) becomes a single unk token.
            sub_pieces = tokenizer.tokenize(word) or [unk_token]
            pieces += sub_pieces
            # Only the first sub-token of a word is flagged as carrying a prediction.
            piece_flags.append(pad_token_label_id + 1)
            piece_flags += [pad_token_label_id] * (len(sub_pieces) - 1)

        # Two positions are reserved for CLS and SEP; slicing is a no-op for
        # sentences that already fit.
        budget = args.max_seq_len - 2
        pieces = [cls_token] + pieces[:budget] + [sep_token]
        piece_flags = [pad_token_label_id] + piece_flags[:budget] + [pad_token_label_id]
        segments = [cls_token_segment_id] + [sequence_a_segment_id] * (len(pieces) - 1)

        input_ids = tokenizer.convert_tokens_to_ids(pieces)
        attention = [attended_value] * len(input_ids)

        # Right-pad every sequence up to max_seq_len.
        shortfall = args.max_seq_len - len(input_ids)
        id_rows.append(input_ids + [pad_token_id] * shortfall)
        attention_rows.append(attention + [padded_value] * shortfall)
        segment_rows.append(segments + [pad_token_segment_id] * shortfall)
        slot_mask_rows.append(piece_flags + [pad_token_label_id] * shortfall)

    # Change to Tensor
    tensors = [torch.tensor(rows, dtype=torch.long)
               for rows in (id_rows, attention_rows, segment_rows, slot_mask_rows)]
    return TensorDataset(*tensors)


def predict(pred_config):
    """Run intent and slot prediction on a text file and write annotated output.

    The training arguments and model are loaded from ``pred_config.model_dir``.
    Each line of ``pred_config.input_file`` is treated as one sentence, and one
    line per sentence is written to ``pred_config.output_file`` in the form
    ``<intent> -> word [word:SLOT] ...``; words predicted as ``'O'`` are written
    bare.

    Args:
        pred_config: object exposing ``input_file``, ``output_file``,
            ``model_dir``, ``batch_size`` and ``no_cuda``.

    Returns:
        None. An empty input file produces an empty output file.
    """
    # load model and args
    args = get_args(pred_config)
    device = get_device(pred_config)
    model = load_model(pred_config, args, device)
    logger.info(args)

    intent_label_lst = get_intent_labels(args)
    slot_label_lst = get_slot_labels(args)

    pad_token_label_id = args.ignore_index
    tokenizer = load_tokenizer(args)
    lines = read_input_file(pred_config)

    if not lines:
        # With no sentences there are no batches, and the aggregation below
        # would fail on empty accumulators. Emit an empty output file instead.
        with open(pred_config.output_file, "w", encoding="utf-8"):
            pass
        logger.info("Prediction Done!")
        return

    # Convert input file to TensorDataset
    dataset = convert_input_file_to_tensor_dataset(lines, pred_config, args, tokenizer, pad_token_label_id)

    # Predict
    sampler = SequentialSampler(dataset)
    data_loader = DataLoader(dataset, sampler=sampler, batch_size=pred_config.batch_size)

    # Per-batch results are collected and concatenated once after the loop,
    # instead of re-copying the accumulated arrays on every batch.
    intent_chunks = []
    slot_chunks = []
    slot_mask_chunks = []

    for batch in tqdm(data_loader, desc="Predicting"):
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0],
                      "attention_mask": batch[1],
                      "intent_label_ids": None,
                      "slot_labels_ids": None}

            outputs = model(**inputs)
            # The second output element is the (intent_logits, slot_logits) pair.
            _, (intent_logits, slot_logits) = outputs[:2]

            # Intent Prediction
            intent_chunks.append(intent_logits.detach().cpu().numpy())

            # Slot prediction
            if args.use_crf:
                # decode() in `torchcrf` returns list with best index directly
                slot_chunks.append(np.array(model.crf.decode(slot_logits)))
            else:
                slot_chunks.append(slot_logits.detach().cpu().numpy())
            slot_mask_chunks.append(batch[3].detach().cpu().numpy())

    intent_preds = np.argmax(np.concatenate(intent_chunks, axis=0), axis=1)

    slot_preds = np.concatenate(slot_chunks, axis=0)
    if not args.use_crf:
        # Without CRF the chunks hold raw logits; reduce over the label axis.
        slot_preds = np.argmax(slot_preds, axis=2)
    all_slot_label_mask = np.concatenate(slot_mask_chunks, axis=0)

    slot_label_map = {i: label for i, label in enumerate(slot_label_lst)}

    # Keep only positions flagged as the first sub-token of a word.
    slot_preds_list = [
        [slot_label_map[label_id]
         for label_id, flag in zip(pred_row, mask_row)
         if flag != pad_token_label_id]
        for pred_row, mask_row in zip(slot_preds, all_slot_label_mask)
    ]

    # Write to output file
    with open(pred_config.output_file, "w", encoding="utf-8") as f:
        for words, sentence_slots, intent_pred in zip(lines, slot_preds_list, intent_preds):
            # zip stops at the shorter sequence, so words cut off by truncation
            # are not written.
            rendered = [word if slot == 'O' else "[{}:{}]".format(word, slot)
                        for word, slot in zip(words, sentence_slots)]
            f.write("<{}> -> {}\n".format(intent_label_lst[intent_pred], " ".join(rendered)))

    logger.info("Prediction Done!")


if __name__ == "__main__":
    init_logger()

    # Notebook stand-in for the script's command-line flags (--input_file,
    # --output_file, --model_dir, --batch_size, --no_cuda): the same settings
    # are passed directly as an EasyDict instead of being parsed by argparse.
    predict(edict({
        'input_file': 'sample_pred_in.txt',      # input file for prediction
        'output_file': 'sample_pred_out.txt',    # output file for prediction
        'model_dir': '/content/drive/MyDrive/nlp projects/atis_data_vn',  # path to save, load model
        'batch_size': 32,                        # batch size for prediction
        'no_cuda': False,                        # set True to avoid CUDA when available
    }))


11/19/2020 15:42:35 - INFO - __main__ -   ***** Model Loaded *****
11/19/2020 15:42:35 - INFO - __main__ -   {'task': 'atis', 'model_dir': '/content/drive/MyDrive/nlp projects/atis_data_vn', 'data_dir': '/content/drive/MyDrive/nlp projects/atis_data_vn', 'intent_label_file': 'intent_label.txt', 'slot_label_file': 'slot_label.txt', 'model_type': 'xlmroberta', 'seed': 1234, 'train_batch_size': 32, 'eval_batch_size': 64, 'max_seq_len': 50, 'learning_rate': 5e-05, 'num_train_epochs': 53, 'weight_decay': 0.0, 'gradient_accumulation_steps': 4, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'max_steps': 1000, 'warmup_steps': 0, 'dropout_rate': 0.1, 'logging_steps': 100, 'save_steps': 500, 'do_train': True, 'do_eval': False, 'no_cuda': False, 'ignore_index': 0, 'slot_loss_coef': 1.0, 'use_crf': False, 'slot_pad_label': 'PAD', 'model_name_or_path': 'xlm-roberta-base'}


HBox(children=(FloatProgress(value=0.0, description='Predicting', max=1.0, style=ProgressStyle(description_wid…

11/19/2020 15:42:36 - INFO - __main__ -   Prediction Done!





# MAIN

In [None]:
#import argparse
from easydict import EasyDict as edict
# from trainer import Trainer
# from utils import init_logger, load_tokenizer, read_prediction_text, set_seed, MODEL_CLASSES, MODEL_PATH_MAP
# from data_loader import load_and_cache_examples


def main(args):
    """Entry point for training and/or evaluating the joint model.

    Sets up logging and seeding, builds the tokenizer, loads the train/dev/test
    datasets and hands them to a ``Trainer``. Training runs when
    ``args.do_train`` is truthy; when ``args.do_eval`` is truthy the saved model
    is loaded and evaluated on the test split.

    Args:
        args: configuration object; this function reads ``do_train`` and
            ``do_eval`` and passes the whole object on to the helpers it calls.
    """
    init_logger()
    set_seed(args)

    # Instantiate the tokenizer once and share it across all three splits.
    tokenizer = load_tokenizer(args)

    split_datasets = [load_and_cache_examples(args, tokenizer, mode=split)
                      for split in ("train", "dev", "test")]
    trainer = Trainer(args, *split_datasets)

    if args.do_train:
        trainer.train()

    if args.do_eval:
        trainer.load_model()
        trainer.evaluate("test")


if __name__ == '__main__':
    # Notebook stand-in for the original argparse command line: every flag of
    # the script is supplied here as an EasyDict key. ``model_name_or_path`` was
    # originally derived as MODEL_PATH_MAP[args.model_type]; it is set
    # explicitly below.
    main(edict({
        'task': 'atis',  # name of the task to train
        'model_dir': '/content/drive/MyDrive/nlp projects/atis_data_vn',  # path to save, load model
        'data_dir': '/content/drive/MyDrive/nlp projects/atis_data_vn',  # input data dir
        'intent_label_file': 'intent_label.txt',  # intent label file
        'slot_label_file': 'slot_label.txt',  # slot label file
        'model_type': 'xlmroberta',  # one of the MODEL_CLASSES keys
        'seed': 1234,  # random seed for initialization
        'train_batch_size': 32,  # batch size for training
        'eval_batch_size': 64,  # batch size for evaluation
        'max_seq_len': 50,  # maximum total input sequence length after tokenization
        'learning_rate': 5e-5,  # initial learning rate for Adam
        'num_train_epochs': 50,  # total number of training epochs to perform
        'weight_decay': 0.0,  # weight decay, if any
        'gradient_accumulation_steps': 4,  # update steps to accumulate before a backward/update pass
        'adam_epsilon': 1e-8,  # epsilon for the Adam optimizer
        'max_grad_norm': 1.0,  # max gradient norm
        'max_steps': 1000,  # if > 0: total number of training steps; overrides num_train_epochs
        'warmup_steps': 0,  # linear warmup over warmup_steps
        'dropout_rate': 0.1,  # dropout for fully-connected layers
        'logging_steps': 100,  # log every X update steps
        'save_steps': 500,  # save a checkpoint every X update steps
        'do_train': True,  # whether to run training
        'do_eval': False,  # whether to run eval on the test set
        'no_cuda': False,  # set True to avoid CUDA when available
        'ignore_index': 0,  # target value that is ignored and does not contribute to the input gradient
        'slot_loss_coef': 1.0,  # coefficient for the slot loss
        'use_crf': False,  # whether to use CRF
        'slot_pad_label': 'PAD',  # pad token for slot labels (ignored when calculating loss)
        'model_name_or_path': 'xlm-roberta-base',
    }))


11/19/2020 14:39:27 - INFO - __main__ -   Loading features from cached file /content/drive/MyDrive/nlp projects/atis_data_vn/cached_train_atis_xlm-roberta-base_50
11/19/2020 14:39:27 - INFO - __main__ -   Loading features from cached file /content/drive/MyDrive/nlp projects/atis_data_vn/cached_dev_atis_xlm-roberta-base_50
11/19/2020 14:39:27 - INFO - __main__ -   Loading features from cached file /content/drive/MyDrive/nlp projects/atis_data_vn/cached_test_atis_xlm-roberta-base_50
Some weights of JointXLMRoberta were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['roberta.XLMRoberta.embeddings.word_embeddings.weight', 'roberta.XLMRoberta.embeddings.position_embeddings.weight', 'roberta.XLMRoberta.embeddings.token_type_embeddings.weight', 'roberta.XLMRoberta.embeddings.LayerNorm.weight', 'roberta.XLMRoberta.embeddings.LayerNorm.bias', 'roberta.XLMRoberta.encoder.layer.0.attention.self.query.weight', 'roberta.XLMRoberta.encoder.layer.0.attention

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:   2%|▏         | 1/53 [00:22<19:42, 22.74s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:   4%|▍         | 2/53 [00:45<19:27, 22.89s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:   6%|▌         | 3/53 [01:08<19:02, 22.85s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:   8%|▊         | 4/53 [01:31<18:36, 22.78s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:   9%|▉         | 5/53 [01:54<18:14, 22.80s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…

11/19/2020 14:41:39 - INFO - __main__ -   ***** Running evaluation on dev dataset *****
11/19/2020 14:41:39 - INFO - __main__ -     Num examples = 300
11/19/2020 14:41:39 - INFO - __main__ -     Batch size = 64


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=5.0, style=ProgressStyle(description_wid…

11/19/2020 14:41:40 - INFO - __main__ -   ***** Eval results *****
11/19/2020 14:41:40 - INFO - __main__ -     intent_acc = 0.9233333333333333
11/19/2020 14:41:40 - INFO - __main__ -     loss = 0.8075066804885864
11/19/2020 14:41:40 - INFO - __main__ -     sementic_frame_acc = 0.25
11/19/2020 14:41:40 - INFO - __main__ -     slot_f1 = 0.57995337995338
11/19/2020 14:41:40 - INFO - __main__ -     slot_precision = 0.5460930640913082
11/19/2020 14:41:40 - INFO - __main__ -     slot_recall = 0.6182902584493042







Epoch:  11%|█▏        | 6/53 [02:18<18:05, 23.11s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  13%|█▎        | 7/53 [02:40<17:37, 22.99s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  15%|█▌        | 8/53 [03:03<17:11, 22.91s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  17%|█▋        | 9/53 [03:26<16:46, 22.87s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  19%|█▉        | 10/53 [03:48<16:21, 22.82s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…

11/19/2020 14:43:40 - INFO - __main__ -   ***** Running evaluation on dev dataset *****
11/19/2020 14:43:40 - INFO - __main__ -     Num examples = 300
11/19/2020 14:43:40 - INFO - __main__ -     Batch size = 64


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=5.0, style=ProgressStyle(description_wid…

11/19/2020 14:43:41 - INFO - __main__ -   ***** Eval results *****
11/19/2020 14:43:41 - INFO - __main__ -     intent_acc = 0.9366666666666666
11/19/2020 14:43:41 - INFO - __main__ -     loss = 0.6722795486450195
11/19/2020 14:43:41 - INFO - __main__ -     sementic_frame_acc = 0.3233333333333333
11/19/2020 14:43:41 - INFO - __main__ -     slot_f1 = 0.6346691519105311
11/19/2020 14:43:41 - INFO - __main__ -     slot_precision = 0.5973684210526315
11/19/2020 14:43:41 - INFO - __main__ -     slot_recall = 0.6769383697813122







Epoch:  21%|██        | 11/53 [04:12<16:09, 23.08s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  23%|██▎       | 12/53 [04:35<15:41, 22.97s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  25%|██▍       | 13/53 [04:58<15:16, 22.91s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  26%|██▋       | 14/53 [05:20<14:51, 22.87s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  28%|██▊       | 15/53 [05:43<14:27, 22.83s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…

11/19/2020 14:45:40 - INFO - __main__ -   ***** Running evaluation on dev dataset *****
11/19/2020 14:45:40 - INFO - __main__ -     Num examples = 300
11/19/2020 14:45:40 - INFO - __main__ -     Batch size = 64


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=5.0, style=ProgressStyle(description_wid…

11/19/2020 14:45:41 - INFO - __main__ -   ***** Eval results *****
11/19/2020 14:45:41 - INFO - __main__ -     intent_acc = 0.9466666666666667
11/19/2020 14:45:41 - INFO - __main__ -     loss = 0.5739607810974121
11/19/2020 14:45:41 - INFO - __main__ -     sementic_frame_acc = 0.44
11/19/2020 14:45:41 - INFO - __main__ -     slot_f1 = 0.7259050305594735
11/19/2020 14:45:41 - INFO - __main__ -     slot_precision = 0.6886708296164139
11/19/2020 14:45:41 - INFO - __main__ -     slot_recall = 0.7673956262425448







Epoch:  30%|███       | 16/53 [06:07<14:14, 23.10s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  32%|███▏      | 17/53 [06:30<13:48, 23.00s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  34%|███▍      | 18/53 [06:52<13:22, 22.93s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  36%|███▌      | 19/53 [07:15<12:57, 22.87s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  38%|███▊      | 20/53 [07:38<12:32, 22.82s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  40%|███▉      | 21/53 [08:01<12:09, 22.80s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…

11/19/2020 14:47:41 - INFO - __main__ -   ***** Running evaluation on dev dataset *****
11/19/2020 14:47:41 - INFO - __main__ -     Num examples = 300
11/19/2020 14:47:41 - INFO - __main__ -     Batch size = 64


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=5.0, style=ProgressStyle(description_wid…

11/19/2020 14:47:42 - INFO - __main__ -   ***** Eval results *****
11/19/2020 14:47:42 - INFO - __main__ -     intent_acc = 0.95
11/19/2020 14:47:42 - INFO - __main__ -     loss = 0.5563386440277099
11/19/2020 14:47:42 - INFO - __main__ -     sementic_frame_acc = 0.49666666666666665
11/19/2020 14:47:42 - INFO - __main__ -     slot_f1 = 0.7690114068441064
11/19/2020 14:47:42 - INFO - __main__ -     slot_precision = 0.7367941712204007
11/19/2020 14:47:42 - INFO - __main__ -     slot_recall = 0.8041749502982107







Epoch:  42%|████▏     | 22/53 [08:24<11:56, 23.10s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  43%|████▎     | 23/53 [08:47<11:30, 23.01s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  45%|████▌     | 24/53 [09:10<11:06, 23.00s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  47%|████▋     | 25/53 [09:33<10:41, 22.92s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  49%|████▉     | 26/53 [09:56<10:18, 22.90s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…

11/19/2020 14:49:42 - INFO - __main__ -   ***** Running evaluation on dev dataset *****
11/19/2020 14:49:42 - INFO - __main__ -     Num examples = 300
11/19/2020 14:49:42 - INFO - __main__ -     Batch size = 64


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=5.0, style=ProgressStyle(description_wid…

11/19/2020 14:49:43 - INFO - __main__ -   ***** Eval results *****
11/19/2020 14:49:43 - INFO - __main__ -     intent_acc = 0.95
11/19/2020 14:49:43 - INFO - __main__ -     loss = 0.5614642143249512
11/19/2020 14:49:43 - INFO - __main__ -     sementic_frame_acc = 0.5233333333333333
11/19/2020 14:49:43 - INFO - __main__ -     slot_f1 = 0.7873865265169612
11/19/2020 14:49:43 - INFO - __main__ -     slot_precision = 0.7580496780128795
11/19/2020 14:49:43 - INFO - __main__ -     slot_recall = 0.8190854870775348





11/19/2020 14:50:13 - INFO - __main__ -   Saving model checkpoint to /content/drive/MyDrive/nlp projects/atis_data_vn


Epoch:  51%|█████     | 27/53 [10:50<13:56, 32.16s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  53%|█████▎    | 28/53 [11:13<12:19, 29.59s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  55%|█████▍    | 29/53 [11:36<11:04, 27.69s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  57%|█████▋    | 30/53 [11:59<10:02, 26.18s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  58%|█████▊    | 31/53 [12:22<09:13, 25.17s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…

11/19/2020 14:52:14 - INFO - __main__ -   ***** Running evaluation on dev dataset *****
11/19/2020 14:52:14 - INFO - __main__ -     Num examples = 300
11/19/2020 14:52:14 - INFO - __main__ -     Batch size = 64


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=5.0, style=ProgressStyle(description_wid…

11/19/2020 14:52:15 - INFO - __main__ -   ***** Eval results *****
11/19/2020 14:52:15 - INFO - __main__ -     intent_acc = 0.9533333333333334
11/19/2020 14:52:15 - INFO - __main__ -     loss = 0.5854770183563233
11/19/2020 14:52:15 - INFO - __main__ -     sementic_frame_acc = 0.52
11/19/2020 14:52:15 - INFO - __main__ -     slot_f1 = 0.777830412126954
11/19/2020 14:52:15 - INFO - __main__ -     slot_precision = 0.7429864253393665
11/19/2020 14:52:15 - INFO - __main__ -     slot_recall = 0.81610337972167







Epoch:  60%|██████    | 32/53 [12:46<08:40, 24.77s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  62%|██████▏   | 33/53 [13:08<08:03, 24.17s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  64%|██████▍   | 34/53 [13:31<07:30, 23.73s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  66%|██████▌   | 35/53 [13:54<07:01, 23.44s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  68%|██████▊   | 36/53 [14:17<06:35, 23.24s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…

11/19/2020 14:54:15 - INFO - __main__ -   ***** Running evaluation on dev dataset *****
11/19/2020 14:54:15 - INFO - __main__ -     Num examples = 300
11/19/2020 14:54:15 - INFO - __main__ -     Batch size = 64


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=5.0, style=ProgressStyle(description_wid…

11/19/2020 14:54:16 - INFO - __main__ -   ***** Eval results *****
11/19/2020 14:54:16 - INFO - __main__ -     intent_acc = 0.95
11/19/2020 14:54:16 - INFO - __main__ -     loss = 0.5992169082164764
11/19/2020 14:54:16 - INFO - __main__ -     sementic_frame_acc = 0.5466666666666666
11/19/2020 14:54:16 - INFO - __main__ -     slot_f1 = 0.7994241842610365
11/19/2020 14:54:16 - INFO - __main__ -     slot_precision = 0.7727272727272727
11/19/2020 14:54:16 - INFO - __main__ -     slot_recall = 0.8280318091451292







Epoch:  70%|██████▉   | 37/53 [14:40<06:14, 23.38s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  72%|███████▏  | 38/53 [15:03<05:47, 23.18s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  74%|███████▎  | 39/53 [15:26<05:22, 23.03s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  75%|███████▌  | 40/53 [15:48<04:57, 22.90s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  77%|███████▋  | 41/53 [16:11<04:33, 22.83s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  79%|███████▉  | 42/53 [16:34<04:10, 22.77s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…

11/19/2020 14:56:15 - INFO - __main__ -   ***** Running evaluation on dev dataset *****
11/19/2020 14:56:15 - INFO - __main__ -     Num examples = 300
11/19/2020 14:56:15 - INFO - __main__ -     Batch size = 64


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=5.0, style=ProgressStyle(description_wid…

11/19/2020 14:56:16 - INFO - __main__ -   ***** Eval results *****
11/19/2020 14:56:16 - INFO - __main__ -     intent_acc = 0.9533333333333334
11/19/2020 14:56:16 - INFO - __main__ -     loss = 0.6095861196517944
11/19/2020 14:56:16 - INFO - __main__ -     sementic_frame_acc = 0.54
11/19/2020 14:56:16 - INFO - __main__ -     slot_f1 = 0.7885985748218527
11/19/2020 14:56:16 - INFO - __main__ -     slot_precision = 0.7552320291173794
11/19/2020 14:56:16 - INFO - __main__ -     slot_recall = 0.8250497017892644







Epoch:  81%|████████  | 43/53 [16:57<03:50, 23.02s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  83%|████████▎ | 44/53 [17:20<03:26, 22.92s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  85%|████████▍ | 45/53 [17:43<03:02, 22.83s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  87%|████████▋ | 46/53 [18:05<02:39, 22.76s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  89%|████████▊ | 47/53 [18:28<02:16, 22.73s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…

11/19/2020 14:58:15 - INFO - __main__ -   ***** Running evaluation on dev dataset *****
11/19/2020 14:58:15 - INFO - __main__ -     Num examples = 300
11/19/2020 14:58:15 - INFO - __main__ -     Batch size = 64


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=5.0, style=ProgressStyle(description_wid…

11/19/2020 14:58:16 - INFO - __main__ -   ***** Eval results *****
11/19/2020 14:58:16 - INFO - __main__ -     intent_acc = 0.9566666666666667
11/19/2020 14:58:16 - INFO - __main__ -     loss = 0.6278635263442993
11/19/2020 14:58:16 - INFO - __main__ -     sementic_frame_acc = 0.5366666666666666
11/19/2020 14:58:16 - INFO - __main__ -     slot_f1 = 0.7900999524036172
11/19/2020 14:58:16 - INFO - __main__ -     slot_precision = 0.7579908675799086
11/19/2020 14:58:16 - INFO - __main__ -     slot_recall = 0.8250497017892644







Epoch:  91%|█████████ | 48/53 [18:51<01:54, 22.99s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  92%|█████████▏| 49/53 [19:14<01:31, 22.89s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  94%|█████████▍| 50/53 [19:37<01:08, 22.84s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  96%|█████████▌| 51/53 [20:00<00:45, 22.78s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…



Epoch:  98%|█████████▊| 52/53 [20:22<00:22, 22.76s/it][A[A




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=76.0, style=ProgressStyle(description_wid…

11/19/2020 15:00:16 - INFO - __main__ -   ***** Running evaluation on dev dataset *****
11/19/2020 15:00:16 - INFO - __main__ -     Num examples = 300
11/19/2020 15:00:16 - INFO - __main__ -     Batch size = 64


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=5.0, style=ProgressStyle(description_wid…

11/19/2020 15:00:17 - INFO - __main__ -   ***** Eval results *****
11/19/2020 15:00:17 - INFO - __main__ -     intent_acc = 0.9533333333333334
11/19/2020 15:00:17 - INFO - __main__ -     loss = 0.6288009524345398
11/19/2020 15:00:17 - INFO - __main__ -     sementic_frame_acc = 0.5366666666666666
11/19/2020 15:00:17 - INFO - __main__ -     slot_f1 = 0.786527514231499
11/19/2020 15:00:17 - INFO - __main__ -     slot_precision = 0.7522686025408348
11/19/2020 15:00:17 - INFO - __main__ -     slot_recall = 0.8240556660039762





11/19/2020 15:00:45 - INFO - __main__ -   Saving model checkpoint to /content/drive/MyDrive/nlp projects/atis_data_vn
Epoch:  98%|█████████▊| 52/53 [21:08<00:24, 24.38s/it]







In [None]:
# Scratch cell: load the `roberta-base` tokenizer so its output can be compared
# with the XLM-R tokenizer loaded in a later cell.
import torch
from transformers import RobertaTokenizer, XLMRobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')



In [None]:
# Sample sentence reused by the tokenizer comparison cells.
s = 'I want to fly from Hanoi to HCM'
# Display what calling the roberta-base tokenizer returns for it
# (presumably the encoded ids plus attention mask -- confirm in the transformers docs).
tokenizer(s)

In [None]:
# Load the tokenizer for `xlm-roberta-base`, the same checkpoint name the
# training config passes as `model_name_or_path`.
xlm_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

In [None]:
# Show how the XLM-R tokenizer splits the sample sentence.
xlm_tokenizer.tokenize(s)

In [None]:
# Show how the roberta-base tokenizer splits the same sentence, for comparison.
tokenizer.tokenize(s)

In [None]:
# Display what calling the XLM-R tokenizer returns for the sample sentence.
xlm_tokenizer(s)

In [None]:
# Vocabulary size reported by the roberta-base tokenizer.
tokenizer.vocab_size

In [None]:
# Vocabulary size reported by the XLM-R tokenizer, for comparison.
xlm_tokenizer.vocab_size