In [1]:
import pandas as pd
import pyarrow as pa
import datasets as ds
from datasets import load_metric
import numpy as np
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer


In [3]:
# Order CUDA devices by PCI bus id and expose only the listed GPUs to this
# process; change the device list as appropriate for the available GPU system.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "2,3,4,5,6,7",
})

In [None]:
# Root directory holding the MASSIVE splits saved to disk. Set the
# MASSIVE_DATASET_PATH environment variable to point elsewhere; the original
# location remains the default so existing setups keep working.
dataset_path = os.environ.get('MASSIVE_DATASET_PATH', '/scratch/hle/git/MASSIVE')

In [4]:
# Load the three saved splits (dev, train, test) from their sub-directories
# under dataset_path, in that order.
val_dataset, train_dataset, test_dataset = (
    ds.load_from_disk(os.path.join(dataset_path, split_dir))
    for split_dir in ('.dev', '.train', '.test')
)

In [5]:
# Distinct domain names of the validation split, in first-seen order.
# Reading just the 'domain' column avoids building a dict for every row, and
# dict.fromkeys de-duplicates without re-scanning a list for each example.
val_domains = list(dict.fromkeys(val_dataset['domain']))

In [8]:
def label_mapping(domains):
    """Build index<->name lookup tables for a collection of domain names.

    Args:
        domains: Iterable of domain name strings. Duplicates are ignored and
            the input itself is never modified.

    Returns:
        A ``(key_to_val, val_to_key)`` tuple: ``key_to_val`` maps each integer
        index to a domain name and ``val_to_key`` is its exact inverse.
        Indices are assigned in sorted order of the unique names.
    """
    # Work on a sorted, de-duplicated copy: the caller's list stays untouched
    # and the two dictionaries are guaranteed to be inverses of each other.
    ordered = sorted(set(domains))
    key_to_val = dict(enumerate(ordered))
    val_to_key = {name: index for index, name in key_to_val.items()}
    return key_to_val, val_to_key

In [9]:
# Build the index<->domain lookup tables from the domains seen in the validation split
key_to_val, val_to_key = label_mapping(val_domains)

In [10]:
# Display the index -> domain name table
key_to_val

{0: 'alarm',
 1: 'audio',
 2: 'calendar',
 3: 'cooking',
 4: 'datetime',
 5: 'email',
 6: 'general',
 7: 'iot',
 8: 'lists',
 9: 'music',
 10: 'news',
 11: 'play',
 12: 'qa',
 13: 'recommendation',
 14: 'social',
 15: 'takeaway',
 16: 'transport',
 17: 'weather'}

- (1) MASSIVE has ~842k utterances annotated with domain, intent, and slots (across multiple locales).
    - On a portion of the data, use mt-5 tokenizer (trained on multiple languages) to get word vectors,
    - Run FAISS search on the word vectors to find top-k (top 5) most similar sentences.
- (2) Train a paraphrase detection model with labeled data: the portion of the data that is not used for the FAISS search above, since that step imitates unlabeled live traffic. Paraphrase detection can be defined as a binary classification task: two sentences are paraphrases if they have the same domain, intent, and locale and share some important part of their slots.
    - Another approach is to use an off-the-shelf paraphrase detector such as RoBERTa or an internal Amz model. (If using the Amz model, we need to filter the pairs by locale, as MASSIVE combines all locales into a single dataset.)

- (3) use paraphrase detector from step (2) to filter the pairs in step (1).

- (4) from the positive pairs, train an mt5 model for sequence-to-sequence task of generating paraphrases.





- Train a target domain classification model

In [27]:
# load pretrained model available here https://huggingface.co/bert-base-multilingual-cased
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Size the classification head from the label mapping rather than a hard-coded
# 18, and record the id<->domain names in the model config so predictions and
# saved checkpoints carry readable labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=len(key_to_val),
    id2label=key_to_val,
    label2id=val_to_key,
)

loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /home/grad3/hle/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_t

In [28]:
# Accuracy metric used to score predictions at evaluation time
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class for
    each example is the argmax over the last axis of ``logits``; the return
    value is whatever ``metric.compute`` yields for those predictions against
    ``labels``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    return metric.compute(references=labels, predictions=predicted_ids)

def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over the ``annot_utt`` field of
    ``examples`` with ``padding="max_length"`` and truncation enabled, and
    return the tokenizer's output."""
    utterances = examples["annot_utt"]
    return tokenizer(utterances, truncation=True, padding="max_length")


In [31]:
# Tokenize the validation split in batches via tokenize_function
tokenized_datasets = val_dataset.map(tokenize_function, batched=True)


Loading cached processed dataset at /scratch/hle/git/MASSIVE/.dev/cache-5315223df6cd7bdf.arrow


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [33]:
# adding labels column to dataset
# Read only the 'domain' column instead of iterating whole rows, which would
# decode every padded token sequence just to look up one string per example.
val_labels = [val_to_key[domain] for domain in tokenized_datasets['domain']]
# Guard keeps this cell re-runnable: without it a second run would try to add
# a duplicate 'labels' column to the already-labelled dataset.
if 'labels' not in tokenized_datasets.column_names:
    tokenized_datasets = tokenized_datasets.add_column('labels', val_labels)

In [36]:
# split train test, using the val dataset as training dataset
# A fixed seed makes the split, and therefore the reported metrics, reproducible
# across kernel restarts.
train_val_split = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

In [37]:
# Inspect the training portion of the split (features and row count)
train_val_split['train']

Dataset({
    features: ['id', 'locale', 'domain', 'intent_str', 'annot_utt', 'utt', 'slots_str', 'slots_num', 'intent_num', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 93314
})

In [38]:
# Checkpoints are written under "test_trainer"; evaluation runs once per epoch.
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
)

# Train on the 'train' part of the split and evaluate on its 'test' part,
# scoring with compute_metrics.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_val_split['train'],
    eval_dataset=train_val_split['test'],
)


In [39]:
# Fine-tune the classifier using the trainer built from training_args
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: slots_str, intent_str, id, locale, slots_num, utt, domain, intent_num, annot_utt. If slots_str, intent_str, id, locale, slots_num, utt, domain, intent_num, annot_utt are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 93314
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 1
  Total optimization steps = 5835


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4256,0.298492,0.908959
2,0.2102,0.23129,0.930755
3,0.1096,0.217515,0.940689


Saving model checkpoint to test_trainer/checkpoint-500
Configuration saved in test_trainer/checkpoint-500/config.json
Model weights saved in test_trainer/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1000
Configuration saved in test_trainer/checkpoint-1000/config.json
Model weights saved in test_trainer/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to test_trainer/checkpoint-1500
Configuration saved in test_trainer/checkpoint-1500/config.json
Model weights saved in test_trainer/checkpoint-1500/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: slots_str, intent_str, id, locale, slots_num, utt, domain, intent_num, annot_utt. If slots_str, intent_str, id, locale, slots_num, utt, domain, intent_num, annot_utt are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluati

TrainOutput(global_step=5835, training_loss=0.30824625298010555, metrics={'train_runtime': 3849.8453, 'train_samples_per_second': 72.715, 'train_steps_per_second': 1.516, 'total_flos': 7.366641627595162e+16, 'train_loss': 0.30824625298010555, 'epoch': 3.0})

In [40]:
# Evaluate on test dataset
test_tokenized_datasets = test_dataset.map(tokenize_function, batched=True)
# Look labels up from the 'domain' column only; iterating full rows would
# decode every padded token sequence just to read one string per example.
test_labels = [val_to_key[domain] for domain in test_tokenized_datasets['domain']]
test_tokenized_datasets = test_tokenized_datasets.add_column('labels', test_labels)

  0%|          | 0/152 [00:00<?, ?ba/s]

In [41]:
# Score the fine-tuned model on the labelled test split
trainer.evaluate(test_tokenized_datasets)

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: slots_str, intent_str, id, locale, slots_num, utt, domain, intent_num, annot_utt. If slots_str, intent_str, id, locale, slots_num, utt, domain, intent_num, annot_utt are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 151674
  Batch size = 8


{'eval_loss': 0.7761733531951904,
 'eval_accuracy': 0.851431359362844,
 'eval_runtime': 777.2571,
 'eval_samples_per_second': 195.14,
 'eval_steps_per_second': 4.066,
 'epoch': 3.0}

In [43]:
# Score the model on the 'test' part of the validation split (the same set used as eval_dataset)
trainer.evaluate(train_val_split['test'])

The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: slots_str, intent_str, id, locale, slots_num, utt, domain, intent_num, annot_utt. If slots_str, intent_str, id, locale, slots_num, utt, domain, intent_num, annot_utt are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10369
  Batch size = 8


{'eval_loss': 0.2175145000219345,
 'eval_accuracy': 0.9406885909923811,
 'eval_runtime': 53.7744,
 'eval_samples_per_second': 192.824,
 'eval_steps_per_second': 4.035,
 'epoch': 3.0}