# Borrowed from https://github.com/jobergum/browser-ml-inference/blob/main/TrainGoEmotions.ipynb

We use the small distilled BERT model from Microsoft as our pre-trained model, which we fine-tune on the emotion classification task.
See https://huggingface.co/microsoft/xtremedistil-l6-h384-uncased for details. The dataset adjustment was inspired by https://colab.research.google.com/drive/1aue7x525rKy6yYLqqt-5Ll96qjQvpqS7#scrollTo=Dcw8-k4lO5Yk

In [12]:
from transformers import AutoTokenizer

# Name of the pre-trained checkpoint that is fine-tuned below; the tokenizer
# is loaded from the same checkpoint name.
model_name = 'microsoft/xtremedistil-l6-h384-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [13]:
from datasets import load_dataset

# Load the "raw" configuration of go_emotions; the cells below transform the
# result step by step.
ds = load_dataset(path="go_emotions", name="raw")

Found cached dataset go_emotions (/home/khoa/.cache/huggingface/datasets/go_emotions/raw/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d)


  0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
# Label vocabulary. The position of a name in this list is its index in the
# label vector built from it, so the order must not be changed.
emotions = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]

In [15]:
def _collect_labels(example):
    """Gather the per-emotion columns of one example into a single "labels" list."""
    # Values are read in the order of `emotions`, so index i belongs to emotions[i].
    return {"labels": [example[name] for name in emotions]}


ds = ds.map(_collect_labels)

Loading cached processed dataset at /home/khoa/.cache/huggingface/datasets/go_emotions/raw/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d/cache-65620cbef30e65ac.arrow


In [16]:

def tokenize_function(examples):
    """Tokenize the "text" entry of `examples` with the notebook's tokenizer.

    The tokenizer is asked to pad to the maximum length and to truncate, with
    the maximum length set to 64.

    Args:
        examples: mapping with a "text" entry (a batch of rows when called via
            ``map(..., batched=True)``).

    Returns:
        The tokenizer's output for those texts.
    """
    encoding_options = dict(padding="max_length", truncation=True, max_length=64)
    return tokenizer(examples["text"], **encoding_options)

# Every original column except "labels" is dropped once the text is tokenized.
cols = ds["train"].column_names
cols.remove("labels")

ds_enc = ds.map(
    tokenize_function,
    batched=True,
    remove_columns=cols,
)
ds_enc

Loading cached processed dataset at /home/khoa/.cache/huggingface/datasets/go_emotions/raw/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d/cache-0a8fa908934cf47c.arrow


DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 211225
    })
})

In [17]:
import torch

# Return torch tensors from the dataset so the labels can be cast with `.to`.
ds_enc.set_format("torch")


def _labels_to_float(example):
    """Return the example's label tensor cast to torch.float under a temporary key."""
    return {"float_labels": example["labels"].to(torch.float)}


# Write the float copy under a temporary name, drop the original column, then
# give the float column the expected "labels" name again.
ds_enc = ds_enc.map(_labels_to_float, remove_columns=["labels"])
ds_enc = ds_enc.rename_column("float_labels", "labels")

Loading cached processed dataset at /home/khoa/.cache/huggingface/datasets/go_emotions/raw/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d/cache-d85f949d80d8620c.arrow


In [18]:
# Display the column schema of the encoded training split (checks the label dtype).
ds_enc['train'].features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)}

The dataset is now fully preprocessed. Next, define the model and the training parameters.

In [19]:
import torch

# Use the first CUDA device when torch reports one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

cuda:0


In [20]:
from transformers import AutoModelForSequenceClassification

# Classification head with one output per entry in `emotions`, configured for
# multi-label classification (an example may carry several emotions at once).
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(emotions),
    problem_type="multi_label_classification",
    # Store the emotion names in the model config so predictions are reported
    # by name instead of the generic LABEL_<n> placeholders.
    id2label=dict(enumerate(emotions)),
    label2id={name: index for index, name in enumerate(emotions)},
)
# Move the weights to the selected device before training.
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h384-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from transformers import Trainer, TrainingArguments

# "test_trainer" is the output directory handed to TrainingArguments. No
# evaluation is run during training and no evaluation dataset is supplied.
training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=128,
    num_train_epochs=4,
    learning_rate=3e-05,
    evaluation_strategy="no",
)

trainer = Trainer(model=model, args=training_args, train_dataset=ds_enc['train'])

In [22]:
# Run fine-tuning; the value returned by train() is displayed as the cell result.
trainer.train()



  0%|          | 0/6604 [00:00<?, ?it/s]

{'loss': 0.2619, 'learning_rate': 2.7728649303452452e-05, 'epoch': 0.3}
{'loss': 0.1573, 'learning_rate': 2.5457298606904906e-05, 'epoch': 0.61}
{'loss': 0.1538, 'learning_rate': 2.318594791035736e-05, 'epoch': 0.91}
{'loss': 0.1469, 'learning_rate': 2.0914597213809815e-05, 'epoch': 1.21}
{'loss': 0.1417, 'learning_rate': 1.8643246517262266e-05, 'epoch': 1.51}
{'loss': 0.1369, 'learning_rate': 1.6371895820714717e-05, 'epoch': 1.82}
{'loss': 0.1328, 'learning_rate': 1.4100545124167173e-05, 'epoch': 2.12}
{'loss': 0.1292, 'learning_rate': 1.1829194427619624e-05, 'epoch': 2.42}
{'loss': 0.1273, 'learning_rate': 9.557843731072078e-06, 'epoch': 2.73}
{'loss': 0.1253, 'learning_rate': 7.286493034524531e-06, 'epoch': 3.03}
{'loss': 0.1246, 'learning_rate': 5.015142337976984e-06, 'epoch': 3.33}
{'loss': 0.1236, 'learning_rate': 2.7437916414294368e-06, 'epoch': 3.63}
{'loss': 0.1228, 'learning_rate': 4.724409448818898e-07, 'epoch': 3.94}
{'train_runtime': 570.2415, 'train_samples_per_second': 1

TrainOutput(global_step=6604, training_loss=0.14458808474364532, metrics={'train_runtime': 570.2415, 'train_samples_per_second': 1481.653, 'train_steps_per_second': 11.581, 'train_loss': 0.14458808474364532, 'epoch': 4.0})

Export PyTorch model to ONNX format for serving with ONNX Runtime Web 

In [23]:
import transformers
import transformers.convert_graph_to_onnx as onnx_convert
from pathlib import Path

In [24]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# NOTE(review): this rebinds `tokenizer` and `model` to a checkpoint loaded by
# name, so the cells below operate on that checkpoint and not on the model
# fine-tuned earlier in this notebook -- confirm that this is intended.
hub_checkpoint = "bergum/xtremedistil-l6-h384-go-emotion"
tokenizer = AutoTokenizer.from_pretrained(hub_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(hub_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/365 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [25]:
# Bundle the model and tokenizer into a text-classification pipeline; the same
# object is handed to the ONNX exporter further down.
pipeline = transformers.pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
)

In [32]:
# Quick sanity check: classify one short sentence and display the result.
pipeline("I am sad")

[{'label': 'sadness 😞', 'score': 0.2828780710697174}]

In [27]:
# Export the pipeline to "extreme-go-emotion.onnx" using ONNX opset 11, with
# the external data format switched off.
onnx_convert.convert_pytorch(
    pipeline,
    opset=11,
    output=Path("extreme-go-emotion.onnx"),
    use_external_format=False,
)

Using framework PyTorch: 2.0.0+cu117
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input token_type_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch'}
Ensuring inputs are in correct order
position_ids is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask', 'token_type_ids']
verbose: False, log level: Level.ERROR



In [28]:
from onnxruntime.quantization import QuantType, quantize_dynamic

# Dynamically quantize the exported ONNX model with QUInt8 weights and write
# the result to a separate "-int8" file, leaving the original export in place.
quantize_dynamic(
    model_input="extreme-go-emotion.onnx",
    model_output="extreme-go-emotion-int8.onnx",
    weight_type=QuantType.QUInt8,
)

Ignore MatMul due to non constant B: /[/bert/encoder/layer.0/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.0/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.1/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.1/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.2/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.2/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.3/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.3/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.4/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.4/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.5/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.5/atten