In [1]:
# Colab setup: uncomment to install the dependencies. `%pip` targets the running
# kernel's environment; the pins are the versions shown in the install log below.
# %pip install -q transformers==4.27.0 datasets==2.10.1 evaluate==0.4.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.0-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m764.7 kB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>

In [2]:
import transformers
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import pandas as pd
import numpy as np
import evaluate
import torch
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Fetch the three predefined splits of the emotion dataset, one call per split,
# in the order train -> validation -> test.
train, valid, test = (
    load_dataset('dair-ai/emotion', 'split', split=split_name)
    for split_name in ('train', 'validation', 'test')
)
print('size of train: {}, validation: {}, test: {}'.format(*map(len, (train, valid, test))))

Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Downloading and preparing dataset emotion/split to /root/.cache/huggingface/datasets/dair-ai___emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset emotion downloaded and prepared to /root/.cache/huggingface/datasets/dair-ai___emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd. Subsequent calls will reuse this data.




size of train: 16000, validation: 2000, test: 2000


Here the validation and test splits happen to be the same size (2,000 examples each); in practice, split sizes are often uneven.

# Preprocessing

In [4]:
# Peek at the first raw training record to see which fields each example carries.
train[0]

{'text': 'i didnt feel humiliated', 'label': 0}

In [5]:
# Load the tokenizer files published under the 'bert-base-uncased' name.
# Only the tokenizer is taken from that checkpoint here.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
def tokenizing(record):
    """Tokenize the ``text`` field of a record (or a batch of records).

    Relies on the notebook-level ``tokenizer``. Inputs longer than 300 tokens
    are truncated; padding is not requested here.

    Returns the tokenizer's output for ``record['text']``.
    """
    texts = record['text']
    encoded = tokenizer(texts, max_length=300, truncation=True)
    return encoded

# Apply `tokenizing` to every split in batched mode (train, then validation, then test).
train_tokenized, valid_tokenized, test_tokenized = (
    split.map(tokenizing, batched=True) for split in (train, valid, test)
)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [8]:
# Same record after tokenization: the original fields plus the columns added by the tokenizer.
train_tokenized[0]

{'text': 'i didnt feel humiliated',
 'label': 0,
 'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

# Define metrics

In [10]:
# Load each metric once at cell level so `metrics` can reuse the objects on every call.
accuracy, f1, precision, recall = (
    evaluate.load(metric_name)
    for metric_name in ('accuracy', 'f1', 'precision', 'recall')
)


def metrics(pred):
    """Compute accuracy plus macro-averaged F1, precision and recall.

    Args:
        pred: a pair ``(predictions, labels)``; ``predictions`` holds one row of
            class scores per example and is reduced with ``argmax`` over axis 1.

    Returns:
        A single dict merging the results of the four metric objects, in the
        order accuracy, f1, precision, recall.
    """
    scores_per_class, labels = pred
    predicted = np.argmax(scores_per_class, axis=1)

    macro = {'average': 'macro'}
    results = {}
    # Accuracy takes no averaging mode; the other three are macro-averaged.
    for metric, extra in ((accuracy, {}), (f1, macro), (precision, macro), (recall, macro)):
        results.update(metric.compute(predictions=predicted, references=labels, **extra))
    return results

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

# Define model

In [11]:
# Seed the Python/NumPy/PyTorch RNGs *before* building the model. The classifier is
# constructed from a bare config (no `from_pretrained` call), so its weights are
# randomly initialised; without a seed set at this point every fresh kernel would
# start from different weights and the reported scores would not be reproducible.
# 42 is an arbitrary fixed value.
transformers.set_seed(42)

# Size the embedding table to the tokenizer's vocabulary.
vocab_size = len(tokenizer.get_vocab())
model_cfg = transformers.BertConfig(
    vocab_size=vocab_size,
    num_labels=6  # one output score per class; predictions below have six columns
)
MyBert = BertForSequenceClassification(model_cfg)

# Set training arguments

In [16]:
# Training configuration. Evaluation, checkpointing and logging are all set to
# run once per epoch; checkpoints are written under ./output.
args = TrainingArguments(
    output_dir='./output',
    # schedule / batch sizes
    num_train_epochs=10,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # optimiser
    optim="adamw_torch",
    learning_rate=2e-5,
    weight_decay=0.2,
    # per-epoch bookkeeping
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    save_strategy='epoch',
)

# Collator built around the same tokenizer, configured to return PyTorch tensors ('pt').
data_collator = DataCollatorWithPadding(return_tensors='pt', tokenizer=tokenizer)

# Train on the tokenized training split, evaluate on the validation split,
# and score each evaluation with `metrics`.
trainer = Trainer(
    model=MyBert,
    args=args,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=metrics,
)

# Train

In [17]:
# Run the full training loop. The returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.106419,0.5955
2,No log,0.676108,0.771
3,No log,0.348282,0.8775
4,0.682600,0.319398,0.8875
5,0.682600,0.282032,0.891
6,0.682600,0.274547,0.897
7,0.682600,0.270564,0.899
8,0.181900,0.27804,0.901
9,0.181900,0.27755,0.899
10,0.181900,0.27575,0.8985


TrainOutput(global_step=1250, training_loss=0.36957623443603516, metrics={'train_runtime': 1591.545, 'train_samples_per_second': 100.531, 'train_steps_per_second': 0.785, 'total_flos': 4956400949520384.0, 'train_loss': 0.36957623443603516, 'epoch': 10.0})

In [18]:
# Optional (left disabled): reload a saved checkpoint from the Trainer's output
# directory instead of using the in-memory model. Adjust the step number to a
# checkpoint that actually exists under ./output.
# model_trained = BertForSequenceClassification.from_pretrained('./output/checkpoint-1000/')

# Evaluate on Test set

In [19]:
# Run the trained model on the held-out test split. The result bundles the raw
# per-class scores (`predictions`), the true labels (`label_ids`) and test metrics.
pred = trainer.predict(test_tokenized)
pred

PredictionOutput(predictions=array([[ 7.0165949e+00, -1.6746609e+00, -1.8239363e+00,  1.5467220e-03,
        -1.2399427e+00, -2.0806143e+00],
       [ 6.6876230e+00, -1.1944501e+00, -2.4416435e+00, -1.0156672e+00,
        -8.3823971e-02, -1.7264107e+00],
       [ 6.7442532e+00, -1.7427825e+00, -1.5358291e+00,  7.6650566e-01,
        -1.1692830e+00, -2.9643319e+00],
       ...,
       [-2.6119847e+00,  7.1800365e+00, -9.8436677e-01, -1.6442600e+00,
        -2.0482655e+00, -4.8150644e-01],
       [-1.4948800e+00,  7.2474575e+00, -5.2126676e-01, -2.0270472e+00,
        -2.2380035e+00, -1.4791368e+00],
       [-6.8568510e-01, -2.6536300e+00, -1.7572117e+00, -2.1163754e+00,
         4.0461621e+00,  3.7217915e+00]], dtype=float32), label_ids=array([0, 0, 0, ..., 1, 1, 4]), metrics={'test_loss': 0.25778356194496155, 'test_accuracy': 0.892, 'test_runtime': 6.606, 'test_samples_per_second': 302.756, 'test_steps_per_second': 2.422})

In [27]:
# Raw per-class scores for the first ten test examples (one row per example).
pred.predictions[:10]

array([[ 7.0165949e+00, -1.6746609e+00, -1.8239363e+00,  1.5467220e-03,
        -1.2399427e+00, -2.0806143e+00],
       [ 6.6876230e+00, -1.1944501e+00, -2.4416435e+00, -1.0156672e+00,
        -8.3823971e-02, -1.7264107e+00],
       [ 6.7442532e+00, -1.7427825e+00, -1.5358291e+00,  7.6650566e-01,
        -1.1692830e+00, -2.9643319e+00],
       [-1.8422629e+00,  7.0467124e+00, -1.0968795e+00, -1.4894482e+00,
        -1.3538961e+00, -2.1228695e+00],
       [ 6.8569651e+00, -1.4998029e+00, -2.2594230e+00, -7.6733273e-01,
        -4.9236500e-01, -1.5739446e+00],
       [-1.1266652e+00,  5.5088532e-01, -2.6890302e+00, -1.3966659e+00,
         5.7976651e+00, -1.1004949e+00],
       [ 1.4859027e+00, -1.9868534e+00, -2.2421050e+00,  5.8768916e+00,
        -1.6447313e+00, -2.1945164e+00],
       [-1.8021313e+00,  4.6182923e+00,  3.0606413e+00, -5.6828356e-01,
        -3.1741593e+00, -3.0398793e+00],
       [-2.1408687e+00,  7.1573124e+00, -1.1566513e+00, -1.4426515e+00,
        -2.3147523e+00, 

In [30]:
# Predicted class for the first 20 test examples: index of the largest score in each row.
torch.tensor(pred.predictions[:20]).argmax(dim=1)

tensor([0, 0, 0, 1, 0, 4, 3, 1, 1, 3, 4, 0, 0, 1, 2, 0, 1, 0, 3, 1])

In [31]:
# Ground-truth labels for the same 20 test examples, for side-by-side comparison.
test_tokenized[:20]['label']

[0, 0, 0, 1, 0, 4, 3, 1, 1, 3, 4, 0, 4, 1, 2, 0, 1, 0, 3, 1]