In [None]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import pandas as pd
import numpy as np
import evaluate
import torch
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the three predefined splits of the emotion dataset and report their sizes.
DATASET_ID, DATASET_CONFIG = 'dair-ai/emotion', 'split'
train = load_dataset(DATASET_ID, DATASET_CONFIG, split='train')
valid = load_dataset(DATASET_ID, DATASET_CONFIG, split='validation')
test = load_dataset(DATASET_ID, DATASET_CONFIG, split='test')
split_sizes = [len(split) for split in (train, valid, test)]
print('size of train: {}, validation: {}, test: {}'.format(*split_sizes))

In [None]:
# Materialise the training split as a DataFrame to inspect the label distribution.
df = pd.DataFrame(train)
# Use one bin per distinct label, centred on the integer label ids.
# A fixed `bins=5` over six labels (0-5) merges the last two classes into a
# single bar because the final histogram bin is closed on both ends.
# Assumes label ids are the consecutive integers 0..n-1 (as in `id2label`).
n_labels = df['label'].nunique()
df['label'].hist(bins=np.arange(n_labels + 1) - 0.5)

In [None]:
# Count how many training examples carry each label id.
df['label'].value_counts()

The classes are unevenly represented. Such class imbalance is also common in real-world data.

# Preprocessing

In [None]:
# Peek at the first raw training record.
train[0]

In [None]:
# Tokenizer loaded from the same pretrained checkpoint name used for the model.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

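In most cases, 300 words are enough to write a comment. Even when a text is longer, its main idea usually appears within the first 300 words. For text classification, it is acceptable to cut off some sentences as long as the main points remain. (Note that `max_length` below counts tokens rather than words, so the actual cut-off may be somewhat shorter than 300 words.)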


In [None]:
# Check which type the 'text' column access returns.
type(train['text'])

In [None]:
def tokenizing(record):
    """Tokenize the ``text`` field of ``record`` with the module-level tokenizer.

    Sequences longer than 300 tokens are truncated. Padding is not applied here.

    Args:
        record: mapping with a ``'text'`` entry. Because the ``map`` calls below
            use ``batched=True``, this is a batch of rows rather than a single row.

    Returns:
        The tokenizer output for ``record['text']``.
    """
    encoded = tokenizer(record['text'], max_length=300, truncation=True)
    return encoded


def _tokenize_split(split):
    """Apply ``tokenizing`` to every row of ``split`` in batched mode."""
    return split.map(tokenizing, batched=True)


train_tokenized = _tokenize_split(train)
valid_tokenized = _tokenize_split(valid)
test_tokenized = _tokenize_split(test)

In [None]:
# Inspect the first training record after tokenization.
train_tokenized[0]

In [None]:
# Emotion names listed in label-id order: position i is the name of label id i.
_EMOTION_NAMES = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

# Forward (id -> name) and reverse (name -> id) lookup tables for the model config.
id2label = dict(enumerate(_EMOTION_NAMES))
label2id = {name: idx for idx, name in id2label.items()}

# Define metrics

In [None]:
# Accuracy metric, loaded once and reused on every evaluation pass.
accuracy = evaluate.load('accuracy')


def metrics(pred):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: a pair ``(predictions, labels)``. The predictions are reduced
            with ``argmax`` over axis 1, so they are presumably per-class
            scores of shape (n_samples, n_classes) — confirm against the Trainer.

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids
        against the reference labels.
    """
    scores, references = pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Load Pre-trained Model

In [None]:
# Load the pretrained 'bert-base-uncased' checkpoint as a sequence-classification
# model configured with the six emotion labels and their id <-> name mappings.
bert = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    label2id=label2id,
    id2label=id2label,
    num_labels=len(id2label),
)

# Set training arguments

In [None]:
# Hyper-parameters for the fine-tuning run. Evaluation and checkpointing both
# happen once per epoch.
training_config = dict(
    output_dir='./output',
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_strategy='steps',
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy='epoch',
    save_strategy='epoch',
)
args = TrainingArguments(**training_config)

# Collator that pads each batch and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='pt')

# Trainer wiring: model, arguments, tokenized train/validation data and metric.
trainer = Trainer(
    model=bert,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=metrics,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,
)

# Train

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Reload fine-tuned weights from a checkpoint directory under './output'.
# NOTE(review): the step number in the path is hard-coded; presumably it matches
# a checkpoint the Trainer wrote during training — confirm after the run, since
# the step count depends on dataset size, batch size and device count.
checkpoint_dir = './output/checkpoint-500/'
model_trained = BertForSequenceClassification.from_pretrained(checkpoint_dir)

# Evaluate on Test set

In [None]:
# Run the trainer's model on the tokenized test split; the returned prediction
# output is displayed as the cell result.
trainer.predict(test_dataset=test_tokenized)