# Aim
In this notebook, I'll use a TransformerEncoderLayer to perform text classification task. For simplicity, I'll directly use the pre-trained tokenizer since this doesn't affect the classification very much.

In [157]:
# !pip install datasets transformers evaluate torchtext==0.6

In [158]:
import torch
from torch import nn, Tensor
from torch.nn.modules import TransformerEncoderLayer, TransformerEncoder
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torchtext import data
import evaluate
import numpy as np
from datasets import load_dataset
import time
import math
from tempfile import TemporaryDirectory
import os
import random
from typing import Tuple

In [159]:
BATCH_SIZE = 64
TOKEN_LENGTH = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
random.seed(42)

# Load the dataset

In [160]:
train = load_dataset('dair-ai/emotion', 'split', split='train')
valid = load_dataset('dair-ai/emotion', 'split', split='validation')
test = load_dataset('dair-ai/emotion', 'split', split='test')
print('size of train: {}, validation: {}, test: {}'.format(len(train), len(valid), len(test)))

Found cached dataset emotion (/Users/xinglanl/.cache/huggingface/datasets/dair-ai___emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd)
Found cached dataset emotion (/Users/xinglanl/.cache/huggingface/datasets/dair-ai___emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd)
Found cached dataset emotion (/Users/xinglanl/.cache/huggingface/datasets/dair-ai___emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd)


size of train: 16000, validation: 2000, test: 2000


# Pre-process data with pre-trained model

In [161]:
# [p.values() for p in train.to_iterable_dataset()]

In [162]:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [163]:
# tokenizer(train['text'][0])
train

Dataset({
    features: ['text', 'label'],
    num_rows: 16000
})

In [164]:
TEXT = data.Field(sequential=True, lower=True, tokenize=str.split, batch_first=True, fix_length=TOKEN_LENGTH)
LABEL = data.LabelField(sequential=False, use_vocab=False)
fields = [('text', TEXT), ('label', LABEL)]
# examples = [data.Example.fromlist([k, v], fields) for k,v in zip(train['text'], train['label'])]
# print(type(examples[0]))
# print(examples[0].text)
# print(examples[0].label)
class DataFrameDataset(data.Dataset):

    def __init__(self, df, text_field, label_field, is_test=False, **kwargs):
        fields = [('text', text_field), ('label', label_field)]
        examples = []
        for i, row in df.iterrows():
            label = row.label
            text = row.text
            examples.append(data.Example.fromlist([text, label], fields))

        super().__init__(examples, fields, **kwargs)
train_dataset = DataFrameDataset(train.to_pandas(), TEXT,LABEL)
valid_dataset = DataFrameDataset(valid.to_pandas(), TEXT,LABEL)
test_dataset = DataFrameDataset(test.to_pandas(), TEXT,LABEL)
new_corpus = [e.text for e in train_dataset.examples]

In [165]:
TEXT.build_vocab(new_corpus)
LABEL.build_vocab(train['label'])
r = TEXT.process(train['text'][1000].split())
r.shape

torch.Size([26, 10])

In [166]:
print("Size of TEXT vocabulary:",len(TEXT.vocab))
print("Size of LABEL vocabulary:",len(LABEL.vocab))

Size of TEXT vocabulary: 15214
Size of LABEL vocabulary: 6


In [167]:
train_iterator,valid_iterator,test_iterator= data.BucketIterator.splits(
    (train_dataset, valid_dataset,test_dataset),
    batch_size = BATCH_SIZE,
    sort =False,
    shuffle=False)
[ b for b in test_iterator]

[
 [torchtext.data.batch.Batch of size 64]
 	[.text]:[torch.LongTensor of size 64x10]
 	[.label]:[torch.LongTensor of size 64],
 
 [torchtext.data.batch.Batch of size 64]
 	[.text]:[torch.LongTensor of size 64x10]
 	[.label]:[torch.LongTensor of size 64],
 
 [torchtext.data.batch.Batch of size 64]
 	[.text]:[torch.LongTensor of size 64x10]
 	[.label]:[torch.LongTensor of size 64],
 
 [torchtext.data.batch.Batch of size 64]
 	[.text]:[torch.LongTensor of size 64x10]
 	[.label]:[torch.LongTensor of size 64],
 
 [torchtext.data.batch.Batch of size 64]
 	[.text]:[torch.LongTensor of size 64x10]
 	[.label]:[torch.LongTensor of size 64],
 
 [torchtext.data.batch.Batch of size 64]
 	[.text]:[torch.LongTensor of size 64x10]
 	[.label]:[torch.LongTensor of size 64],
 
 [torchtext.data.batch.Batch of size 64]
 	[.text]:[torch.LongTensor of size 64x10]
 	[.label]:[torch.LongTensor of size 64],
 
 [torchtext.data.batch.Batch of size 64]
 	[.text]:[torch.LongTensor of size 64x10]
 	[.label]:[torch.

In [168]:
# def tokenizing(record):
#
#     return tokenizer(record['text'], truncation=True, max_length=300)
#
#
# train_tokenized = train.map(tokenizing, batched=True)
# valid_tokenized = valid.map(tokenizing, batched=True)
# test_tokenized = test.map(tokenizing, batched=True)

In [169]:
# def func(l):
#     t = torch.Tensor(7)
#     t[l] = 1
#     return t
#
#
# print(train_tokenized['label'][:2])
# [func(l) for l in train_tokenized['label'][:2]]

In [170]:
# train_tokenized[0]

In [171]:
# id2label = {
#     0: 'sadness',
#     1: 'joy',
#     2: 'love',
#     3: 'anger',
#     4: 'fear',
#     5: 'surprise'
# }
# label2id = {v: k for k, v in id2label.items()}

# Define metrics

In [172]:
accuracy = evaluate.load('accuracy')


def metrics(pred, true):
    predictions = np.argmax(pred, axis=1)
    return accuracy.compute(predictions=predictions, references=true)

# Define model

In [173]:
class TransformerModel(nn.Module):

    def __init__(self, d_model: int, nhead: int, d_hid: int, nlayers: int, out_features: int,
                in_between: int = 128, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer text classificator'
        self.linear_in = TOKEN_LENGTH * d_model
        self.encoder = nn.Embedding(len(TEXT.vocab), d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.linear1 = nn.modules.Linear(self.linear_in,   in_between)
        self.output = nn.modules.Linear( in_between, out_features)
        self.d_model = d_model

    def forward(self, src: Tensor, mask: Tensor, padding_masks: Tensor) -> Tensor:
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        x = self.transformer_encoder(src, mask=mask, src_key_padding_mask=padding_masks)
        x = x.reshape(-1, self.linear_in)
        x = F.relu(self.linear1(x))
        output = self.output(x)
        return output

In [174]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

# training arguments

In [175]:
emsize = 300  # embedding dimension
d_hid = 200  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
out_features = 6  # num of categories
nhead = 2  # number of heads in nn.MultiheadAttention
dropout = 0  # dropout probability
model = TransformerModel(emsize, nhead, d_hid, nlayers, out_features, dropout=dropout).to(device)
model

TransformerModel(
  (encoder): Embedding(15214, 300)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=300, out_features=300, bias=True)
        )
        (linear1): Linear(in_features=300, out_features=200, bias=True)
        (dropout): Dropout(p=0, inplace=False)
        (linear2): Linear(in_features=200, out_features=300, bias=True)
        (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0, inplace=False)
        (dropout2): Dropout(p=0, inplace=False)
      )
    )
  )
  (linear1): Linear(in_features=3000, out_features=128, bias=True)
  (output): Linear(in_features=128, out_features=6, bias=True)
)

In [176]:
def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

In [177]:
def reshape_embeddings(dataset):
    embeddings = [torch.tensor(l, dtype=torch.float) for l in dataset['input_ids']]
    # embeddings.append(torch.zeros(emsize))    # add extract tensor to control the length of each text
    # embeddings = pad_sequence(embeddings, batch_first=True)[:-1]    # remove the controal tensor
    return embeddings


# def reshape_embeddings(dataset):
#     for l in dataset['input_ids']:

# train_embeddings = reshape_embeddings(train_tokenized)
# valid_embeddings = reshape_embeddings(valid_tokenized)
# test_embeddings = reshape_embeddings(test_tokenized)
# train_labels = torch.tensor(train_tokenized['label'])
# valid_labels = torch.tensor(valid_tokenized['label'])
# test_labels = torch.tensor(test_tokenized['label'])

In [187]:
criterion = nn.CrossEntropyLoss()
lr = 1e-3  # learning rate
optimizer = torch.optim.Adagrad(model.parameters(), lr=lr)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)


def train(model: nn.Module, epoch: int, dataset: data.Iterator) -> None:
    model.train()  # turn on train mode
    # total_loss = 0.
    # log_interval = 10
    # start_time = time.time()
    # src_mask = generate_square_subsequent_mask(BATCH_SIZE).to(device)

    for batch in dataset:
        texts = batch.text
        labels = batch.label
        outputs = model(texts.to(device), None, None)
        loss = criterion(outputs, labels.to(device))

        optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        if dataset.iterations % 100 == 0:
            loss, current = loss.item(), (dataset.iterations + 1) * len(batch)
            # print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
            print('loss: {:.7f} progress: [{:>5d}/{:>5d}]'.format(loss, current, len(dataset.dataset)) )
        # total_loss += loss.item()
        # if dataset.iterations % log_interval == 0 and dataset.iterations > 0:
        #     # lr = scheduler.get_last_lr()[0]
        #     ms_per_batch = (time.time() - start_time) * 1000 / log_interval
        #     cur_loss = total_loss / log_interval
        #     ppl = math.exp(cur_loss)
        #     print(f'| epoch {epoch:3d} | {(dataset.iterations):5d}/{len(dataset):5d} batches | '
        #           f'lr {lr:04.4f} | ms/batch{ms_per_batch:5.2f} | '
        #           f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
        #     total_loss = 0
        #     start_time = time.time()


def evaluate(model: nn.Module, dataset: data.Iterator) -> Tuple[float, float, list, list]:
    model.eval()  # turn on evaluation mode
    total_loss, acc = 0.0, 0.0
    pred_y_arr = list()
    comparison_arr = list()
    src_mask = generate_square_subsequent_mask(BATCH_SIZE).to(device)
    with torch.no_grad():
        for i,batch in enumerate(dataset):
            texts = (batch.text).to(device)
            labels = (batch.label).to(device)
            # outputs = model(texts.to(device))
            outputs = model(texts, None, None)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            pred_y = outputs.argmax(1)
            pred_y_arr.append(pred_y)
            compare = (pred_y == labels).type(torch.float)
            comparison_arr.append(compare)
            acc += compare.sum().item()

        total_loss /= (i+1)
        acc /= len(dataset.dataset)
        print('Evaluate: avg loss: {:8>f} , accuracy: {:0.1f}'.format(total_loss, 100*acc))
    return total_loss, acc, pred_y_arr, comparison_arr


# Train

In [188]:
best_val_loss = float('inf')
epochs = 80

with TemporaryDirectory() as tempdir:
    best_model_params_path = os.path.join(tempdir, "best_model_params.pt")

    for epoch in range(1, epochs + 1):
        print('-'*30 + 'epoch: {}'.format(epoch) + '-'*30 )
        epoch_start_time = time.time()
        train(model, epoch, train_iterator)
        val_loss, *_ = evaluate(model, valid_iterator)

        elapsed = time.time() - epoch_start_time
        print('epoch elapsed time: {}'.format(elapsed))

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), best_model_params_path)

        # scheduler.step()
    model.load_state_dict(torch.load(best_model_params_path))  # load best model states

------------------------------epoch: 1------------------------------
loss: 0.9334722 progress: [ 6464/16000]
loss: 0.9806864 progress: [12864/16000]
Evaluate: avg loss: 1.479408 , accuracy: 47.1
epoch elapsed time: 5.700689315795898
------------------------------epoch: 2------------------------------
loss: 0.7278088 progress: [ 6464/16000]
loss: 0.8074391 progress: [12864/16000]
Evaluate: avg loss: 1.506570 , accuracy: 48.4
epoch elapsed time: 5.538240194320679
------------------------------epoch: 3------------------------------
loss: 0.6265860 progress: [ 6464/16000]
loss: 0.6763330 progress: [12864/16000]
Evaluate: avg loss: 1.536326 , accuracy: 49.1
epoch elapsed time: 5.591440916061401
------------------------------epoch: 4------------------------------
loss: 0.5430669 progress: [ 6464/16000]
loss: 0.5861725 progress: [12864/16000]
Evaluate: avg loss: 1.570173 , accuracy: 49.5
epoch elapsed time: 5.604636907577515
------------------------------epoch: 5------------------------------

In [189]:
*_, comp = evaluate(model, test_iterator)
comp[0]

Evaluate: avg loss: 1.421456 , accuracy: 49.1


tensor([0., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0.,
        0., 0., 1., 0., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1.,
        1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 0., 1.,
        0., 0., 0., 1., 1., 0., 1., 1., 0., 0.])

In [183]:
torch.save(model, 'output/encoder-03-15')