<a href="https://colab.research.google.com/github/harryypham/MyMLPractice/blob/main/nlp/refined_nbow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
!pip install datasets tokenizers tqdm



In [37]:
from datasets import load_dataset
from tokenizers import Tokenizer
from tqdm import tqdm

In [68]:
# Load the train / validation / test splits of the "simplified" go_emotions configuration.
train_data, val_data, test_data = [
    load_dataset("google-research-datasets/go_emotions", "simplified", split=split_name)
    for split_name in ("train", "validation", "test")
]

In [51]:
# Show one raw training record to see which fields each example carries.
print(train_data[0])

{'text': "My favourite food is anything I didn't have to cook myself.", 'labels': [27], 'id': 'eebbqej'}


In [69]:
# Module-level tokenizer built from the "bert-base-uncased" pretrained definition;
# `tokenize` below reads it as a global.
tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

def tokenize(example):
  """Encode one dataset record's text into token ids.

  Args:
    example: mapping with a 'text' entry holding the string to encode.

  Returns:
    A dict with the single key "tokens" holding the ids produced by the
    module-level `tokenizer` for that text.
  """
  encoding = tokenizer.encode(example['text'])
  return {"tokens": encoding.ids}

# Run `tokenize` over every record of each split (adds the "tokens" column).
train_data, val_data, test_data = [
    split.map(tokenize) for split in (train_data, val_data, test_data)
]

In [53]:
# Sanity-check the tokenized training split: schema, row count, longest
# token sequence, and the first example's text / ids / label container type.
print(train_data)
print("length training data:", len(train_data))
print("length longest sequence:", max(map(len, train_data['tokens'])))
print("example training sample:")
print(
    train_data[0]['text'],
    train_data[0]['tokens'],
    type(train_data[0]['labels']),
    sep="\n",
)

Dataset({
    features: ['text', 'labels', 'id', 'tokens'],
    num_rows: 43410
})
length training data: 43410
length longest sequence: 316
example training sample:
My favourite food is anything I didn't have to cook myself.
[101, 2026, 8837, 2833, 2003, 2505, 1045, 2134, 1005, 1056, 2031, 2000, 5660, 2870, 1012, 102]
<class 'list'>


In [70]:
# torch is imported in this cell because it runs before the cell that holds the
# other torch imports; without it a fresh "Restart & Run All" stops here with
# NameError: name 'torch' is not defined.
import torch

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Number of rows of the embedding table = size of the tokenizer's vocabulary.
vocab_size = tokenizer.get_vocab_size()
# Width of each token embedding.
emb_dim = 256
# Number of output classes; the sample record above carries label id 27, so ids
# presumably run 0..27 — TODO confirm against the dataset's label feature.
num_classes = 28

In [71]:
# Switch every split to the "torch" output format, restricted to the three
# columns that the collate function reads.
train_data, val_data, test_data = [
    split.with_format(type="torch", columns=["tokens", "labels", "text"])
    for split in (train_data, val_data, test_data)
]

In [72]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

def collate_fn(batch, pad_index=0):
  """Merge a list of examples into a single padded batch.

  Args:
    batch: list of examples; each provides "tokens" (a 1-D tensor of ids),
      "labels" (an indexable collection of label tensors) and "text".
    pad_index: value used to right-pad the shorter token sequences.

  Returns:
    A dict with:
      "ids":    tensor [batch size, longest seq len], padded with `pad_index`;
      "labels": tensor [batch size] — only the FIRST label of every example
                is kept, any further labels are dropped;
      "text":   list with the "text" entry of every example.
  """
  padded_ids = nn.utils.rnn.pad_sequence(
      [example["tokens"] for example in batch],
      batch_first=True,
      padding_value=pad_index,
  )
  first_labels = torch.stack([example["labels"][0] for example in batch])
  texts = [example["text"] for example in batch]
  return {"ids": padded_ids, "labels": first_labels, "text": texts}

def get_data_loader(data, batch_size, shuffle=False):
  """Build a DataLoader over `data` that batches examples with `collate_fn`.

  Args:
    data: dataset to iterate over.
    batch_size: number of examples per batch.
    shuffle: whether the loader reshuffles the data (off by default).

  Returns:
    The configured torch DataLoader.
  """
  return DataLoader(
      data,
      batch_size=batch_size,
      shuffle=shuffle,
      collate_fn=collate_fn,
  )

batch_size = 32
# Only the training loader shuffles; validation and test use the default order.
train_loader = get_data_loader(train_data, batch_size, shuffle=True)
val_loader, test_loader = [
    get_data_loader(split, batch_size) for split in (val_data, test_data)
]

In [73]:
# Take the first training batch only and print, one per line: the padded ids
# shape, the labels shape, and the number of texts.
for batch in train_loader:
  print(batch['ids'].shape, batch['labels'].shape, len(batch['text']), sep="\n")
  break

torch.Size([32, 36])
torch.Size([32])
32


### Model

In [82]:
class NBoW(nn.Module):
    """Neural bag-of-words classifier.

    Each token id is embedded, the embeddings of the non-padding tokens are
    averaged into one vector per sequence, and a linear layer maps that vector
    to class logits.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each token embedding.
        num_classes: number of output logits.
        pad_index: id of the padding token (its embedding is kept at zero and it
            is excluded from the average); ``None`` disables padding handling.
    """

    def __init__(self, vocab_size, embedding_dim, num_classes, pad_index=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        self.fc = nn.Linear(embedding_dim, num_classes)

    def forward(self, ids, labels=None):
        """Compute class logits and, when targets are given, the cross-entropy loss.

        Args:
            ids: integer tensor [batch size, seq len] of token ids.
            labels: optional integer tensor [batch size] of target classes.

        Returns:
            ``(logits, loss)`` where logits is [batch size, num_classes] and loss
            is a scalar tensor, or ``None`` when `labels` is not provided.
        """
        # embedded = [batch size, seq len, embedding dim]
        embedded = self.embedding(ids)
        pad_index = self.embedding.padding_idx
        if pad_index is None:
            pooled = embedded.mean(dim=1)
        else:
            # Average over real tokens only. A plain mean would divide by the
            # padded length, making the output depend on how much padding the
            # rest of the batch happened to require.
            mask = (ids != pad_index).unsqueeze(-1).to(embedded.dtype)
            # clamp avoids 0/0 for a sequence made up entirely of padding
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled = (embedded * mask).sum(dim=1) / token_counts
        # pooled = [batch size, embedding dim]
        logits = self.fc(pooled)
        # logits = [batch size, num classes]
        loss = F.cross_entropy(logits, labels) if labels is not None else None
        return logits, loss

In [83]:

# Instantiate the classifier on the selected device and optimise all of its
# parameters with AdamW.
model = NBoW(vocab_size, emb_dim, num_classes).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

In [84]:
def train(data_loader, model, optimizer, device):
    """Run one optimisation pass over `data_loader`.

    Args:
        data_loader: iterable of batches with "ids" and "labels" tensors.
        model: module called as ``model(ids, labels)`` returning ``(logits, loss)``.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.

    Returns:
        ``(mean loss, mean accuracy)``, each averaged over the batches seen.
    """
    def _mean(values):
        # plain arithmetic mean of the per-batch values collected so far
        return sum(values) / len(values)

    model.train()
    batch_losses, batch_accs = [], []
    progress = tqdm(data_loader, desc="training...")
    for batch in progress:
        ids, label = batch["ids"].to(device), batch["labels"].to(device)
        logits, loss = model(ids, label)
        # accuracy is measured on the logits computed before this step's update
        accuracy = get_accuracy(logits, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
        batch_accs.append(accuracy.item())
        # show the running averages next to the progress bar
        progress.set_postfix_str(
            f"train_loss: {_mean(batch_losses)}, train_acc: {_mean(batch_accs)}"
        )

    return _mean(batch_losses), _mean(batch_accs)

def evaluate(data_loader, model, device):
    """Compute the mean loss and accuracy of `model` over `data_loader`.

    No gradients are tracked and no parameters are updated.

    Args:
        data_loader: iterable of batches with "ids" and "labels" tensors.
        model: module called as ``model(ids, labels)`` returning ``(logits, loss)``.
        device: device the batch tensors are moved to.

    Returns:
        ``(mean loss, mean accuracy)``, each averaged over the batches seen.

    Raises:
        ZeroDivisionError: if `data_loader` yields no batches.
    """
    model.eval()
    epoch_losses = []
    epoch_accs = []
    with torch.no_grad():
        # Bind the loop variable to `batch`: the body must read the batches of
        # this loader, not a stale global `batch` left over from another cell.
        for batch in tqdm(data_loader, desc="evaluating..."):
            ids = batch["ids"].to(device)
            # targets are stored under "labels" (the key the training loop reads too)
            label = batch["labels"].to(device)
            logits, loss = model(ids, label)
            accuracy = get_accuracy(logits, label)
            epoch_losses.append(loss.item())
            epoch_accs.append(accuracy.item())
    # switch back to training mode before returning
    model.train()
    return sum(epoch_losses)/len(epoch_losses), sum(epoch_accs)/len(epoch_accs)

def get_accuracy(prediction, label):
    """Fraction of rows whose highest-scoring class equals the target.

    Args:
        prediction: 2-D tensor of scores, one row per example.
        label: tensor of target class ids, one per row.

    Returns:
        A scalar tensor: number of correct arg-max predictions / number of rows.
    """
    n_examples, _ = prediction.shape
    hits = prediction.argmax(dim=-1).eq(label).sum()
    return hits / n_examples

In [85]:
import collections

n_epochs = 10
best_valid_loss = float("inf")

# per-epoch history of losses and accuracies, keyed by metric name
metrics = collections.defaultdict(list)

for epoch in range(n_epochs):
    train_loss, train_acc = train(train_loader, model, optimizer, device)
    valid_loss, valid_acc = evaluate(val_loader, model, device)

    metrics["train_losses"].append(train_loss)
    metrics["train_accs"].append(train_acc)
    metrics["valid_losses"].append(valid_loss)
    metrics["valid_accs"].append(valid_acc)

    # checkpoint the weights whenever the validation loss reaches a new minimum
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "nbow.pt")

    print(
        f"epoch: {epoch}",
        f"train_loss: {train_loss:.3f}, train_acc: {train_acc:.3f}",
        f"valid_loss: {valid_loss:.3f}, valid_acc: {valid_acc:.3f}",
        sep="\n",
    )

training...:  19%|█▊        | 252/1357 [00:38<02:47,  6.59it/s, train_loss: 3.1047348900446816, train_acc: 0.24206349206349206]


KeyboardInterrupt: 