In [None]:
import torch
from typing import Callable, List, Tuple, Dict
from torchtext.datasets import AG_NEWS

# Report whether PyTorch can see a CUDA device.
print(f'GPU is available {torch.cuda.is_available()}')

# Hyperparameters shared by the cells below.
BATCH_SIZE = 8  # samples per DataLoader batch
EPOCHS = 10  # number of passes made by the training loop
# Initial learning rate handed to the SGD optimizer.
# NOTE(review): 64 is unusually large for SGD; train() clips the gradient norm to 0.1,
# which bounds each update — confirm this value is intended.
LR = 64

In [None]:
# Get an iterator over the training split and display its first record.
# Later cells unpack each record as a (label, text) pair.
train_iter = iter(AG_NEWS(split='train'))
next(train_iter)

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# torchtext's built-in 'basic_english' tokenizer: maps a string to a list of tokens.
tokenizer = get_tokenizer('basic_english')
train_iter = AG_NEWS(split='train')

def yield_tokens(data_iter):
    """Yield the token list of every sample's text, ignoring its label.

    Args:
        data_iter: Iterable of ``(label, text)`` pairs.

    Yields:
        ``tokenizer(text)`` for each pair, in iteration order.
    """
    for _, text in data_iter:
        yield tokenizer(text)

# Build the token -> index vocabulary from the training split.
# "<unk>" is registered as a special token and set as the default index, so
# tokens missing from the vocabulary resolve to its index.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

# Raw string -> list of vocabulary indices.
text_pipeline: Callable = lambda x: vocab(tokenizer(x))



In [None]:
# Sanity check: display the indices of a few tokens. 'foo' is presumably absent
# from the vocabulary and should resolve to the "<unk>" default index — verify in the output.
vocab(['the', 'a', 'it', 'foo', 'hello', 'world'])

In [None]:
def text_pipeline(x: str) -> List[int]:
    """Tokenize a raw string and map every token to its vocabulary index.

    Args:
        x: Raw input text.

    Returns:
        The vocabulary indices of the tokens, in order.
    """
    return vocab(tokenizer(x))


def label_pipeline(x) -> int:
    """Shift a dataset label down by one so that class ids start at 0.

    Args:
        x: Label as an int or anything ``int()`` accepts; assumed to start at 1,
           as the ``- 1`` implies.

    Returns:
        ``int(x) - 1``.
    """
    return int(x) - 1

In [None]:
from torch.utils.data import DataLoader
# Prefer the GPU when one is available; otherwise fall back to the CPU so that
# every `.to(device)` below still works on CPU-only machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    """Merge a list of ``(label, text)`` samples into EmbeddingBag-style tensors.

    Every text is converted to a tensor of token ids and all of them are
    concatenated into one flat tensor; ``offsets`` holds the position at which
    each sample starts inside that flat tensor.

    Args:
        batch: Sequence of ``(label, text)`` pairs.

    Returns:
        Tuple ``(label, text, offsets)`` of tensors, each moved to ``device``.
    """
    targets, token_chunks, boundaries = [], [], [0]
    for raw_label, raw_text in batch:
        targets.append(label_pipeline(raw_label))
        token_ids = torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        token_chunks.append(token_ids)
        boundaries.append(token_ids.size(0))
    # Drop the final length, then prefix-sum: entry i becomes the start of sample i.
    start_positions = torch.tensor(boundaries[:-1]).cumsum(dim=0)
    flat_tokens = torch.cat(token_chunks)
    target_tensor = torch.tensor(targets)
    return (
        target_tensor.to(device),
        flat_tokens.to(device),
        start_positions.to(device),
    )

# Example loader over the raw training split; collate_batch turns every batch
# into (label, text, offsets) tensors.
# NOTE(review): `dataloader` is not referenced again in the visible cells — the
# training cell builds its own loaders. Confirm whether this one is still needed.
train_iter = AG_NEWS(split='train')
dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

In [None]:
from torch import nn

class TextClassificationModel(nn.Module):
    """Bag-of-embeddings text classifier.

    An ``nn.EmbeddingBag`` reduces each token sequence to one ``embed_dim``
    vector, and a single linear layer maps that vector to ``num_class`` scores.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        """Create the layers and initialise their weights.

        Args:
            vocab_size: Number of rows in the embedding table.
            embed_dim: Width of each embedding vector.
            num_class: Number of output scores per sample.
        """
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw both weight matrices from U(-0.5, 0.5) and zero the linear bias."""
        bound = 0.5
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.uniform_(self.fc.weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text, offsets):
        """Score a batch of texts.

        Args:
            text: Flat tensor of token indices for all samples, concatenated.
            offsets: Start position of each sample inside ``text``.

        Returns:
            Tensor with one row of ``num_class`` raw scores per offset.
        """
        pooled = self.embedding(text, offsets)
        scores = self.fc(pooled)
        return scores

# Count the distinct labels by scanning the training split once.
num_class = len({label for label, _ in train_iter})
vocab_size = len(vocab)
emsize = 64  # width of each embedding vector
model = TextClassificationModel(vocab_size, emsize, num_class).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# Each scheduler.step() call multiplies the learning rate by 0.1.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)


In [None]:
import time

def train(dataloader, epoch):
    """Run one training epoch over ``dataloader`` and log the running accuracy.

    Operates on the notebook-level ``model``, ``optimizer`` and ``criterion``.

    Args:
        dataloader: Iterable of ``(label, text, offsets)`` batches; must support
            ``len()`` because the batch count appears in the progress log.
        epoch: Epoch number, used only in the progress log.
    """
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 200

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        # Cap the total gradient norm at 0.1 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        # Every `log_interval` batches: report accuracy since the last report,
        # then reset the running counters.
        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0

def evaluate(dataloader):
    """Return the fraction of samples in ``dataloader`` that ``model`` gets right.

    Puts the notebook-level ``model`` into eval mode and runs without gradient
    tracking.

    Args:
        dataloader: Iterable of ``(label, text, offsets)`` batches.

    Returns:
        Correct predictions divided by the number of samples seen.
    """
    model.eval()
    correct = 0
    seen = 0
    with torch.no_grad():
        for label, text, offsets in dataloader:
            scores = model(text, offsets)
            hits = scores.argmax(1) == label
            correct += hits.sum().item()
            seen += label.size(0)
    return correct / seen

In [None]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset


# Best validation accuracy seen so far; None until the first epoch has finished.
total_acc = None
train_iter, test_iter = AG_NEWS()

train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
# Hold out 5% of the training data for validation.
# NOTE(review): random_split is unseeded here, so the split differs between runs.
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
# NOTE(review): test_dataloader is built but not evaluated in the visible cells.
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

for epoch in range(1, EPOCHS+1):
    epoch_start_time = time.time()
    train(train_dataloader, epoch)
    valid_acc = evaluate(valid_dataloader)
    # Decay the learning rate only when validation accuracy drops below the best
    # value seen so far. Stepping on every epoch would shrink the rate tenfold
    # per epoch regardless of progress.
    if total_acc is not None and total_acc > valid_acc:
        scheduler.step()
    else:
        total_acc = valid_acc
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid accuracy {:8.3f} '.format(epoch, (time.time() - epoch_start_time), valid_acc))
    print('-' * 89)


In [None]:
def predict(text, text_pipeline):
    """Return the class probabilities the notebook-level ``model`` assigns to one text.

    Args:
        text: Raw input string.
        text_pipeline: Callable mapping a string to a list of token indices.

    Returns:
        NumPy array with one row (a single bag is submitted) holding the
        softmax-normalised score of every class.
    """
    # Build the inputs on whatever device the model currently lives on, so the
    # call works without first moving the model.
    target_device = next(model.parameters()).device
    with torch.no_grad():
        # Explicit integer dtype: an empty token list would otherwise be
        # inferred as a float tensor, which cannot be used as indices.
        token_ids = torch.tensor(text_pipeline(text), dtype=torch.int64, device=target_device)
        # A single bag that starts at position 0 of `token_ids`.
        offsets = torch.zeros(1, dtype=torch.int64, device=target_device)
        logits = model(token_ids, offsets)
        probabilities = torch.softmax(logits, dim=1)
        return probabilities.cpu().numpy()

# Made-up sample text used to try the classifier on unseen input.
my_text = "Programmers everywhere rejoice! PyTorch 420.69 is here! With this release, we are excited to announce that PyTorch now supports the latest CUDA 10.1 and cuDNN 7.6.1. This release also includes a number of bug fixes and performance improvements. We have also added a new torch.utils.bottleneck module to help you identify performance bottlenecks in your code. Check out the release notes for a full list of changes."

# Move the model to the CPU for inference, then print the per-class probabilities.
model.to("cpu")
print(predict(my_text, text_pipeline))

In [None]:
# Create sample tensors for input: a flat tensor of token indices plus the
# start offset of the single bag it contains.
text = "The gooblegobers won last night in a close game. The final score was 42 to 41."
text = torch.tensor(text_pipeline(text))
offsets = torch.tensor([0])

# Export model with ONNX
# NOTE(review): the output path is an absolute, machine-specific location that
# the following cells also read from — consider a shared, relative path constant.
torch.onnx.export(
    model,
    (text, offsets),
    "/home/tren/dev/betterer/onnx/ag_news_model.onnx",
    export_params=True,
    opset_version=12,
    do_constant_folding=True,
    input_names=['text', 'offsets'],
    output_names=['label'],
    dynamic_axes={
        # `text` is the concatenation of all token ids, so its length is the
        # total token count rather than the batch size.
        'text': {0: 'num_tokens'},
        # One offset per sample: this is the axis that carries the batch size,
        # and it is the same axis the output rows follow.
        'offsets': {0: 'batch_size'},
        'label': {0: 'batch_size'},
    }
)


In [None]:
import onnx
# Reload the exported graph from disk and run ONNX's model checker on it.
onnx_model = onnx.load("/home/tren/dev/betterer/onnx/ag_news_model.onnx")
onnx.checker.check_model(onnx_model)
# Display the installed onnx version as the cell output.
onnx.__version__

In [None]:
import torch
import onnxruntime as ort
import numpy as np

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import AG_NEWS

# NOTE(review): this repeats the tokenizer/vocabulary setup from the earlier
# cell, including a second definition of yield_tokens — presumably so that this
# cell can run on its own for ONNX Runtime inference. Confirm, or factor the
# setup into a shared helper.
tokenizer = get_tokenizer('basic_english')
train_iter = AG_NEWS(split='train')

def yield_tokens(data_iter):
    """Yield ``tokenizer(text)`` for every ``(label, text)`` pair in ``data_iter``."""
    for _, text in data_iter:
        yield tokenizer(text)

# Token -> index vocabulary built from the training split; tokens missing from
# it resolve to the "<unk>" index.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])
# Raw string -> list of vocabulary indices.
text_pipeline = lambda x: vocab(tokenizer(x))

# Encode the sample sentence as token indices, submitted as a single bag.
text = "The gooblegobers won last night in a close game. The final score was 42 to 41."
text = torch.tensor(text_pipeline(text))
offsets = torch.tensor([0])  # the one bag starts at position 0

# Load the exported model and feed the inputs under the names given at export time.
ort_session = ort.InferenceSession("/home/tren/dev/betterer/onnx/ag_news_model.onnx")
outputs = ort_session.run(None, {"text": text.numpy(), "offsets": offsets.numpy()})

# First returned output: the model's raw class scores (no softmax is applied here).
results = outputs[0]
print(results)