In [None]:
import torch
import torchtext
from torchtext.datasets import text_classification
# Maximum n-gram order used when the dataset (and its vocabulary) is built.
NGRAMS = 3
import os
# Create the dataset root if it is missing. exist_ok=True replaces the
# original isdir()/mkdir() pair, which could race between check and create.
os.makedirs('./.data', exist_ok=True)
# Load the AG_NEWS train/test splits through torchtext's dataset registry,
# rooted at ./.data. vocab=None presumably lets torchtext build the vocabulary
# from the training split -- confirm against the installed torchtext version.
train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
    root='./.data', ngrams=NGRAMS, vocab=None)
# Mini-batch size shared by the training and evaluation DataLoaders.
BATCH_SIZE = 16
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Bare expression: the notebook renders the repr of the loaded training dataset.
train_dataset

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class TextSentiment(nn.Module):
    """Text classifier made of an ``nn.EmbeddingBag`` followed by one linear layer.

    Each bag of token ids is pooled into a single ``embed_dim`` vector by the
    EmbeddingBag (using its default pooling mode) and then projected to
    ``num_class`` scores.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        """Build the layers and apply the custom weight initialisation.

        Args:
            vocab_size: Number of rows in the embedding table.
            embed_dim: Width of each embedding vector.
            num_class: Number of output scores per bag.
        """
        super().__init__()
        # sparse=True makes the embedding table produce sparse gradients.
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw both weight matrices uniformly from [-0.5, 0.5) and zero the bias."""
        bound = 0.5
        # Embedding first, then the linear layer: same draw order as before,
        # so a fixed seed yields the same parameters.
        for weight in (self.embedding.weight, self.fc.weight):
            nn.init.uniform_(weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text, offsets):
        """Return one row of class scores per bag.

        Args:
            text: Token ids of all bags, passed straight to the EmbeddingBag.
            offsets: Start position of each bag inside ``text``.
        """
        pooled = self.embedding(text, offsets)
        scores = self.fc(pooled)
        return scores

In [None]:
# Model hyper-parameters derived from the loaded training dataset.
VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUM_CLASS = len(train_dataset.get_labels())
# The original cell misspelled this as NUN_CLASS; keep the old name as an
# alias so any cell that still reads it keeps working.
NUN_CLASS = NUM_CLASS
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUM_CLASS).to(device)

In [None]:
def generate_batch(batch):
    """Collate ``(label, tokens)`` samples into flat EmbeddingBag-style inputs.

    Args:
        batch: Sequence of entries where ``entry[0]`` is the class label and
            ``entry[1]`` is a tensor of token ids.

    Returns:
        tuple: ``(text, offsets, label)`` where ``text`` is every sample's
        tokens concatenated, ``offsets[i]`` is the index in ``text`` at which
        sample ``i`` starts, and ``label`` holds the class labels.
    """
    class_ids = torch.tensor([sample[0] for sample in batch])
    token_chunks = [sample[1] for sample in batch]

    # Start of sample i = total length of samples 0..i-1. Prepending 0 and
    # dropping the final length turns a running sum of lengths into starts,
    # e.g. lengths [3, 1, 2] -> [0, 3, 1] -> cumsum -> [0, 3, 4].
    chunk_sizes = [0]
    chunk_sizes.extend(len(chunk) for chunk in token_chunks)
    bag_starts = torch.cumsum(torch.tensor(chunk_sizes[:-1]), dim=0)

    flat_tokens = torch.cat(token_chunks)
    return flat_tokens, bag_starts, class_ids

In [None]:
from torch.utils.data import DataLoader

def train_func(sub_train_):
    """Run one training epoch over ``sub_train_`` and step the LR scheduler once.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion``,
    ``scheduler``, ``device``, ``BATCH_SIZE`` and ``generate_batch``.

    Args:
        sub_train_: Dataset to iterate over in shuffled mini-batches.

    Returns:
        tuple: ``(loss, accuracy)``. ``loss`` is the sum of the per-batch
        criterion values divided by the number of samples, so with a
        batch-averaging criterion it is not a per-sample mean. ``accuracy`` is
        the fraction of samples whose argmax prediction equals the label.
    """
    loader = DataLoader(
        sub_train_,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=generate_batch,
    )
    running_loss = 0
    n_correct = 0

    for text, offsets, cls in loader:
        optimizer.zero_grad()
        text = text.to(device)
        offsets = offsets.to(device)
        cls = cls.to(device)

        scores = model(text, offsets)
        batch_loss = criterion(scores, cls)
        running_loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

        hits = scores.argmax(1) == cls
        n_correct += hits.sum().item()

    # One scheduler step per epoch adjusts the learning rate.
    scheduler.step()

    n_samples = len(sub_train_)
    return running_loss / n_samples, n_correct / n_samples

def test(data_):
    """Evaluate the notebook-level ``model`` on ``data_`` without gradient tracking.

    Uses the notebook-level ``model``, ``criterion``, ``device``,
    ``BATCH_SIZE`` and ``generate_batch``.

    Args:
        data_: Dataset to evaluate, read in order in mini-batches.

    Returns:
        tuple: ``(loss, accuracy)`` as Python floats. ``loss`` is the sum of
        the per-batch criterion values divided by the number of samples (the
        same convention as ``train_func``); ``accuracy`` is the fraction of
        samples whose argmax prediction equals the label.
    """
    total_loss = 0.0
    total_correct = 0
    loader = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)

    with torch.no_grad():
        for text, offsets, cls in loader:
            text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
            output = model(text, offsets)
            # Keep the batch loss under its own name. The original assigned it
            # to the accumulator, so only the last batch's loss (doubled)
            # survived and a tensor was returned instead of a float.
            batch_loss = criterion(output, cls)
            total_loss += batch_loss.item()
            total_correct += (output.argmax(1) == cls).sum().item()

    return total_loss / len(data_), total_correct / len(data_)

In [None]:
import time
from torch.utils.data.dataset import random_split
N_EPOCHS = 5
# NOTE(review): min_valid_loss is initialised here but never read or updated
# in this cell; kept in case another cell relies on it.
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
# step_size=1, gamma=0.9: every scheduler.step() call (train_func issues one
# per epoch) multiplies the learning rate by 0.9.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

# Hold out 5% of the training set for validation.
train_len = int(len(train_dataset) * 0.95)
sub_train_, sub_valid_ = random_split(train_dataset, [train_len, len(train_dataset) - train_len])

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = test(sub_valid_)

    # divmod yields integer minutes and leftover seconds. The original
    # `secs / 60` produced a float that only printed correctly because
    # %d truncates it.
    mins, secs = divmod(int(time.time() - start_time), 60)

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

In [None]:
# Final evaluation: run test() on the separate test split and print the loss
# and accuracy it returns.
print('Checking the results of test dataset...')
test_loss, test_acc = test(test_dataset)
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

In [None]:
import re
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer

# Display names keyed by the 1-based class id that predict() returns
# (argmax index + 1).
ag_news_label = {1 : "World",
                 2 : "Sports",
                 3 : "Business",
                 4 : "Sci/Tec"}

def predict(text, model, vocab, ngrams):
    """Classify one raw text string and return its 1-based class id.

    Args:
        text: Raw input string; tokenised with torchtext's ``basic_english``
            tokenizer.
        model: Callable taking ``(token_ids, offsets)`` tensors and returning
            a ``(1, num_class)`` score tensor, e.g. the trained
            ``TextSentiment``.
        vocab: Lookup from n-gram string to integer id, indexed as
            ``vocab[token]``.
        ngrams: Maximum n-gram order handed to ``ngrams_iterator``. It should
            match the order the vocabulary and model were built with.

    Returns:
        int: Index of the highest score plus one.
    """
    tokenizer = get_tokenizer("basic_english")

    # Build the inputs on the device the model lives on, so inference works
    # without first moving the model to the CPU. Plain callables, or modules
    # without parameters, fall back to the CPU as before.
    params = model.parameters() if hasattr(model, "parameters") else ()
    first_param = next(iter(params), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    token_ids = [vocab[token] for token in ngrams_iterator(tokenizer(text), ngrams)]

    with torch.no_grad():
        # The explicit long dtype matters for an empty token list:
        # torch.tensor([]) would default to float32, which is not a valid
        # index dtype for the embedding lookup.
        text = torch.tensor(token_ids, dtype=torch.long, device=target_device)
        # A single bag that starts at position 0 and spans the whole input.
        offsets = torch.tensor([0], dtype=torch.long, device=target_device)
        output = model(text, offsets)
        return output.argmax(1).item() + 1

ex_text_str = "So no one has won yet?No … The race goes on. There is another set of primaries next week which Sanders hopes will give him another boost.But it may be that Sanders and Biden now slug it out state by state, week after week, all the way to the Democratic convention in July.If no candidate has got a majority of delegates by the time of the convention, party bigwigs get to play a role. Most of them are terrified of Sanders winning and they are likely to give the crown to Biden – even if Sanders is ahead at that point.If that happens, Sanders’ supporters will be absolutely furious and many may refuse to vote for Biden in the general election.What happened to Mike Bloomberg?The former New York mayor has spent about half a billion dollars on his campaign ads so far – a record-breaking amount. But he’s worth $60bn. To put that in context, that’s the equivalent of someone whose net worth is $10 grand, say, spending $1 on the election. It’s nowhere close to breaking the bank for him.That said, there will be a great deal of pressure on him now to give way and back Biden too. He won his first two elections as mayor of New York as a Republican and he is not a fan of Sanders and his socialist platform, to put it mildly.He is apparently planning to hunker down with his advisers on Wednesday and decide what to do next.What about Elizabeth Warren?On paper Warren could have been the ideal candidate to unite the liberal and centrist wings of the party but she only really cut through in her powerful attacks on Bloomberg. She has been squeezed out by Sanders on the left, and it’s hard to escape the conclusion that sexism has also played a role.Her campaign suggested last week that she would keep fighting and hope to be handed the crown at the convention if Sanders fails to win a majority of delegates. 
But that may be untenable and after coming third in her home state this is probably the end of the road for her.The big question is whether she will throw her weight behind Sanders or Biden. Liberals have often fantasised about Sanders making Warren his vice-president and you’ll hear a lot about that again now.Whether he will or not I don’t know, but I’d say it’s a near-certainty that both Sanders and Biden intend to choose a female VP if they get that far.What does Trump think?Donald Trump has shown a keen interest in the Democratic race and kept up a running commentary on Twitter. He constantly tears into Biden and Bloomberg on Twitter, but is much kinder to Sanders – stirring the pot by claiming that the Democratic establishment is rigging the race against the Vermont socialist. It seems clear that he wants Sanders to win the race and thinks he’d beat him.So when will the Democrats decide on a nominee?The primaries continue next week with contests in Michigan, Mississippi and Washington state, among other places. But this is looking like a battle between Bernie and Biden that will go down to the wire – so we may still be talking about this contest in July."

vocab = train_dataset.get_vocab()
# Run the demo inference on the CPU. nn.Module.to() moves the module in
# place, so move the model back to `device` before re-running the training or
# evaluation cells on a GPU.
model = model.to("cpu")

# Pass NGRAMS, not a hard-coded 2: the dataset and vocabulary were built with
# ngrams=NGRAMS, so inference must produce the same n-gram features.
print("This is a %s news" %ag_news_label[predict(ex_text_str, model, vocab, NGRAMS)])

In [None]:
# Display the raw 1-based class id. NGRAMS (not a hard-coded 2) keeps the
# n-gram order identical to the one the dataset and vocabulary were built with.
predict(ex_text_str, model, vocab, NGRAMS)