In [1]:
!pip install torchtext==0.8.0 --upgrade

Collecting torchtext==0.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/26/8a/e09b9b82d4dd676f17aa681003a7533765346744391966dec0d5dba03ee4/torchtext-0.8.0-cp37-cp37m-manylinux1_x86_64.whl (6.9MB)
[K     |████████████████████████████████| 7.0MB 4.7MB/s 
Installing collected packages: torchtext
  Found existing installation: torchtext 0.3.1
    Uninstalling torchtext-0.3.1:
      Successfully uninstalled torchtext-0.3.1
Successfully installed torchtext-0.8.0


In [4]:
import torch, torchtext
from torch import nn, optim, functional as F
from tqdm.auto import tqdm

In [5]:
# Load the AG_NEWS train/test splits, keeping downloaded files under `data`;
# ngrams=1 means single tokens only.
train_ds, test_ds = torchtext.datasets.text_classification.DATASETS['AG_NEWS'](
    root='data',
    ngrams=1,
    vocab=None,
)

ag_news_csv.tar.gz: 11.8MB [00:00, 59.5MB/s]
120000lines [00:05, 23869.48lines/s]
120000lines [00:09, 12233.65lines/s]
7600lines [00:00, 11393.19lines/s]


## Vocab:


In [8]:
# Vocabulary object attached to the training dataset; its length is the number of known tokens.
vb = train_ds.get_vocab()
len(vb)

95812

In [9]:
# First ten entries of the index -> token list.
vb.itos[:10]

['<unk>', '<pad>', '.', 'the', ',', 'to', 'a', 'of', 'in', 'and']

In [10]:
# Token -> index lookup for a single word.
vb.stoi['security']

100

## Examining dataset:


In [13]:
# Mapping from label id to category name for AG_NEWS.
torchtext.datasets.text_classification.LABELS['AG_NEWS']

{0: 'World', 1: 'Sports', 2: 'Business', 3: 'Sci/Tech'}

In [12]:
# Set of label ids that occur in the test split.
test_ds.get_labels()

{0, 1, 2, 3}

In [14]:
# A single example is a (label id, tensor of token ids) pair.
test_ds[100]

(1,
 tensor([ 314,  611,   12,  197,    4, 8234,   31,  705,  477,  198,  640,   33,
          250,    5,   33,  183,   13,   10,   48,  814, 3328,  314, 2730,  708,
           11,   58,    4,  224,    8,    3,  730, 1152,   31, 1593,    7, 3979,
           13,   10,  860,  822, 1359,    3,   48,  814, 7059,   30,    3,   89,
         1667, 5573,    2]))

In [15]:
# Turn the token ids of test example 100 back into words via the index -> token list.
' '.join(vb.itos[idx] for idx in test_ds[100][1])

'olympic history for india , uae an indian army major shot his way to his country #39 s first ever individual olympic silver medal on tuesday , while in the same event an member of dubai #39 s ruling family became the first ever medallist from the united arab emirates .'

## Embeddings

Testing this out by creating an embedding table for 10 words, where each word maps to a 3-dimensional vector.

In [16]:
# Lookup table with 10 rows of 3 values each. The weights are randomly initialised and no
# seed is set, so the numbers shown below will differ between runs.
emb = nn.Embedding(10, 3)

In [17]:
# Index 3 is requested twice, so the last two output rows are identical.
emb(torch.tensor([1, 2, 3, 3], dtype=torch.long))

tensor([[ 0.5790,  0.6583,  0.4305],
        [-0.6313,  1.4892, -1.1207],
        [ 0.1219, -1.4118,  0.7932],
        [ 0.1219, -1.4118,  0.7932]], grad_fn=<EmbeddingBackward>)

## Model


In [18]:
class TextCats(nn.Module):
  """Per-token classifier: an embedding lookup followed by a linear layer.

  Args:
    n_words: number of rows in the embedding table.
    emb_dim: width of each embedding vector.
    n_cats: number of scores produced by the linear layer.
  """

  def __init__(self, n_words, emb_dim, n_cats):
    super().__init__()
    self.embedding = nn.Embedding(n_words, emb_dim)
    self.fc = nn.Linear(emb_dim, n_cats)
    # Overwrite both weight matrices with Xavier-uniform values
    # (embedding first, then the linear layer).
    for layer in (self.embedding, self.fc):
      nn.init.xavier_uniform_(layer.weight.data)

  def forward(self, text):
    """Return one `n_cats`-sized score vector for every token id in `text`."""
    token_vectors = self.embedding(text)
    scores = self.fc(token_vectors)
    return scores

Testing the model out:

In [19]:
# One embedding row per vocabulary entry, 32-dimensional embeddings, 4 output categories.
model = TextCats(len(vb), 32, 4)

In [21]:
# Three token ids in, three rows of scores out (one per token).
model(torch.tensor([1, 2, 3], dtype=torch.long))

tensor([[-0.1177,  0.0067, -0.0342, -0.1020],
        [-0.1141, -0.0030, -0.0303, -0.1025],
        [-0.0974,  0.0164, -0.0458, -0.1074]], grad_fn=<AddmmBackward>)

In [22]:
# Shape check: (number of tokens, number of categories).
model(torch.tensor([1, 2, 3], dtype=torch.long)).shape

torch.Size([3, 4])

This model gives us one output vector per word, but we want a single vector that summarizes the whole input.

## Model 2: with the fix (average the word embeddings)

In [25]:
class TextCats(nn.Module):
    """Text classifier that averages token embeddings before the linear layer.

    Args:
        n_words: number of rows in the embedding table.
        emb_dim: width of each embedding vector.
        n_cats: number of scores produced by the linear layer.
    """

    def __init__(self, n_words, emb_dim, n_cats):
        super().__init__()
        self.embedding = nn.Embedding(n_words, emb_dim)
        self.fc = nn.Linear(emb_dim, n_cats)
        # Overwrite both weight matrices with Xavier-uniform values
        # (embedding first, then the linear layer).
        for layer in (self.embedding, self.fc):
            nn.init.xavier_uniform_(layer.weight.data)

    def forward(self, text):
        """Average the embeddings over the second-to-last axis, then score the result."""
        token_vectors = self.embedding(text)
        pooled = token_vectors.mean(dim=-2)
        return self.fc(pooled)

In [26]:
# Re-create the model from the redefined class above (same sizes as before).
model = TextCats(len(vb), 32, 4)

In [27]:
# Three token ids in, a single vector of category scores out.
model(torch.tensor([1, 2, 3], dtype=torch.long))

tensor([ 0.1700,  0.1413,  0.1177, -0.0101], grad_fn=<AddBackward0>)

## Training code

In [33]:
# Device that the model, the loss and every batch are moved to; this section trains on the CPU.
device = torch.device('cpu')

In [32]:
def run_test(model, ds, crit):
  """Evaluate `model` on `ds` without updating any weights.

  Examples are served by a `DataLoader` with default settings and moved to
  the module-level `device` before the forward pass.

  Args:
    model: module called on the text tensor of each step.
    ds: dataset yielding (label, text) pairs.
    crit: loss function called as `crit(outputs, labels)`.

  Returns:
    Tuple `(summed loss / len(ds), correct predictions / len(ds))`.
  """
  model.eval()
  loss_sum = 0
  n_correct = 0
  loader = torch.utils.data.DataLoader(ds)
  for labs, text in tqdm(loader, leave=False, desc='test_iter'):
    labs = labs.to(device)
    text = text.to(device)
    # Evaluation only, so gradient tracking is switched off.
    with torch.no_grad():
      outs = model(text)
      loss_sum += crit(outs, labs).item()
      hits = outs.argmax(1) == labs
      n_correct += hits.sum().item()
  n_examples = len(ds)
  return loss_sum / n_examples, n_correct / n_examples

In [38]:
def run_train(model, ds, crit, opt, sched):
  """Run one training pass over `ds` and step the learning-rate scheduler once.

  Examples are served in shuffled order by a `DataLoader` and moved to the
  module-level `device` before the forward pass.

  Args:
    model: module being trained.
    ds: dataset yielding (label, text) pairs.
    crit: loss function called as `crit(outputs, labels)`.
    opt: optimizer holding the model parameters.
    sched: scheduler whose `step()` is called once after the pass.

  Returns:
    Tuple `(summed loss / len(ds), correct predictions / len(ds))`.
  """
  model.train()
  loss_sum = 0
  n_correct = 0
  loader = torch.utils.data.DataLoader(ds, shuffle=True)
  for labs, txts in tqdm(loader, leave=False, desc='train iter'):
    # Clear gradients left over from the previous step.
    opt.zero_grad()
    labs = labs.to(device)
    txts = txts.to(device)
    outs = model(txts)
    loss = crit(outs, labs)
    loss.backward()
    opt.step()
    loss_sum += loss.item()
    hits = outs.argmax(1) == labs
    n_correct += hits.sum().item()
  # The schedule advances once per pass over the data, not once per step.
  sched.step()
  n_examples = len(ds)
  return loss_sum / n_examples, n_correct / n_examples



In [34]:
def run_all(model, test_ds, train_ds, crit, opt, sched, n_epochs=10):
    """Alternate a training pass and an evaluation pass for `n_epochs` epochs.

    After every epoch a one-line summary with train/test loss and accuracy is
    written through `tqdm.write`. Note that `test_ds` precedes `train_ds` in
    the parameter list. Nothing is returned.
    """
    epoch_bar = tqdm(range(n_epochs), desc='epochs')
    for epoch in epoch_bar:
        train_loss, train_acc = run_train(model, train_ds, crit, opt, sched)
        test_loss, test_acc = run_test(model, test_ds, crit)
        summary = f'epoch {epoch}   train loss {train_loss:.6f} acc {train_acc:.4f}   test loss {test_loss:.6f} acc {test_acc:.4f}'
        tqdm.write(summary)

In [35]:
# Fresh model so training starts from newly initialised weights.
model = TextCats(len(vb), 32, 4)
# The trailing ';' suppresses the module repr in the cell output.
model.to(device);

In [36]:
# Loss, optimizer and learning-rate schedule for the model created above.
crit = nn.CrossEntropyLoss().to(device)
opt = optim.SGD(model.parameters(), lr=1.0)
# StepLR with step_size=1 and gamma=0.1: every sched.step() call multiplies the learning rate by 0.1.
sched = optim.lr_scheduler.StepLR(opt, 1, gamma=0.1)

## Training

In [39]:
# Train for 3 epochs. NOTE: this run was interrupted by hand (see the KeyboardInterrupt
# in the output below) — presumably because it was too slow; the next section swaps in EmbeddingBag.
run_all(model, test_ds, train_ds, crit, opt, sched, 3)

HBox(children=(FloatProgress(value=0.0, description='epochs', max=3.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='train iter', max=120000.0, style=ProgressStyle(descriptio…

KeyboardInterrupt: ignored

## Faster Model with EmbeddingBag

In [45]:
class TextCats(nn.Module):
  """Text classifier built on `nn.EmbeddingBag` in 'mean' mode with sparse gradients.

  Args:
    n_words: number of rows in the embedding table.
    emb_dim: width of each embedding vector.
    n_cats: number of scores produced by the linear layer.
  """

  def __init__(self, n_words, emb_dim, n_cats):
    super().__init__()
    self.embedding = nn.EmbeddingBag(n_words, emb_dim, mode='mean', sparse=True)
    self.fc = nn.Linear(emb_dim, n_cats)
    # Overwrite both weight matrices with Xavier-uniform values
    # (embedding first, then the linear layer).
    for layer in (self.embedding, self.fc):
      nn.init.xavier_uniform_(layer.weight.data)

  def forward(self, text):
    """Score the averaged embedding that the bag layer produces for `text`."""
    pooled = self.embedding(text)
    return self.fc(pooled)

In [46]:
# Fresh model built from the EmbeddingBag version of the class defined above.
model = TextCats(len(vb), 32, 4)
model.to(device)

TextCats(
  (embedding): EmbeddingBag(95812, 32, mode=mean)
  (fc): Linear(in_features=32, out_features=4, bias=True)
)

In [48]:
# The optimizer and scheduler must be re-created so they track the new model's parameters.
crit = nn.CrossEntropyLoss().to(device)
opt = optim.SGD(model.parameters(), lr = 1.0)
# StepLR with step_size=1 and gamma=0.1: every sched.step() call multiplies the learning rate by 0.1.
sched = optim.lr_scheduler.StepLR(opt, 1, gamma=0.1)

In [49]:
# Train the EmbeddingBag model for 3 epochs; a train/test summary is printed after each one.
run_all(model, test_ds, train_ds, crit, opt, sched, 3)

HBox(children=(FloatProgress(value=0.0, description='epochs', max=3.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='train iter', max=120000.0, style=ProgressStyle(descriptio…

HBox(children=(FloatProgress(value=0.0, description='test_iter', max=7600.0, style=ProgressStyle(description_w…

epoch 0   train loss 0.443637 acc 0.8517   test loss 0.412031 acc 0.8647


HBox(children=(FloatProgress(value=0.0, description='train iter', max=120000.0, style=ProgressStyle(descriptio…

HBox(children=(FloatProgress(value=0.0, description='test_iter', max=7600.0, style=ProgressStyle(description_w…

epoch 1   train loss 0.222589 acc 0.9274   test loss 0.296008 acc 0.9013


HBox(children=(FloatProgress(value=0.0, description='train iter', max=120000.0, style=ProgressStyle(descriptio…

HBox(children=(FloatProgress(value=0.0, description='test_iter', max=7600.0, style=ProgressStyle(description_w…

epoch 2   train loss 0.202932 acc 0.9331   test loss 0.286375 acc 0.9103



## Model with batching (GPU)