In [9]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.data import Field, BucketIterator, TabularDataset
import pandas as pd
import model as MT
import string
import numpy as np
import re

%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
df = pd.read_csv('data/traindf.csv')
df.head()

Unnamed: 0,id,content
0,17283,WASHINGTON — Congressional Republicans have...
1,17284,"After the bullet shells get counted, the blood..."
2,17285,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,17286,"Death may be the great equalizer, but it isn’t..."
4,17287,"SEOUL, South Korea — North Korea’s leader, ..."


In [5]:
df.shape

(114056, 2)

In [10]:
TEXT = torchtext.data.Field(tokenize=get_tokenizer("basic_english"),
                            init_token='<sos>',
                            eos_token='<eos>',
                            lower=True)

In [12]:
data_text = TabularDataset(
    path='data/valdf.csv', format='csv',
    fields=[('id', None),('content',TEXT)])

In [30]:
data_text

NameError: name 'data_text' is not defined

In [13]:
TEXT.build_vocab(data_text, vectors="glove.6B.100d")

In [14]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
data_list = []
for example in data_text.examples:
    data_list.extend(example.content)


Starting from sequential data, the batchify() function arranges the dataset into columns, trimming off any tokens remaining after the data has been divided into batches of size batch_size. For instance, with the alphabet as the sequence (total length of 26) and a batch size of 4, we would divide the alphabet into 4 sequences of length 6:

In [16]:
def batchify(data, bsz):
    data = TEXT.numericalize([data])
    # Divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

batch_size = 64
batched_data = batchify(data_list, batch_size)

In [18]:
batched_data.shape

torch.Size([270543, 64])

In [19]:
bptt = 35
def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

In [20]:
data,target = get_batch(batched_data, 1)
ou = [TEXT.vocab.itos[i] for i in data[0,:10]]
ou

['hillary', 'can', 'here', 'an', 'than', '.', 'tuna', ')', 'is', 'being']

In [12]:
batched_data.shape

torch.Size([14662, 64])

In [488]:
ntokens = len(TEXT.vocab) # the size of vocabulary
emsize = 100 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
model = MT.TransformerModel(ntokens, emsize, nhead, nhid, nlayers, TEXT.vocab, dropout).to(device)

In [493]:
criterion = nn.CrossEntropyLoss()
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

import time
def train():
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    ntokens = len(TEXT.vocab)
    for batch, i in enumerate(range(0, batched_data.size(0) - 1, bptt)):
        data, targets = get_batch(batched_data, i)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 25
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(batched_data) // bptt, scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

def evaluate(eval_model, data_source):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    ntokens = len(TEXT.vocab.stoi)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output = eval_model(data)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

In [494]:
best_val_loss = float("inf")
epochs = 3 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    if epoch % 3 == 0:
        torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': criterion,
                'n_layers': nlayers,
                'n_hidden':nhid,
                'vocab':TEXT.vocab
                }, f'./model_{epoch}_transformer.net')
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

| epoch   1 |    25/  418 batches | lr 5.00 | ms/batch 1765.47 | loss  9.06 | ppl  8568.35
| epoch   1 |    50/  418 batches | lr 5.00 | ms/batch 1669.00 | loss  8.46 | ppl  4708.56
| epoch   1 |    75/  418 batches | lr 5.00 | ms/batch 1672.68 | loss  8.15 | ppl  3465.62
| epoch   1 |   100/  418 batches | lr 5.00 | ms/batch 1650.22 | loss  8.07 | ppl  3195.46
| epoch   1 |   125/  418 batches | lr 5.00 | ms/batch 1687.56 | loss  7.82 | ppl  2489.16
| epoch   1 |   150/  418 batches | lr 5.00 | ms/batch 1687.81 | loss  7.60 | ppl  1997.17
| epoch   1 |   175/  418 batches | lr 5.00 | ms/batch 1639.03 | loss  7.49 | ppl  1798.03
| epoch   1 |   200/  418 batches | lr 5.00 | ms/batch 1631.71 | loss  7.45 | ppl  1726.74
| epoch   1 |   225/  418 batches | lr 5.00 | ms/batch 1637.90 | loss  7.25 | ppl  1401.46
| epoch   1 |   250/  418 batches | lr 5.00 | ms/batch 1641.62 | loss  7.19 | ppl  1329.42
| epoch   1 |   275/  418 batches | lr 5.00 | ms/batch 1665.08 | loss  7.07 | ppl  1177.61

In [495]:
def sample(model,input_st):
    data = TEXT.numericalize([input_st.split(' ')])
    data.to(device)
    with torch.no_grad():
        pred = model(data)
        out = torch.topk(pred, 1)
        return out

In [496]:
vocab = TEXT.vocab.itos
out = sample(model,'hey')

In [497]:

def generate(model, input_sent, length_of_par):
    out_sent = []
    model.eval()
    for i in range(length_of_par):
        out = sample(model,input_sent)
        next_word = TEXT.vocab.itos[out.indices[-1]]
        input_sent += ' ' + next_word

    return input_sent
    

In [13]:
output = generate(model,'I like cucumber', 100)

NameError: name 'generate' is not defined

In [499]:
output

'United states and other countries were not in the united states and other countries and other countries were not going to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be able to be the first time of the first time of the first time of the first time of the first time of the first time'