# Imports and argument declarations

In [3]:
# coding: utf-8
import argparse
import time
import math
import os
import torch
import torch.nn as nn
import torch.onnx
import data
import model
import torch.optim as optim

In [4]:

class Args:
    """Plain container of run settings, read elsewhere as ``args.<name>``."""

    # Corpus directory handed to data.Corpus.
    data = './data/wikitext-2'
    model = 'FNNModel'
    # Embedding size, context window and hidden width passed to FNNModel.
    emsize = 200
    context_size = 8
    nhid = 200
    nlayers = 2
    lr = 20
    clip = 0.25
    epochs = 40
    # Column count batchify uses for the training split.
    batch_size = 8
    # Upper bound on the number of rows get_batch returns per chunk.
    bptt = 35
    dropout = 0.2
    tied = True
    # Seed given to torch.manual_seed.
    seed = 1111
    # Whether the CUDA device is requested.
    cuda = True
    log_interval = 200
    save = 'model.pt'
    onnx_export = ''
    nhead = 2
    dry_run = True


# Single shared settings instance used by the rest of the notebook.
args = Args()

In [5]:
# Seed PyTorch's RNG so repeated runs start from the same state.
torch.manual_seed(args.seed)

# Query CUDA availability once; it drives both the warnings and the device choice.
_cuda_available = torch.cuda.is_available()
if _cuda_available and not args.cuda:
    print("WARNING: You have a CUDA device, so you should probably run with --cuda")
if args.cuda and not _cuda_available:
    # Requesting CUDA on a machine without it would make every later
    # `.to(device)` call fail, so fall back to the CPU instead.
    print("WARNING: CUDA was requested but is not available, falling back to CPU")

# Device that tensors and the model are moved to further down.
device = torch.device("cuda" if args.cuda and _cuda_available else "cpu")



# Loading the data

In [6]:
###############################################################################
# Load data
###############################################################################

corpus = data.Corpus(args.data)

# Starting from sequential data, batchify (defined below) trims the sequence to
# a multiple of the batch size and lays it out row by row.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a b c d ┐
# │ e f g h │
# │ i j k l │
# │ m n o p │
# │ q r s t │
# └ u v w x ┘.
# Each row is a window of consecutive tokens, and get_batch uses the first
# token of the following row as that row's target. The rows do not overlap,
# which means that windows starting inside a row (e. g. predicting 'f' from
# 'b c d e') are never seen by the model, but allows more efficient batch
# processing.


In [7]:
# Reshape the token sequence into rows of bsz consecutive tokens and move it to the device.

def batchify(data, bsz):
    """Reshape ``data`` into a matrix with ``bsz`` columns on ``device``.

    Elements past the last full row (along dimension 0) are dropped, the
    remainder is viewed as ``(-1, bsz)`` and the result is moved to the
    module-level ``device``.
    """
    # Number of complete rows of width bsz that fit in the data.
    full_rows = data.size(0) // bsz
    # Drop the trailing remainder that would not fill a whole row.
    trimmed = data[: full_rows * bsz]
    grid = trimmed.view(-1, bsz).contiguous()
    return grid.to(device)

# Column count used when reshaping the validation and test splits.
eval_batch_size = 8
# Reshape each corpus split with batchify; the second argument sets the
# number of columns of the resulting matrix.
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

# Declaring data loader functions 

In [8]:
# Slice a chunk of rows from `source` together with the matching targets.
def get_batch(source, i):
    """Return ``(data, target)`` for the chunk of ``source`` starting at row ``i``.

    ``data`` holds up to ``args.bptt`` rows starting at ``i`` (fewer near the
    end, so that a following row always exists). ``target`` is a flat tensor
    holding the first column of each row shifted down by one, i.e. the first
    element of the row that follows each row in ``data``.
    """
    # Leave one row after the chunk so every row has a successor to draw a target from.
    seq_len = min(args.bptt, len(source) - 1 - i)
    stop = i + seq_len
    data = source[i:stop]
    shifted = source[i + 1:stop + 1]
    # Keep only column 0 of the shifted rows, then flatten it.
    first_column = shifted.narrow(1, 0, 1)
    target = first_column.contiguous().view(-1)
    return data, target

In [9]:
# Fetch the first training chunk and its targets for inspection.
# NOTE(review): this rebinds the module-level name `data`, which shadows the
# imported `data` module (used above as data.Corpus) for any code run afterwards.
data, target = get_batch(train_data, 0)

In [10]:
# Display the rows returned by get_batch for the first chunk.
print(data)

tensor([[  0,   1,   2,   3,   4,   1,   0,   0],
        [  5,   6,   2,   7,   8,   9,   3,  10],
        [ 11,   8,  12,  13,  14,  15,   2,  16],
        [ 17,  18,   7,  19,  13,  20,  21,  22],
        [ 23,   2,   3,   4,  24,  25,  13,  26],
        [ 27,  28,  29,  30,  31,  32,  33,  34],
        [ 35,  36,  37,  38,  39,  17,  40,  41],
        [ 15,  42,  43,  44,  45,  43,  25,  13],
        [ 46,  26,  17,  47,  33,  43,  17,   2],
        [ 48,  15,   9,  17,  49,  50,  16,  28],
        [ 37,  51,  30,  52,  53,  23,  54,  55],
        [ 13,  17,  56,  57,  58,  22,  17,  59],
        [ 33,  37,  60,  17,  61,  62,  61,  13],
        [ 27,  63,  64,  65,  66,  17,  67,  16],
        [ 68,  69,  17,  70,  71,  72,  73,  74],
        [ 75,  76,  77,  37,  78,  79,  80,  17],
        [ 81,  65,  61,   9,  82,  61,  15,   0],
        [ 83,  33,  84,  85,  43,  86,  13,  87],
        [ 88,  27,  89,  90,  16,  17,  91,  92],
        [ 93,   2,   3,  94,  15,  95,  46,  96],


In [11]:
# Display the flat target tensor returned by get_batch for the first chunk.
print(target)

tensor([  5,  11,  17,  23,  27,  35,  15,  46,  48,  37,  13,  33,  27,  68,
         75,  81,  83,  88,  93,  17,  99,  17, 107, 113,   2,  89,  83, 133,
        138, 113,  13, 147,  15, 147, 159], device='cuda:0')


# Building the model

In [12]:
# creating our FNN model 

# Feed-forward n-gram neural network language model
class FNNModel(nn.Module):
    """Feed-forward language model over a fixed-size window of token ids.

    The embeddings of the ``context_size`` tokens of each example are
    concatenated, passed through one tanh hidden layer with ``h`` units and
    projected (without bias) to ``vocab_size`` scores, which are returned as
    log-probabilities.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, h):
        """Create the embedding table, the hidden layer and the decoder.

        Args:
            vocab_size: number of rows of the embedding table and width of
                the output.
            embedding_dim: size of each token embedding.
            context_size: number of tokens concatenated per example.
            h: number of units in the hidden layer.
        """
        super().__init__()
        self.context_size = context_size
        self.embedding_dim = embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The hidden layer consumes all context embeddings side by side.
        self.linear1 = nn.Linear(context_size * embedding_dim, h)
        # Decoder: maps the hidden units to one score per vocabulary entry.
        self.linear2 = nn.Linear(h, vocab_size, bias=False)

    def forward(self, inputs):
        """Return log-probabilities of shape ``(rows, vocab_size)``.

        ``inputs`` holds token ids; their embeddings are flattened so that
        each output row is built from ``context_size`` consecutive ids.
        """
        flat_width = self.context_size * self.embedding_dim
        # x': concatenation of the context embeddings of every example
        concatenated = self.embeddings(inputs).view(-1, flat_width)
        # h = tanh(W_1 . x' + b)
        hidden = torch.tanh(self.linear1(concatenated))
        # W_2 . h, then log_softmax over the vocabulary dimension
        scores = self.linear2(hidden)
        return nn.functional.log_softmax(scores, dim=1)

# Declaring the helper functions  

In [13]:
# helper function to get accuracy from log probabilities
def get_accuracy_from_log_probs(log_probs, labels):
    """Return the fraction of rows whose highest-scoring class equals its label.

    Args:
        log_probs: 2-D tensor of per-class log-probabilities (classes along
            dimension 1).
        labels: tensor of class indices, one per row of ``log_probs``.

    Returns:
        A 0-dim float tensor holding the mean of the per-row matches.
    """
    # exp() is monotonic, so the argmax of the log-probabilities already is
    # the predicted class. Exponentiating first can underflow very negative
    # rows to all zeros (making argmax pick index 0) and allocates a full
    # temporary for nothing.
    predicted_label = torch.argmax(log_probs, dim=1)
    return (predicted_label == labels).float().mean()



# helper function to evaluate model on dev data
def evaluate(model, criterion, dataloader, gpu):
    """Return ``(mean_acc, mean_loss)`` of ``model`` over ``dataloader``.

    Every item yielded by ``dataloader`` is indexed as a 2-D tensor whose
    leading columns are the context tokens and whose next column is the
    target token. The number of context columns is ``model.context_size``
    when the model exposes that attribute and 2 otherwise (the width that
    used to be hard-coded here).

    Args:
        model: module returning per-class log-probabilities for a context
            tensor; it is switched to eval mode.
        criterion: loss called as ``criterion(log_probs, target)``.
        dataloader: iterable of batches laid out as described above.
        gpu: where to run. An int selects that CUDA device, a
            ``torch.device`` or device string is used as given, and ``None``
            uses the device of the model's first parameter.

    Returns:
        Tuple of the per-batch accuracy and per-batch loss, each averaged
        over the number of batches.

    Raises:
        ValueError: if ``dataloader`` yields no batches.
    """
    # Keep the slicing in step with the model instead of assuming a trigram.
    context_size = getattr(model, "context_size", 2)

    if gpu is None:
        first_param = next(model.parameters(), None)
        target_device = first_param.device if first_param is not None else torch.device("cpu")
    elif isinstance(gpu, int):
        target_device = torch.device("cuda", gpu)
    else:
        target_device = torch.device(gpu)

    model.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        dev_st = time.time()
        for it, data_tensor in enumerate(dataloader):
            context_tensor = data_tensor[:, :context_size].to(target_device)
            target_tensor = data_tensor[:, context_size].to(target_device)
            log_probs = model(context_tensor)
            mean_loss += criterion(log_probs, target_tensor).item()
            mean_acc += get_accuracy_from_log_probs(log_probs, target_tensor)
            count += 1
            if it % 500 == 0:
                print("Dev Iteration {} complete. Mean Loss: {}; Mean Acc:{}; Time taken (s): {}".format(it, mean_loss / count, mean_acc / count, (time.time()-dev_st)))
                dev_st = time.time()

    if count == 0:
        # Without this guard the averages below divide by zero.
        raise ValueError("evaluate() received an empty dataloader")

    return mean_acc / count, mean_loss / count


# Training the model

In [15]:
# Using negative log-likelihood loss (the model already returns log-probabilities)
loss_function = nn.NLLLoss()

# create model
ntokens = len(corpus.dictionary)
model = FNNModel(ntokens, args.emsize, args.context_size, args.nhid).to(device)

# using ADAM optimizer
# NOTE(review): the learning rate is fixed here; args.lr is not used.
optimizer = optim.Adam(model.parameters(), lr = 2e-3)


def _evaluate_on_split(net, criterion, source):
    """Return ``(accuracy, loss)`` of ``net`` over a batchified split.

    ``source`` is walked in chunks of ``args.bptt`` rows with ``get_batch``,
    exactly like the training loop, so contexts and targets have the layout
    the model is trained on. Both results are averaged per target token.
    Leaves ``net`` in eval mode.

    Raises:
        ValueError: if ``source`` is too short to yield a single target.
    """
    net.eval()
    loss_sum, acc_sum, n_targets = 0.0, 0.0, 0
    with torch.no_grad():
        for offset in range(0, source.size(0) - 1, args.bptt):
            context, target = get_batch(source, offset)
            log_probs = net(context)
            n = target.numel()
            # Weight by chunk size: the last chunk can be shorter than the rest.
            loss_sum += criterion(log_probs, target).item() * n
            acc_sum += get_accuracy_from_log_probs(log_probs, target).item() * n
            n_targets += n
    if n_targets == 0:
        raise ValueError("evaluation split contains no targets")
    return acc_sum / n_targets, loss_sum / n_targets


# ------------------------- TRAIN & SAVE MODEL ------------------------
best_acc = 0
best_model_path = None
# NOTE(review): the epoch count is fixed here; args.epochs is not used.
for epoch in range(5):
    # Evaluation at the end of the previous epoch left the model in eval mode.
    model.train()
    st = time.time()
    print("\n--- Training model Epoch: {} ---".format(epoch+1))
    # `offset` is the first row of each chunk handed to get_batch.
    for it, offset in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        context_tensor, target_tensor = get_batch(train_data, offset)

        # No-op when the data already lives on `device` (batchify moves it there).
        context_tensor, target_tensor = context_tensor.to(device), target_tensor.to(device)

        # zero out the gradients from the old instance
        model.zero_grad()

        # get log probabilities over next words
        log_probs = model(context_tensor)

        # calculate current accuracy
        acc = get_accuracy_from_log_probs(log_probs, target_tensor)

        # compute loss function
        loss = loss_function(log_probs, target_tensor)

        # backward pass and update gradient
        loss.backward()
        optimizer.step()

        if it % 500 == 0:
            print("Training Iteration {} of epoch {} complete. Loss: {}; Acc:{}; Time taken (s): {}".format(it, epoch, loss.item(), acc, (time.time()-st)))
            st = time.time()

    print("\n--- Evaluating model on dev data ---")
    # Evaluate on the validation split prepared with batchify. The previous
    # call relied on `dev_loader` and `gpu`, which this notebook never
    # defines, and on a 2-token context that does not match this model.
    dev_acc, dev_loss = _evaluate_on_split(model, loss_function, val_data)
    print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(epoch, dev_acc, dev_loss))
    if dev_acc > best_acc:
        print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
        best_acc = dev_acc
        # set best model path
        best_model_path = 'best_model_{}.dat'.format(epoch)
        # saving best model
        torch.save(model.state_dict(), best_model_path)


--- Training model Epoch: 1 ---
Training Iteration 0 of epoch 0 complete. Loss: 10.429022789001465; Acc:0.0; Time taken (s): 1.6938209533691406
Training Iteration 500 of epoch 0 complete. Loss: 8.245803833007812; Acc:0.02857142873108387; Time taken (s): 9.499763011932373
Training Iteration 1000 of epoch 0 complete. Loss: 7.6052656173706055; Acc:0.1428571492433548; Time taken (s): 9.596279382705688
Training Iteration 1500 of epoch 0 complete. Loss: 7.178322792053223; Acc:0.05714285746216774; Time taken (s): 9.51860499382019
Training Iteration 2000 of epoch 0 complete. Loss: 7.172187328338623; Acc:0.1428571492433548; Time taken (s): 9.695068120956421
Training Iteration 2500 of epoch 0 complete. Loss: 7.939382553100586; Acc:0.08571428805589676; Time taken (s): 9.542513847351074
Training Iteration 3000 of epoch 0 complete. Loss: 8.121339797973633; Acc:0.20000000298023224; Time taken (s): 9.536884307861328


KeyboardInterrupt: 