# Import and declaring certain arguments

In [22]:
# coding: utf-8
import argparse
import time
import math
import os
import torch
import torch.nn as nn
import torch.onnx
import data
import model
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
import math

In [2]:

class Args:
  data = './data/wikitext-2'
  model = 'FNNModel'
  emsize = 200
  context_size = 8
  nhid = 200
  nlayers = 2
  lr = 20
  clip = 0.25
  epochs = 40
  batch_size = 8
  bptt = 35
  dropout = 0.2
  tied = True
  seed = 1111
  cuda = True
  log_interval = 200
  save = 'model.pt'
  onnx_export = ''
  nhead = 2
  dry_run =  True

args=Args()

In [3]:
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

device = torch.device("cuda" if args.cuda else "cpu")



# Loading the data

In [4]:
###############################################################################
# Load data
###############################################################################

corpus = data.Corpus(args.data)

# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘.
# These columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
# batch processing.


In [5]:
torch.set_printoptions(edgeitems=100)
print(corpus.train)

tensor([    0,     1,     2,     3,     4,     1,     0,     0,     5,     6,
            2,     7,     8,     9,     3,    10,    11,     8,    12,    13,
           14,    15,     2,    16,    17,    18,     7,    19,    13,    20,
           21,    22,    23,     2,     3,     4,    24,    25,    13,    26,
           27,    28,    29,    30,    31,    32,    33,    34,    35,    36,
           37,    38,    39,    17,    40,    41,    15,    42,    43,    44,
           45,    43,    25,    13,    46,    26,    17,    47,    33,    43,
           17,     2,    48,    15,     9,    17,    49,    50,    16,    28,
           37,    51,    30,    52,    53,    23,    54,    55,    13,    17,
           56,    57,    58,    22,    17,    59,    33,    37,    60,    17,
         ...,    93,   622,    22,  5002,    78,  6656,  7628,    43,   293,
         1043,    15,  5277,  4726,   284, 23960,    26,   494,   489,   151,
        27535,   348,  4737,    43, 17444,    39,    17,  2532,  

In [6]:
# # we want to return a tensor with ascending batch 

# def batchify(data, bsz):
#     # Work out how cleanly we can divide the dataset into bsz parts.
#     nbatch = data.size(0) // bsz
#     # Trim off any extra elements that wouldn't cleanly fit (remainders).
#     data = data.narrow(0, 0, nbatch * bsz)
#     # Evenly divide the data across the bsz batches.
#     data = data.view(-1, bsz).contiguous()
#     return data.to(device)

# eval_batch_size = 8
# train_data = batchify(corpus.train, args.batch_size)
# val_data = batchify(corpus.valid, eval_batch_size)
# test_data = batchify(corpus.test, eval_batch_size)

In [7]:
def batchify(data, bsz):
    value=[]
    data = data.numpy()
    for i,word in enumerate(data):
        if i+bsz>= len(data):
            # sentence boundary reached
            # ignoring sentence less than 3 words
            break
        # convert word to id
        value1 = []
        for j in range(bsz+1):
            value1.append(data[i+j])
        value.append(value1)
    value = torch.LongTensor(value)
    return value.to(device)

In [8]:
eval_batch_size = 8
train_data = batchify(corpus.train, args.context_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

In [9]:
torch.set_printoptions(edgeitems=100)
print(train_data)

tensor([[    0,     1,     2,     3,     4,     1,     0,     0,     5],
        [    1,     2,     3,     4,     1,     0,     0,     5,     6],
        [    2,     3,     4,     1,     0,     0,     5,     6,     2],
        [    3,     4,     1,     0,     0,     5,     6,     2,     7],
        [    4,     1,     0,     0,     5,     6,     2,     7,     8],
        [    1,     0,     0,     5,     6,     2,     7,     8,     9],
        [    0,     0,     5,     6,     2,     7,     8,     9,     3],
        [    0,     5,     6,     2,     7,     8,     9,     3,    10],
        [    5,     6,     2,     7,     8,     9,     3,    10,    11],
        [    6,     2,     7,     8,     9,     3,    10,    11,     8],
        [    2,     7,     8,     9,     3,    10,    11,     8,    12],
        [    7,     8,     9,     3,    10,    11,     8,    12,    13],
        [    8,     9,     3,    10,    11,     8,    12,    13,    14],
        [    9,     3,    10,    11,     8,    12, 

# Declaring data loader functions 

In [10]:
# get the train and target for the train values
def get_batch(source, i):
    seq_len = min(args.bptt, len(source) - 1 - i)
    data = source[i:i+seq_len, 0:args.context_size]
    target = source[i+1:i+1+seq_len, args.context_size-1:args.context_size]
    target = target.narrow(1,0,1).contiguous().view(-1)
    return data, target

In [11]:
data, target = get_batch(train_data, 0)

In [12]:
print(data)

tensor([[ 0,  1,  2,  3,  4,  1,  0,  0],
        [ 1,  2,  3,  4,  1,  0,  0,  5],
        [ 2,  3,  4,  1,  0,  0,  5,  6],
        [ 3,  4,  1,  0,  0,  5,  6,  2],
        [ 4,  1,  0,  0,  5,  6,  2,  7],
        [ 1,  0,  0,  5,  6,  2,  7,  8],
        [ 0,  0,  5,  6,  2,  7,  8,  9],
        [ 0,  5,  6,  2,  7,  8,  9,  3],
        [ 5,  6,  2,  7,  8,  9,  3, 10],
        [ 6,  2,  7,  8,  9,  3, 10, 11],
        [ 2,  7,  8,  9,  3, 10, 11,  8],
        [ 7,  8,  9,  3, 10, 11,  8, 12],
        [ 8,  9,  3, 10, 11,  8, 12, 13],
        [ 9,  3, 10, 11,  8, 12, 13, 14],
        [ 3, 10, 11,  8, 12, 13, 14, 15],
        [10, 11,  8, 12, 13, 14, 15,  2],
        [11,  8, 12, 13, 14, 15,  2, 16],
        [ 8, 12, 13, 14, 15,  2, 16, 17],
        [12, 13, 14, 15,  2, 16, 17, 18],
        [13, 14, 15,  2, 16, 17, 18,  7],
        [14, 15,  2, 16, 17, 18,  7, 19],
        [15,  2, 16, 17, 18,  7, 19, 13],
        [ 2, 16, 17, 18,  7, 19, 13, 20],
        [16, 17, 18,  7, 19, 13, 2

In [13]:
print(target)

tensor([ 5,  6,  2,  7,  8,  9,  3, 10, 11,  8, 12, 13, 14, 15,  2, 16, 17, 18,
         7, 19, 13, 20, 21, 22, 23,  2,  3,  4, 24, 25, 13, 26, 27, 28, 29],
       device='cuda:0')


# Building the model

In [14]:
# creating our FNN model 

# Trigram Neural Network Model
class FNNModel(nn.Module):
    # Here context_size should be 8( because its 8 gram model ), embedding dimension is 200, h is number of hidden layers , can set it to 200
    def __init__(self, vocab_size, embedding_dim, context_size, h):
        super(FNNModel, self).__init__()
        self.context_size = context_size
        self.embedding_dim = embedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, h)
        # Linear 2 is the decoder that returns a variable based on vocab size 
        self.linear2 = nn.Linear(h, vocab_size, bias = False)

    def forward(self, inputs):
        # compute x': concatenation of all the 8 words in 8 gram model 
        embeds = self.embeddings(inputs).view((-1,self.context_size * self.embedding_dim))
        # compute h: tanh(W_1.x' + b)
        out = torch.tanh(self.linear1(embeds))
        # compute W_2.h
        out = self.linear2(out)
        # compute y: log_softmax(W_2.h)
        log_probs = torch.nn.functional.log_softmax(out, dim=1)
        # return log probabilities
        # BATCH_SIZE x len(vocab)
        return log_probs

# Declaring the helper functions  

In [15]:
# helper function to get accuracy from log probabilities
def get_accuracy_from_log_probs(log_probs, labels):
    probs = torch.exp(log_probs)
    predicted_label = torch.argmax(probs, dim=1)
    acc = (predicted_label == labels).float().mean()
    return acc



# helper function to evaluate model on dev data
def evaluate(model, criterion, data_source):
    model.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        dev_st = time.time()
        for it, data_tensor in enumerate(range(0, data_source.size(0) - 1, args.bptt)):
            context_tensor, target_tensor = get_batch(data_source, data_tensor)
#             context_tensor = data_tensor[:,0:2]
#             target_tensor = data_tensor[:,2]
            context_tensor, target_tensor = context_tensor.to(device), target_tensor.to(device)
            log_probs = model(context_tensor)
            mean_loss += criterion(log_probs, target_tensor).item()
            mean_acc += get_accuracy_from_log_probs(log_probs, target_tensor)
            count += 1
            if it % 500 == 0: 
                print("Dev Iteration {} complete. Mean Loss: {}; Mean Acc:{}; Time taken (s): {}".format(it, mean_loss / count, mean_acc / count, (time.time()-dev_st)))
                dev_st = time.time()

    return mean_acc / count, mean_loss / count


# Training the model and evaluating on val set

In [26]:
# Using negative log-likelihood loss
loss_function = nn.NLLLoss()

# create model
ntokens = len(corpus.dictionary)
model = FNNModel(ntokens, args.emsize, args.context_size, args.nhid).to(device)

# # load it to gpu
# model.cuda(gpu)

# using ADAM optimizer
optimizer = optim.Adam(model.parameters(), lr = 2e-3)

#define epochs
epochs = 1


# ------------------------- TRAIN & SAVE MODEL ------------------------
best_acc = 0
best_per= 9999999999999999
best_model_path = None
loss_values=[]
ppl_values=[]
for epoch in range(epochs):
    st = time.time()
    running_loss = 0.0
    running_ppl =0.0
    print("\n--- Training model Epoch: {} ---".format(epoch+1))
    for it, data_tensor in enumerate(range(0, train_data.size(0) - 1, args.bptt)):       
        # context_tensor = data_tensor[:,0:2]
        # target_tensor = data_tensor[:,2]
        context_tensor, target_tensor= get_batch(train_data, data_tensor)

        context_tensor, target_tensor = context_tensor.to(device), target_tensor.to(device)

        # zero out the gradients from the old instance
        model.zero_grad()

        # get log probabilities over next words
        log_probs = model(context_tensor)

        # calculate current accuracy
        acc = get_accuracy_from_log_probs(log_probs, target_tensor)

        # compute loss function
        loss = loss_function(log_probs, target_tensor)

        # backward pass and update gradient
        loss.backward()
        optimizer.step()
        running_loss =+ loss.item()
        running_ppl =+ math.exp(loss.item())

        if it % 10000 == 0: 
            print("Training Iteration {} of epoch {} complete. Loss: {}; Acc:{}; Time taken (s): {}| ppl {:8.2f}".format(it, epoch, loss.item(), acc, (time.time()-st),  math.exp(loss.item())))
            st = time.time()
    total_runs = math.floor((train_data.size(0) - 1)/(args.bptt))
    loss_values.append(running_loss / total_runs)
    ppl_values.append(running_ppl / total_runs)
    print("\n--- Evaluating model on dev data ---")
    dev_acc, dev_loss = evaluate(model, loss_function, val_data)
    dev_per = math.exp(dev_loss)
    print("Epoch {} complete! Development Accuracy: {}; Development Loss: {};  Development ppl: {}".format(epoch, dev_acc, dev_loss), dev_per)
    if dev_per < best_per:
        print("Best validation perplexity improved from {} to {}, saving model...".format(best_per, dev_per))
        best_per = dev_per
        with open(args.save, 'wb') as f:
            torch.save(model, f)
        # set best model path


        
#         best_model_path = 'best_model_{}.dat'.format(epoch)
#         # saving best model
#         torch.save(model.state_dict(), best_model_path)


--- Training model Epoch: 1 ---
Training Iteration 0 of epoch 0 complete. Loss: 10.338781356811523; Acc:0.0; Time taken (s): 0.01899886131286621| ppl 30908.34
Training Iteration 500 of epoch 0 complete. Loss: 7.378848552703857; Acc:0.11428571492433548; Time taken (s): 9.862016916275024| ppl  1601.74
Training Iteration 1000 of epoch 0 complete. Loss: 6.444242000579834; Acc:0.02857142873108387; Time taken (s): 9.839999675750732| ppl   629.07
Training Iteration 1500 of epoch 0 complete. Loss: 8.416387557983398; Acc:0.05714285746216774; Time taken (s): 9.959983110427856| ppl  4520.54
Training Iteration 2000 of epoch 0 complete. Loss: 6.931700706481934; Acc:0.05714285746216774; Time taken (s): 10.073001861572266| ppl  1024.23
Training Iteration 2500 of epoch 0 complete. Loss: 6.0366387367248535; Acc:0.11428571492433548; Time taken (s): 10.003998041152954| ppl   418.48
Training Iteration 3000 of epoch 0 complete. Loss: 6.636714458465576; Acc:0.05714285746216774; Time taken (s): 9.9169998168

Training Iteration 29000 of epoch 0 complete. Loss: 7.298385143280029; Acc:0.08571428805589676; Time taken (s): 9.870999813079834| ppl  1477.91
Training Iteration 29500 of epoch 0 complete. Loss: 5.891428470611572; Acc:0.22857142984867096; Time taken (s): 9.862000465393066| ppl   361.92
Training Iteration 30000 of epoch 0 complete. Loss: 4.547460556030273; Acc:0.3142857253551483; Time taken (s): 9.878999471664429| ppl    94.39
Training Iteration 30500 of epoch 0 complete. Loss: 4.9258551597595215; Acc:0.4285714328289032; Time taken (s): 9.80300760269165| ppl   137.81
Training Iteration 31000 of epoch 0 complete. Loss: 5.226817607879639; Acc:0.3142857253551483; Time taken (s): 9.84099531173706| ppl   186.20
Training Iteration 31500 of epoch 0 complete. Loss: 5.9762372970581055; Acc:0.1428571492433548; Time taken (s): 9.905996799468994| ppl   393.96
Training Iteration 32000 of epoch 0 complete. Loss: 5.145524024963379; Acc:0.22857142984867096; Time taken (s): 9.887001037597656| ppl   171

Training Iteration 58000 of epoch 0 complete. Loss: 7.570720672607422; Acc:0.1428571492433548; Time taken (s): 9.798983097076416| ppl  1940.54
Training Iteration 58500 of epoch 0 complete. Loss: 6.528282642364502; Acc:0.11428571492433548; Time taken (s): 9.844014406204224| ppl   684.22
Training Iteration 59000 of epoch 0 complete. Loss: 8.430855751037598; Acc:0.08571428805589676; Time taken (s): 9.84398365020752| ppl  4586.42
Training Iteration 59500 of epoch 0 complete. Loss: 6.518679141998291; Acc:0.1428571492433548; Time taken (s): 9.966999530792236| ppl   677.68

--- Evaluating model on dev data ---
Dev Iteration 0 complete. Mean Loss: 5.513369560241699; Mean Acc:0.20000000298023224; Time taken (s): 0.002000093460083008
Dev Iteration 500 complete. Mean Loss: 6.812665486764051; Mean Acc:0.15118339657783508; Time taken (s): 0.8270001411437988
Dev Iteration 1000 complete. Mean Loss: 6.810358872304072; Mean Acc:0.15190523862838745; Time taken (s): 0.8259828090667725
Dev Iteration 1500 

In [32]:
print(train_data.size(0) - 1)
print(args.bptt)
testa = (train_data.size(0) - 1)/(args.bptt)
print(testa)

261077
35
7459.342857142857


In [28]:
print(loss_values)

[0.0001043031141913544]
