In [1]:
%matplotlib inline

import numpy as np
from matplotlib import pyplot as plt
import time
import os
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torchsummaryX import summary
from tests_hw4 import test_prediction, test_generation

In [2]:
# TODO: define other hyperparameters here
NUM_EPOCHS = 5
BATCH_SIZE = 2
SEQ_LEN = 10
EMB_DIM = 10
HIDDEN_SIZE = 10
LR = 0.001

In [3]:
# load all that we need

dataset = np.load('../dataset/wiki.train.npy', allow_pickle=True)
devset = np.load('../dataset/wiki.valid.npy', allow_pickle=True)
fixtures_pred = np.load('../fixtures/prediction.npz')  # dev
fixtures_gen = np.load('../fixtures/generation.npy')  # dev
fixtures_pred_test = np.load('../fixtures/prediction_test.npz')  # test
fixtures_gen_test = np.load('../fixtures/generation_test.npy')  # test
vocab = np.load('../dataset/vocab.npy')

In [38]:
# data loader

class DataLoaderForLanguageModeling(DataLoader):
    """
        TODO: Define data loader logic here
    """
    def __init__(self, dataset, batch_size, seq_len, shuffle=True):
        self.dataset = dataset
        self.batch_size = batch_size
        self.seq_len = seq_len
        self.shuffle = shuffle


    def __iter__(self):
        """
            You may implement some of the techniques in https://arxiv.org/pdf/1708.02182.pdf
            example: Variable length backpropagation sequences (Section 4.1)
        """
        ## dataset = Array of articles; article = array of ints
        # 1. Randomly shuffle all the articles from the WikiText-2 dataset.
        if(self.shuffle):
            np.random.shuffle(self.dataset)
        # 2. Concatenate all text in one long string.
        data = np.concatenate(self.dataset)
        # 3. Group the sequences into batches.
        num_batches = len(data - 1) // self.batch_size # One less since need offset for label
        inputs  = data[0:num_batches * self.batch_size].reshape(self.batch_size,-1)
        targets = data[1:num_batches * self.batch_size + 1].reshape(self.batch_size,-1)
        inputs = torch.from_numpy(inputs).to(dtype=torch.long)
        targets = torch.from_numpy(targets).to(dtype=torch.long)
        # 4. Run a loop that returns a tuple of (input, label) on every iteration with yield.
        offset = 0
        while(offset + self.seq_len < num_batches * self.batch_size):
            input_ = inputs[:, offset : offset+self.seq_len]
            target = targets[:, offset : offset+self.seq_len]
            offset += self.seq_len
            yield (input_, target)

# # TEST       
# test = DataLoaderForLanguageModeling(dataset, BATCH_SIZE, SEQ_LEN)
# for i,(test_inputs, test_targets) in enumerate(test.__iter__()):
#     print('---------')
#     print('iter: ', i)
#     print('shape: ', test_inputs.shape)
#     print('type: ', test_inputs.dtype, ', ', test_targets.dtype)
#     for batch_idx in range(0, test.batch_size):
#         tmpstr1 = ['    ']
#         tmpstr2 = ['    ']
#         for seq_idx in range(0, test.seq_len):
#             tmpstr1.append(vocab[test_inputs[batch_idx, seq_idx]])
#             tmpstr2.append(vocab[test_targets[batch_idx, seq_idx]])
#         print(' '.join(tmpstr1))
#         print(' '.join(tmpstr2))
#         print()
#     if(i > 3):
#         break

In [63]:
# model

class Model(nn.Module):
    """
        TODO: Define your model here
    """
    def __init__(self, vocab_size:int, embedding_dim:int, hidden_size:int):
        super(Model, self).__init__()
        # Embedding: vocab_size -> embedding_dim
        # LSTM: embedding_dim -> hidden_size
        # Classifier: hidden_size -> vocab_size
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size = embedding_dim,
            hidden_size = hidden_size,
            num_layers = 1,
            bidirectional = True,
            batch_first=True)
        self.classifier = nn.Sequential(
            torch.nn.Linear(hidden_size * 2, vocab_size),
        )


    def forward(self, x, h_in = None):
        # Feel free to add extra arguments to forward (like an argument to pass in the hiddens)
        out = self.embedding(x)
        out, h_out = self.lstm(out, h_in) if h_in else self.lstm(out)
        out = self.classifier(out)
        return out, h_out

# TEST
model = Model(len(vocab), EMB_DIM, HIDDEN_SIZE)
test_input = torch.randint(0, len(vocab), (BATCH_SIZE, SEQ_LEN), dtype=torch.long)
test_output, test_hidden = model(test_input)
print('Input : ', test_input.shape, ', ', test_input.dtype)
print('Output: ', test_output.shape, ', ', test_output.dtype) # (batch, seq_len, vocab_size)
summary(model, test_input)

Input :  torch.Size([2, 10]) ,  torch.int64
Output:  torch.Size([2, 10, 33278]) ,  torch.float32
                      Kernel Shape    Output Shape  Params  Mult-Adds
Layer                                                                
0_embedding            [10, 33278]     [2, 10, 10]  332780     332780
1_lstm                           -     [2, 10, 20]    1760       1600
2_classifier.Linear_0  [20, 33278]  [2, 10, 33278]  698838     665560
---------------------------------------------------------------------
                       Totals
Total params          1033378
Trainable params      1033378
Non-trainable params        0
Mult-Adds              999940


  df_sum = df.sum()


Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_embedding,"[10, 33278]","[2, 10, 10]",332780,332780
1_lstm,-,"[2, 10, 20]",1760,1600
2_classifier.Linear_0,"[20, 33278]","[2, 10, 33278]",698838,665560


In [64]:
# model trainer

class Trainer:
    def __init__(self, model, loader, max_epochs=1, run_id='exp'):
        """
            Use this class to train your model
        """
        # feel free to add any other parameters here
        self.model = model
        self.loader = loader
        self.train_losses = []
        self.val_losses = []
        self.predictions = []
        self.predictions_test = []
        self.generated_logits = []
        self.generated = []
        self.generated_logits_test = []
        self.generated_test = []
        self.epochs = 0
        self.max_epochs = max_epochs
        self.run_id = run_id
        
        # TODO: Define your optimizer and criterion here
        # feel free to define a learning rate scheduler as well if you want
        self.optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
        self.criterion = nn.CrossEntropyLoss() # Correct???

    def train(self):
        self.model.train() # set to training mode
        epoch_loss = 0
        num_batches = 0
        for batch_num, (inputs, targets) in enumerate(self.loader):
            epoch_loss += self.train_batch(inputs, targets)
        epoch_loss = epoch_loss / (batch_num + 1)
        print('[TRAIN]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs + 1, self.max_epochs, epoch_loss))
        self.train_losses.append(epoch_loss)

    def train_batch(self, inputs, targets):
        """ 
            TODO: Define code for training a single batch of inputs
            
            :return 
                    (float) loss value
        """
        self.optimizer.zero_grad()
        # Forwards
        outputs, _ = self.model(inputs)
        # Compute loss
        loss = self.criterion(
            outputs.reshape(-1, outputs.shape[2]), # (instances, classes)
            targets.reshape(-1) # (instances, )
        )
        # Backwards
        loss.backward()
        self.optimizer.step()
        return loss

    
    def test(self):
        # don't change these
        self.model.eval() # set to eval mode
        predictions = TestLanguageModel.predict(fixtures_pred['inp'], self.model) # get predictions
        self.predictions.append(predictions)
        generated_logits = TestLanguageModel.generate(fixtures_gen, 10, self.model) # generated predictions for 10 words
        generated_logits_test = TestLanguageModel.generate(fixtures_gen_test, 10, self.model)
        nll = test_prediction(predictions, fixtures_pred['out'])
        generated = test_generation(fixtures_gen, generated_logits, vocab)
        generated_test = test_generation(fixtures_gen_test, generated_logits_test, vocab)
        self.val_losses.append(nll)
        
        self.generated.append(generated)
        self.generated_test.append(generated_test)
        self.generated_logits.append(generated_logits)
        self.generated_logits_test.append(generated_logits_test)
        
        # generate predictions for test data
        predictions_test = TestLanguageModel.predict(fixtures_pred_test['inp'], self.model) # get predictions
        self.predictions_test.append(predictions_test)
            
        print('[VAL]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs + 1, self.max_epochs, nll))
        self.epochs += 1

        return nll

    def save(self):
        # don't change these
        model_path = os.path.join('experiments', self.run_id, 'model-{}.pkl'.format(self.epochs))
        torch.save({'state_dict': self.model.state_dict()},
            model_path)
        np.save(os.path.join('experiments', self.run_id, 'predictions-{}.npy'.format(self.epochs)), self.predictions[-1])
        np.save(os.path.join('experiments', self.run_id, 'predictions-test-{}.npy'.format(self.epochs)), self.predictions_test[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-{}.npy'.format(self.epochs)), self.generated_logits[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-test-{}.npy'.format(self.epochs)), self.generated_logits_test[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-{}.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-{}-test.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated_test[-1])


In [65]:
class TestLanguageModel:
    def predict(inp, model):
        """
            TODO: write prediction code here
            
            :param inp:
            :return: a np.ndarray of logits
        """
        outputs, _ = model(inp)
        predictions = outputs[:,:,-1]
        return predictions

        
    def generate(inp, forward, model):
        """
            TODO: write generation code here

            Generate a sequence of words given a starting sequence.
            :param inp: Initial sequence of words (batch size, length)
            :param forward: number of additional words to generate
            :return: generated words (batch size, forward)
        """        
        new_words = []
        hidden = None
        cur_inp = torch.clone(inp)
        for i in range(0, forward):
          out, hidden = model(cur_inp, hidden)
          cur_new_words = torch.argmax(out, dim=2)[:,-1] # Only grab last word per sequence for each batch
          new_words.append(cur_new_words)
          cur_inp = torch.unsqueeze(cur_new_words, dim=1) # (batch,) -> (batch,seq)
        new_words = torch.stack(new_words, dim=1) # (batch, forward)
        return new_words

# # TEST
# test_input = torch.randint(0, len(vocab), (BATCH_SIZE, SEQ_LEN))
# test_model = Model(len(vocab), EMB_DIM, HIDDEN_SIZE)
# test_output = TestLanguageModel.predict(test_input, test_model)
# print('Test predict : ', test_output.shape)
# test_output = TestLanguageModel.generate(test_input, 20, test_model)
# print('Test generate: ', test_output.shape)

In [66]:
run_id = str(int(time.time()))
if not os.path.exists('./experiments'):
    os.mkdir('./experiments')
os.mkdir('./experiments/%s' % run_id)
print("Saving models, predictions, and generated words to ./experiments/%s" % run_id)

Saving models, predictions, and generated words to ./experiments/1670018396


In [67]:
model = Model(len(vocab), embedding_dim=EMB_DIM, hidden_size=HIDDEN_SIZE)

loader = DataLoaderForLanguageModeling(
    dataset=dataset, 
    batch_size=BATCH_SIZE,
    seq_len=SEQ_LEN,
    shuffle=True
)
trainer = Trainer(
    model=model, 
    loader=loader, 
    max_epochs=NUM_EPOCHS, 
    run_id=run_id
)

In [69]:
best_nll = 1e30 
for epoch in range(NUM_EPOCHS):
    print('Epoch: ', epoch+1, '/', NUM_EPOCHS)
    trainer.train()
    nll = trainer.test()
    if nll < best_nll:
        best_nll = nll
        print("Saving model, predictions and generated output for epoch "+str(epoch)+" with NLL: "+ str(best_nll))
        trainer.save()
    

Epoch:  1 / 5


In [None]:
# Don't change these
# plot training curves
plt.figure()
plt.plot(range(1, trainer.epochs + 1), trainer.train_losses, label='Training losses')
plt.plot(range(1, trainer.epochs + 1), trainer.val_losses, label='Validation losses')
plt.xlabel('Epochs')
plt.ylabel('NLL')
plt.legend()
plt.show()

In [None]:
# see generated output
print (trainer.generated[-1]) # get last generated output