In [130]:
%matplotlib inline

import numpy as np
from matplotlib import pyplot as plt
import time
import os
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from tests import test_prediction, test_generation
from functools import wraps
from tqdm import tqdm_notebook as tqdm

In [10]:
# Run on the GPU when PyTorch reports one; otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
DEVICE

'cpu'

In [3]:
# Load the training corpus, the vocabulary, and the dev/test fixtures.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, so these paths should only ever point at trusted files.

# Training articles and vocabulary.
dataset = np.load('../dataset/wiki.train.npy', allow_pickle=True)
vocab = np.load('../dataset/vocab.npy', allow_pickle=True)

# Dev fixtures.
fixtures_pred = np.load('../fixtures/prediction.npz', allow_pickle=True)
fixtures_gen = np.load('../fixtures/generation.npy', allow_pickle=True)

# Test fixtures.
fixtures_pred_test = np.load('../fixtures/prediction_test.npz', allow_pickle=True)
fixtures_gen_test = np.load('../fixtures/generation_test.npy', allow_pickle=True)

In [9]:
# Peek at the first three entries of the training set (each entry is its own array).
dataset[:3]

array([array([ 1420, 13859,  3714, ...,   813,    79,  1417], dtype=int32),
       array([ 1420, 13463,  3117, ...,  8635,    79,  1417], dtype=int32),
       array([1420, 1419, 8924, ...,   76,  743, 1417], dtype=int32)],
      dtype=object)

In [187]:
# data loader
# 1. Randomly shuffle all the articles from the WikiText-2 dataset.
# 2. Concatenate all text into one long string.
# 3. Run a loop that yields an (input, label) tuple on every iteration
#    (look up Python iterators/generators if `yield` is unfamiliar).

class LanguageModelDataLoader(DataLoader):
    """
    Batch iterator for next-token language modelling.

    On every pass the articles are (optionally) visited in a fresh random
    order, concatenated into one token stream, cut into rows of
    ``seq_len + 1`` tokens and grouped ``batch_size`` rows at a time.  Each
    batch is yielded time-major (sequence position first, batch second):
    ``inputs`` holds the first ``seq_len`` tokens of every row and ``targets``
    the same rows shifted forward by one token.  Tokens that do not fill a
    whole row, and rows that do not fill a whole batch, are dropped.

    Note: ``DataLoader.__init__`` is not called, so only the attributes
    assigned in ``__init__`` below exist on the instance.

    :param dataset: sequence of 1-D integer token arrays, one per article
    :param batch_size: number of sequences per batch
    :param seq_len: number of input tokens per sequence
    :param shuffle: visit the articles in a new random order on every pass
    :param device: device the batches are moved to; ``None`` (default) uses
        the notebook-level ``DEVICE``
    """
    def __init__(self, dataset, batch_size, seq_len, shuffle=True, device=None):
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Every row carries one extra token so that inputs and targets can be
        # two overlapping windows over the same row.
        self.seq_len = seq_len + 1
        self.device = device

    def __len__(self):
        """Number of batches produced by one full pass."""
        total_tokens = sum(len(article) for article in self.dataset)
        return (total_tokens // self.seq_len) // self.batch_size

    def __iter__(self):
        """
        Yield ``(inputs, targets)`` pairs of contiguous int64 tensors, each of
        shape ``(seq_len, batch_size)``.
        """
        # Resolved lazily so an explicit ``device`` never touches the global.
        device = DEVICE if self.device is None else self.device

        articles = self.dataset
        if len(articles) == 0:
            return  # nothing to iterate; np.concatenate would raise on []
        if self.shuffle:
            # Shuffle through an index permutation so the caller's dataset is
            # left in its original order.
            order = np.random.permutation(len(articles))
            articles = [articles[i] for i in order]

        tokens = np.concatenate(articles, axis=0)
        n_seq = len(tokens) // self.seq_len
        rows = tokens[:n_seq * self.seq_len].reshape(n_seq, self.seq_len)

        # Only full batches are emitted; a final batch that is exactly full
        # is kept.
        n_batches = n_seq // self.batch_size
        for batch_idx in range(n_batches):
            start = batch_idx * self.batch_size
            chunk = torch.from_numpy(rows[start:start + self.batch_size])
            # .contiguous() lets consumers flatten the batch with .view(-1).
            inputs = chunk[:, :-1].t().contiguous().to(device)
            targets = chunk[:, 1:].t().contiguous().to(device)
            yield inputs.long(), targets.long()

In [188]:
# Smoke test for the loader: twelve one-token "articles" holding the ids 0..11.
example_dataset = np.arange(12).reshape(12, 1)
loader = LanguageModelDataLoader(
    dataset=example_dataset,
    batch_size=2,
    seq_len=3,
    shuffle=False,
)
# Print every (inputs, labels) pair so it can be compared with the expected
# tensors listed below.
for x, y in loader:
    print(x)
    print(y)
    
#Inputs:

#tensor([[0, 4],
#        [1, 5],
#        [2, 6]], device='cuda:0')
#Labels:
#tensor([[1, 5],
#        [2, 6],
#        [3, 7]], device='cuda:0')

tensor([[0, 4],
        [1, 5],
        [2, 6]])
tensor([[1, 5],
        [2, 6],
        [3, 7]])


In [189]:
#https://github.com/salesforce/awd-lstm-lm/blob/master/model.py
    
class LockedDropout(nn.Module):
    """
    Dropout whose mask is sampled once over the last two dimensions and then
    reused for every index along dimension 0 of the input.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x, dropout=0.5):
        """
        Apply the shared mask to ``x``.

        :param x: 3-D tensor; the mask has shape ``(1, x.size(1), x.size(2))``
        :param dropout: probability of zeroing an entry of the mask
        :return: ``x`` itself in eval mode or when ``dropout`` is falsy,
            otherwise ``x`` multiplied by the mask rescaled by ``1 / (1 - dropout)``
        """
        if not (self.training and dropout):
            return x
        keep_prob = 1 - dropout
        # Same dtype/device as x, sampled independently of the autograd graph.
        mask = x.new_empty((1, x.size(1), x.size(2))).bernoulli_(keep_prob)
        mask = mask / keep_prob
        return mask.expand_as(x) * x
    
    

# https://github.com/salesforce/awd-lstm-lm/blob/master/weight_drop.py
class WeightDrop(torch.nn.Module):
    """
    Wrap ``module`` and apply dropout to some of its weight tensors on every
    forward pass.

    Each weight named in ``weights`` is removed from the wrapped module's
    parameters and re-registered under ``<name>_raw``.  Before every forward
    call a dropped-out copy of the raw weight is assigned back under the
    original name, so the wrapped module computes with the masked weight while
    the optimizer updates the raw one.

    :param module: module whose weights are dropped
    :param weights: iterable of parameter names on ``module``
    :param dropout: probability of zeroing a weight (or a whole row when
        ``variational`` is set)
    :param variational: sample one mask value per row (dimension 0) of the
        weight instead of one per element
    """
    def __init__(self, module, weights, dropout=0, variational=False):
        super(WeightDrop, self).__init__()
        self.module = module
        self.weights = weights
        self.dropout = dropout
        self.variational = variational
        self._setup()

    @staticmethod
    def widget_demagnetizer_y2k_edition(*args, **kwargs):
        """No-op that replaces ``flatten_parameters`` on wrapped RNN modules."""
        return None

    def _setup(self):
        """Move every listed weight to ``<name>_raw`` on the wrapped module."""
        # Terrible temporary solution to an issue regarding compacting weights re: CUDNN RNN
        if isinstance(self.module, torch.nn.RNNBase):
            self.module.flatten_parameters = self.widget_demagnetizer_y2k_edition

        for name_w in self.weights:
            print('Applying weight drop of {} to {}'.format(self.dropout, name_w))
            w = getattr(self.module, name_w)
            del self.module._parameters[name_w]
            self.module.register_parameter(name_w + '_raw', nn.Parameter(w.data))

    def _setweights(self):
        """Assign a freshly dropped-out copy of every raw weight to the wrapped module."""
        for name_w in self.weights:
            raw_w = getattr(self.module, name_w + '_raw')
            if self.variational:
                # One mask entry per row, on the same device/dtype as the weight.
                mask = raw_w.new_ones((raw_w.size(0), 1))
                # Follow the module's mode so evaluation uses the full weights.
                mask = torch.nn.functional.dropout(mask, p=self.dropout, training=self.training)
                w = mask.expand_as(raw_w) * raw_w
            else:
                w = torch.nn.functional.dropout(raw_w, p=self.dropout, training=self.training)
            if isinstance(w, nn.Parameter):
                # When dropout is inactive the raw Parameter itself comes back.
                # Assigning a Parameter would register it under name_w again,
                # and a later plain-tensor assignment would then be rejected.
                # A view keeps the gradient path without being a Parameter.
                w = w.view_as(w)
            setattr(self.module, name_w, w)

    def forward(self, *args, **kwargs):
        """Refresh the dropped weights, then run the wrapped module's forward."""
        self._setweights()
        return self.module.forward(*args, **kwargs)
    
def embedding_dropout(embed, x, dropout=0.1):
    """
    Look up ``x`` in ``embed`` with whole rows of the embedding matrix dropped.

    One Bernoulli draw is made per row of ``embed.weight``; a dropped row is
    zero everywhere it is looked up in this call, and kept rows are scaled by
    ``1 / (1 - dropout)``.  The mask is sampled on every call regardless of
    ``embed.training``, so callers must pass ``dropout=0`` (or bypass this
    function) when evaluating.

    :param embed: ``nn.Embedding`` whose weight is masked
    :param x: integer tensor of indices
    :param dropout: probability of dropping a row, in ``[0, 1]``
    :return: embedded tensor of shape ``x.shape + (embedding_dim,)``
    :raises ValueError: if ``dropout`` is outside ``[0, 1]``
    """
    if not 0 <= dropout <= 1:
        raise ValueError('dropout must be in [0, 1], got {}'.format(dropout))
    weight = embed.weight
    if not dropout:
        # Nothing to drop: skip building a full-vocabulary mask.
        return nn.functional.embedding(x, weight)

    keep_prob = 1 - dropout
    mask = weight.new_empty((weight.size(0), 1)).bernoulli_(keep_prob)
    if keep_prob > 0:
        # With dropout == 1 every row is already zero; dividing by zero would
        # turn the result into NaN.
        mask = mask / keep_prob
    masked_weight = mask.expand_as(weight) * weight
    return nn.functional.embedding(x, masked_weight)

In [211]:
# model

# Apply locked dropout between LSTM layers
# Apply embedding dropout
# Apply weight decay
# Tie the weights of the embedding and the output layer
# Activity regularization
# Temporal activity regularization

class LanguageModel(nn.Module):
    """
    Three stacked single-layer LSTMs between an embedding and a tied output
    projection.

    The decoder shares its weight matrix with the embedding, which is why the
    last LSTM maps back to ``embed_size``.

    :param vocab_size: number of distinct token ids
    :param embed_size: embedding width (also the output width of the last LSTM)
    :param hidden_size: output width of the first two LSTMs
    :param embed_dropout: row-dropout probability applied to the embedding
        matrix while training
    :param hidden_dropout: locked-dropout probability applied between LSTM
        layers while training; the default ``0.0`` disables it
    """
    def __init__(self, vocab_size, embed_size, hidden_size, embed_dropout=0.1, hidden_dropout=0.0):
        super(LanguageModel, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.embed_dropout = embed_dropout
        self.hidden_dropout = hidden_dropout
        self.lockdrop = LockedDropout()
        # NOTE(review): idrop is not used by forward(); kept so the attribute
        # stays available to any external code.
        self.idrop = nn.Dropout(0.4)
        self.encoder = nn.Embedding(vocab_size, embed_size)
        self.rnns = torch.nn.ModuleList([
            torch.nn.LSTM(embed_size, hidden_size, 1),
            torch.nn.LSTM(hidden_size, hidden_size, 1),
            torch.nn.LSTM(hidden_size, embed_size, 1),
        ])
        self.decoder = nn.Linear(embed_size, vocab_size)
        # Weight tying: the output projection reuses the embedding matrix.
        self.decoder.weight = self.encoder.weight
        # Optional default initial state, one (h_0, c_0) entry per LSTM layer.
        self.hidden = None

    def forward(self, x, hidden=None):
        """
        Compute next-token logits for every position.

        :param x: token ids, L x N (sequence length first, batch second)
        :param hidden: optional sequence with one initial ``(h_0, c_0)`` state
            (or ``None``) per LSTM layer; falls back to ``self.hidden`` and
            then to zero states
        :return: logits of shape L x N x vocab_size
        :raises ValueError: if ``hidden`` does not have one entry per layer
        """
        if hidden is None:
            hidden = self.hidden
        if hidden is None:
            layer_states = [None] * len(self.rnns)
        else:
            layer_states = list(hidden)
            if len(layer_states) != len(self.rnns):
                raise ValueError(
                    'expected one initial state per LSTM layer ({}), got {}'.format(
                        len(self.rnns), len(layer_states)))

        # Embedding dropout is a training-time regulariser; evaluation uses
        # the plain lookup so predictions are deterministic.
        if self.training and self.embed_dropout:
            x = embedding_dropout(self.encoder, x, dropout=self.embed_dropout)  # L x N x E
        else:
            x = self.encoder(x)  # L x N x E

        last_layer = len(self.rnns) - 1
        for layer_idx, (rnn, state) in enumerate(zip(self.rnns, layer_states)):
            # Every layer starts from its own state.  The layers have different
            # widths, so one layer's final state is not a valid initial state
            # for the next.
            x, _ = rnn(x, state)
            if layer_idx != last_layer:
                x = self.lockdrop(x, self.hidden_dropout)

        # nn.Linear acts on the last dimension: L x N x E -> L x N x V.
        return self.decoder(x)



In [218]:
# model trainer

class LanguageModelTrainer:
    """
    Drives training, evaluation and checkpointing of a language model.

    Per-epoch training losses, validation NLLs, predictions and generated text
    are accumulated on the instance so they can be plotted and saved later.
    """
    def __init__(self, model, loader, max_epochs=1, run_id='exp'):
        """
            Use this class to train your model

            :param model: module mapping L x N token ids to L x N x V logits
            :param loader: iterable yielding (inputs, targets) batches
            :param max_epochs: total number of epochs, used in log messages
            :param run_id: name of the sub-directory of ``experiments`` that
                save() writes into
        """
        # feel free to add any other parameters here
        self.model = model
        self.loader = loader
        self.train_losses = []
        self.val_losses = []
        self.predictions = []
        self.predictions_test = []
        self.generated_logits = []
        self.generated = []
        self.generated_logits_test = []
        self.generated_test = []
        self.epochs = 0
        self.max_epochs = max_epochs
        self.run_id = run_id

        self.optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-5)
        # The model emits raw logits, so the loss has to apply log-softmax
        # itself; NLLLoss would expect log-probabilities.
        self.criterion = nn.CrossEntropyLoss()

    def train(self):
        """
            Run one epoch over the loader and record the mean batch loss.

            :raises RuntimeError: if the loader yields no batches
        """
        self.model.train() # set to training mode
        epoch_loss = 0
        num_batches = 0
        for _, (inputs, targets) in tqdm(enumerate(self.loader)):
            epoch_loss += self.train_batch(inputs, targets)
            num_batches += 1
        if num_batches == 0:
            raise RuntimeError('loader produced no batches; cannot compute an epoch loss')
        epoch_loss = epoch_loss / num_batches
        self.epochs += 1
        # NOTE(review): self.epochs has already been incremented, so this label
        # reads one ahead of the epoch that just finished.  It is kept in step
        # with test(), which is marked do-not-change.
        print('[TRAIN]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs + 1, self.max_epochs, epoch_loss))
        self.train_losses.append(epoch_loss)

    def train_batch(self, inputs, targets):
        """
            Run one optimisation step on a single batch.

            :param inputs: token ids, L x N
            :param targets: next-token ids, L x N
            :return: the batch loss as a Python float
        """
        self.optimizer.zero_grad()
        # Follow the model's own device so batch and weights always agree.
        device = next(self.model.parameters()).device
        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = self.model(inputs) # L x N x V
        # reshape (not view): the targets may be a transposed, non-contiguous tensor.
        loss = self.criterion(outputs.reshape(-1, outputs.size(2)), targets.reshape(-1))
        loss.backward()
        self.optimizer.step()
        return loss.item()


    def test(self):
        """
            Evaluate on the dev fixtures, generate text for the dev and test
            fixtures, and store the results on the instance.

            :return: the validation NLL reported by test_prediction
        """
        # don't change these
        self.model.eval() # set to eval mode
        predictions = TestLanguageModel.prediction(fixtures_pred['inp'], self.model) # get predictions
        self.predictions.append(predictions)
        generated_logits = TestLanguageModel.generation(fixtures_gen, 10, self.model) # generated predictions for 10 words
        generated_logits_test = TestLanguageModel.generation(fixtures_gen_test, 10, self.model)
        nll = test_prediction(predictions, fixtures_pred['out'])
        generated = test_generation(fixtures_gen, generated_logits, vocab)
        generated_test = test_generation(fixtures_gen_test, generated_logits_test, vocab)
        self.val_losses.append(nll)
        
        self.generated.append(generated)
        self.generated_test.append(generated_test)
        self.generated_logits.append(generated_logits)
        self.generated_logits_test.append(generated_logits_test)
        
        # generate predictions for test data
        predictions_test = TestLanguageModel.prediction(fixtures_pred_test['inp'], self.model) # get predictions
        self.predictions_test.append(predictions_test)
            
        print('[VAL]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs + 1, self.max_epochs, nll))
        return nll

    def save(self):
        """
            Write the model weights plus the latest predictions and generated
            text to ``experiments/<run_id>/``, tagged with the epoch count.
        """
        # don't change these
        model_path = os.path.join('experiments', self.run_id, 'model-{}.pkl'.format(self.epochs))
        torch.save({'state_dict': self.model.state_dict()},
            model_path)
        np.save(os.path.join('experiments', self.run_id, 'predictions-{}.npy'.format(self.epochs)), self.predictions[-1])
        np.save(os.path.join('experiments', self.run_id, 'predictions-test-{}.npy'.format(self.epochs)), self.predictions_test[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-{}.npy'.format(self.epochs)), self.generated_logits[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-test-{}.npy'.format(self.epochs)), self.generated_logits_test[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-{}.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-{}-test.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated_test[-1])


In [219]:
class TestLanguageModel:
    """
    Inference helpers used for evaluation.  Both are called through the class
    (``TestLanguageModel.prediction(...)``) and take the model explicitly.
    """

    @staticmethod
    def _to_time_major(inp, model):
        """
            Convert a (batch size, length) batch of token ids into an int64
            tensor of shape (length, batch size) on the model's device.
        """
        if torch.is_tensor(inp):
            tokens = inp.long()
        else:
            arr = np.asarray(inp)
            if arr.dtype == object:
                # Array of per-row arrays: stack into a regular 2-D array.
                arr = np.stack([np.asarray(row) for row in arr])
            tokens = torch.from_numpy(arr.astype(np.int64))
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')
        return tokens.t().contiguous().to(device)

    @staticmethod
    def prediction(inp, model):
        """
            Predict the token that follows each input sequence.

            :param inp: token ids, assumed (batch size, length) like the input
                of generation() -- TODO confirm against the fixtures
            :param model: module mapping L x N token ids to L x N x V logits
            :return: a np.ndarray of logits for the next token, (batch size, vocab size)
        """
        model.eval()
        tokens = TestLanguageModel._to_time_major(inp, model)
        with torch.no_grad():
            logits = model(tokens)[-1]  # logits at the last time step: N x V
        return logits.detach().cpu().numpy()

    @staticmethod
    def generation(inp, forward, model):
        """
            Generate a sequence of words given a starting sequence.

            Decoding is greedy: the most likely next word is appended to the
            sequence and the model is run again on the extended sequence.

            :param inp: Initial sequence of words (batch size, length)
            :param forward: number of additional words to generate
            :param model: module mapping L x N token ids to L x N x V logits
            :return: generated words (batch size, forward) as an int64 np.ndarray
        """
        model.eval()
        tokens = TestLanguageModel._to_time_major(inp, model)  # L x N
        generated = []
        with torch.no_grad():
            for _ in range(forward):
                next_word = model(tokens)[-1].argmax(dim=1)  # N
                generated.append(next_word)
                tokens = torch.cat([tokens, next_word.unsqueeze(0)], dim=0)
        if not generated:
            return np.zeros((tokens.size(1), 0), dtype=np.int64)
        return torch.stack(generated, dim=1).cpu().numpy()
        

In [220]:
# TODO: define other hyperparameters here

# Training hyperparameters.
SEQ_LEN = 40      # input tokens per sequence handed to the data loader
BATCH_SIZE = 32   # sequences per batch
NUM_EPOCHS = 5    # number of train/validate rounds in the training loop

In [221]:
# Every run writes into its own directory, named after the current Unix time
# in whole seconds.
run_id = str(int(time.time()))
# makedirs creates ./experiments on first use and, with exist_ok=True, does not
# fail when the cell is re-run within the same second.
os.makedirs('./experiments/%s' % run_id, exist_ok=True)
print("Saving models, predictions, and generated words to ./experiments/%s" % run_id)

Saving models, predictions, and generated words to ./experiments/1587858511


In [222]:
# Model widths.  The model ties the decoder to the embedding, so EMBED_SIZE is
# also the output width of its last LSTM.
EMBED_SIZE = 200
HIDDEN_SIZE = 200

# The loader places every batch on DEVICE, so the model has to live there too.
# It is moved before the trainer builds its optimizer from model.parameters().
model = LanguageModel(len(vocab), EMBED_SIZE, HIDDEN_SIZE).to(DEVICE)
loader = LanguageModelDataLoader(dataset=dataset, batch_size=BATCH_SIZE, seq_len=SEQ_LEN, shuffle=True)
trainer = LanguageModelTrainer(model=model, loader=loader, max_epochs=NUM_EPOCHS, run_id=run_id)

In [223]:
# Train for NUM_EPOCHS epochs, validating after each one and checkpointing
# only when the validation NLL improves on the best value seen so far.
best_nll = 1e30
for epoch in range(NUM_EPOCHS):
    trainer.train()
    nll = trainer.test()
    if not nll < best_nll:
        continue  # no improvement: keep the previous checkpoint
    best_nll = nll
    message = (
        "Saving model, predictions and generated output for epoch "
        + str(epoch)
        + " with NLL: "
        + str(best_nll)
    )
    print(message)
    trainer.save()
    

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

KeyboardInterrupt: 

In [None]:
# Don't change these
# plot training curves
# Draws the per-epoch training and validation losses recorded on the trainer
# against a 1-based epoch index.
plt.figure()
plt.plot(range(1, trainer.epochs + 1), trainer.train_losses, label='Training losses')
plt.plot(range(1, trainer.epochs + 1), trainer.val_losses, label='Validation losses')
plt.xlabel('Epochs')
plt.ylabel('NLL')
plt.legend()
plt.show()

In [None]:
# see generated output
# trainer.generated gains one entry per trainer.test() call; [-1] is the most recent one.
print (trainer.generated[-1]) # get last generated output