In [0]:
%matplotlib inline

import numpy as np
from matplotlib import pyplot as plt
import time
import os
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from tests import test_prediction, test_generation

In [0]:
# load all that we need

dataset = np.load('/content/sample_data/wiki.train.npy',allow_pickle=True)
fixtures_pred = np.load('/content/fixtures/prediction.npz',allow_pickle=True)  # dev
fixtures_gen = np.load('/content/fixtures/generation.npy',allow_pickle=True)  # dev
fixtures_pred_test = np.load('/content/fixtures/prediction_test.npz',allow_pickle=True)  # test
fixtures_gen_test = np.load('/content/fixtures/generation_test.npy',allow_pickle=True)  # test
vocab = np.load('/content/sample_data/vocab.npy',allow_pickle=True)

In [0]:
# data loader

class LanguageModelDataLoader(DataLoader):
    """
        TODO: Define data loader logic here
    """
    def __init__(self, dataset, batch_size, shuffle=True):
        
        
        self.x = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.seq_length = 200

     
        

    def __iter__(self):
        # concatenate your articles and build into batches

        
        if self.shuffle:
            np.random.shuffle(self.x)
        
        x = np.hstack(self.x)
        self.n_seq = len(x) // self.seq_length
        x = x[:self.n_seq * self.seq_length]
        self.data = np.reshape(x,(-1,self.seq_length)) # batch * len
        
      
        for i in range(0, len(self.data), self.batch_size):
            temp = self.data[i:i+self.batch_size]
            #yield (torch.Tensor(np.transpose(temp[:,:-1])).long(), torch.Tensor(np.transpose(temp[:,1:])).long()) # transpose : len * batch
            yield (torch.LongTensor(temp[:,:-1]), torch.LongTensor(temp[:,1:])) # transpose : len * batch
        
   
       
        

        
        

In [0]:
# model

class LanguageModel(nn.Module):
    """
        TODO: Define your model here
    """
    
    def __init__(self, vocab_size):#, embed_size,hidden_size, nlayers):
        super(LanguageModel, self).__init__()
        embed_size = 200
        hidden_size = 200
        nlayers = 3
        
        self.vocab_size=vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.nlayers=nlayers
        self.embedding = nn.Embedding(vocab_size,embed_size) # Embedding layer
        self.rnn = nn.LSTM(input_size = embed_size,hidden_size=hidden_size,num_layers=nlayers, batch_first=True, dropout=0.4) # Recurrent network
        # You can also try GRUs instead of LSTMs.
        
        self.scoring = nn.Linear(hidden_size,vocab_size) # Projection layer

        self.embed_dropout = nn.Dropout(p=0.1)
        self.final_lstm_dropout = nn.Dropout(p=0.3)


    def forward(self, x):
        # Feel free to add extra arguments to forward (like an argument to pass in the hiddens)
        # returns 3D logits
        batch_size = x.size(0) # x: len * batch

        embed = self.embedding(x) #embed: len * batch * embed
        #hidden = None
        embed = self.embed_dropout(embed)
        output_lstm, hidden = self.rnn(embed)
        #output_lstm,hidden = self.rnn(embed,hidden) #out: len * batch * hidden
        #print("OUT", output_lstm.shape)
        #output_lstm_flatten = output_lstm.contiguous().view(-1,self.hidden_size)# (len * batch) * hidden 
        output_lstm = self.final_lstm_dropout(output_lstm)
        output_flatten = self.scoring(output_lstm)# (len * batch) * vocab 
        #print("test flatten:", output_flatten.shape)
        return output_flatten #output_flatten.view(batch_size,-1,self.vocab_size)# len * batch * vocab

    


In [0]:
# model trainer

class LanguageModelTrainer:
    def __init__(self, model, loader, max_epochs=1, run_id='exp'):
        """
            Use this class to train your model
        """
        # feel free to add any other parameters here
        self.model = model
        self.loader = loader
        self.train_losses = []
        self.val_losses = []
        self.predictions = []
        self.predictions_test = []
        self.generated_logits = []
        self.generated = []
        self.generated_logits_test = []
        self.generated_test = []
        self.epochs = 0
        self.max_epochs = max_epochs
        self.run_id = run_id
        
        # TODO: Define your optimizer and criterion here
        self.optimizer = torch.optim.SGD(model.parameters(), lr=8, weight_decay=1e-6)#, momentum=0.9)#torch.optim.Adam(model.parameters(),lr=0.001, weight_decay=1e-6)
        #self.optimizer = torch.optim.ASGD(model.parameters(),lr=4, weight_decay=1e-6)#, lr=0.01, lambd=0.0001, alpha=0.75, t0=1000000.0, weight_decay=0)#torch.optim.Adam(model.parameters(),lr=0.01, weight_decay=1e-6)
        
        self.criterion = nn.CrossEntropyLoss()#nn.NLLLoss()
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min') #torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=1, gamma=0.9)

    def train(self):
        self.model.train() # set to training mode
        epoch_loss = 0
        num_batches = 0
        for batch_num, (inputs, targets) in enumerate(self.loader):
            epoch_loss += self.train_batch(inputs, targets)
        epoch_loss = epoch_loss / (batch_num + 1)
        self.epochs += 1
        print('[TRAIN]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs + 1, self.max_epochs, epoch_loss))
        self.train_losses.append(epoch_loss)

    def train_batch(self, inputs, targets):
        """ 
            TODO: Define code for training a single batch of inputs
        
        """
        
        inputs = inputs.to(device)

        targets = targets.to(device)      

        outputs = self.model(inputs)
 
        loss = self.criterion(outputs.view(-1,outputs.size(2)),targets.contiguous().view(-1))
        self.optimizer.zero_grad() 
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.25)
        self.optimizer.step()
        torch.cuda.empty_cache()
        del targets
        del outputs

        return loss.item()

    
    def test(self):
        # don't change these
        self.model.eval() # set to eval mode
        predictions = TestLanguageModel.prediction(fixtures_pred['inp'], self.model) # get predictions
        self.predictions.append(predictions)
        generated_logits = TestLanguageModel.generation(fixtures_gen, 10, self.model) # generated predictions for 10 words
        generated_logits_test = TestLanguageModel.generation(fixtures_gen_test, 10, self.model)
        nll = test_prediction(predictions, fixtures_pred['out'])
        generated = test_generation(fixtures_gen, generated_logits, vocab)
        generated_test = test_generation(fixtures_gen_test, generated_logits_test, vocab)
        self.val_losses.append(nll)
        
        self.generated.append(generated)
        self.generated_test.append(generated_test)
        self.generated_logits.append(generated_logits)
        self.generated_logits_test.append(generated_logits_test)
        
        # generate predictions for test data
        predictions_test = TestLanguageModel.prediction(fixtures_pred_test['inp'], self.model) # get predictions
        self.predictions_test.append(predictions_test)
            
        print('[VAL]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs + 1, self.max_epochs, nll))
        return nll

    def save(self):
        # don't change these
        model_path = os.path.join('experiments', self.run_id, 'model-{}.pkl'.format(self.epochs))
        torch.save({'state_dict': self.model.state_dict()},
            model_path)
        np.save(os.path.join('experiments', self.run_id, 'predictions-{}.npy'.format(self.epochs)), self.predictions[-1])
        np.save(os.path.join('experiments', self.run_id, 'predictions-test-{}.npy'.format(self.epochs)), self.predictions_test[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-{}.npy'.format(self.epochs)), self.generated_logits[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-test-{}.npy'.format(self.epochs)), self.generated_logits_test[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-{}.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-{}-test.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated_test[-1])


In [0]:
class TestLanguageModel:
    def prediction(inp, model):
        """
            TODO: write prediction code here
            
            :param inp:
            :return: a np.ndarray of logits
        """
        inp = torch.LongTensor(inp)
        inp = inp.to(device)
        model.to(device)

        outputs = model(inp)[:,-1,:]
        del inp
        return outputs.cpu().data.numpy()#.squeeze(0)


        
    def generation(inp, forward, model):
        """
            TODO: write generation code here

            Generate a sequence of words given a starting sequence.
            :param inp: Initial sequence of words (batch size, length)
            :param forward: number of additional words to generate
            :return: generated words (batch size, forward)
        """        
        generated_words = []
        #print(inp.shape)
        batch_size = inp.shape[0]
        inp = torch.LongTensor(inp)# inp = torch.Tensor(np.transpose(inp)).long() # transpose: len * batch
        model.to(device)
        inp = inp.to(device)
        

        embed = model.embedding(inp) #embed: len * batch * embed
        output_lstm,hidden = model.rnn(embed) #len * batch * hidden
  
        #get only the last index
        scores = model.scoring(output_lstm[:,-1,:]) #(1 * batch) * vocab
 
   
        del inp
        #print(outputs.shape, outputs[:,-1,:].shape)
        _,current_word = torch.max(scores,dim=1) # batch
        #print(torch.max(outputs[:,-1,:],dim=1))
        generated_words.append(current_word.cpu().data.numpy())
        print("Current: ", current_word.shape,current_word[0])
        if forward > 1:
            for i in range(forward-1):
                embed = model.embedding(current_word).unsqueeze(1)# 
                #print("EMBED: ",embed.shape, hidden[0].shape)
                output_lstm,hidden = model.rnn(embed,hidden)
                #print("out LSTM: ",output_lstm.shape)
                output = output_lstm[:,-1,:] # 1 * hidden
                scores = model.scoring(output) # vocan
    
                _,current_word = torch.max(scores,dim=1) # 1

                generated_curr = current_word
                generated_words.append(generated_curr.cpu().data.numpy())
                generated_curr.to(device)

        return np.array(generated_words).transpose(1,0)
        
        

In [0]:
# TODO: define other hyperparameters here

NUM_EPOCHS = 50
BATCH_SIZE = 32
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")


In [43]:
run_id = str(int(time.time()))
if not os.path.exists('./experiments'):
    os.mkdir('./experiments')
os.mkdir('./experiments/%s' % run_id)
print("Saving models, predictions, and generated words to ./experiments/%s" % run_id)

Saving models, predictions, and generated words to ./experiments/1588990910


In [0]:
model = LanguageModel(len(vocab))
model = model.to(device)
loader = LanguageModelDataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)
trainer = LanguageModelTrainer(model=model, loader=loader, max_epochs=NUM_EPOCHS, run_id=run_id)

In [0]:
best_nll = 1e30 
for epoch in range(NUM_EPOCHS):
    trainer.train()
    nll = trainer.test()
    if nll < best_nll:
        best_nll = nll
        print("Saving model, predictions and generated output for epoch "+str(epoch)+" with NLL: "+ str(best_nll))
        trainer.save()
    trainer.scheduler.step(nll)
    
      
    

[TRAIN]  Epoch [2/50]   Loss: 7.5368
Current:  torch.Size([32]) tensor(14658, device='cuda:0')
Current:  torch.Size([128]) tensor(14658, device='cuda:0')
[VAL]  Epoch [2/50]   Loss: 6.6584
Saving model, predictions and generated output for epoch 0 with NLL: 6.658377
[TRAIN]  Epoch [3/50]   Loss: 6.7923
Current:  torch.Size([32]) tensor(1419, device='cuda:0')
Current:  torch.Size([128]) tensor(14658, device='cuda:0')
[VAL]  Epoch [3/50]   Loss: 6.1269
Saving model, predictions and generated output for epoch 1 with NLL: 6.1269197
[TRAIN]  Epoch [4/50]   Loss: 6.5058
Current:  torch.Size([32]) tensor(31353, device='cuda:0')
Current:  torch.Size([128]) tensor(31353, device='cuda:0')
[VAL]  Epoch [4/50]   Loss: 5.7611
Saving model, predictions and generated output for epoch 2 with NLL: 5.7611294
[TRAIN]  Epoch [5/50]   Loss: 6.3248
Current:  torch.Size([32]) tensor(31353, device='cuda:0')
Current:  torch.Size([128]) tensor(31353, device='cuda:0')
[VAL]  Epoch [5/50]   Loss: 5.6681
Saving mo

In [0]:
# Don't change these
# plot training curves
plt.figure()
plt.plot(range(1, trainer.epochs + 1), trainer.train_losses, label='Training losses')
plt.plot(range(1, trainer.epochs + 1), trainer.val_losses, label='Validation losses')
plt.xlabel('Epochs')
plt.ylabel('NLL')
plt.legend()
plt.show()

In [0]:
# see generated output
print (trainer.generated[-1]) # get last generated output
