In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

from torchtext.data import Field, BPTTIterator
from torchtext.datasets import PennTreebank
import spacy

import numpy as np
import pandas as pd
from tqdm import tqdm

# Use GPU?
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# https://pytorch.org/docs/stable/notes/randomness.html
# Reproducibility
torch.manual_seed(42) # Random Number Generator for all devices
# When running on the CuDNN backend, two further options must be set.
torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False

In [2]:
# Tokenizer
spacy_en = spacy.load('en')
def tokenize(s):
    return [tok.text for tok in spacy_en.tokenizer(s)]

# Prepare a field and get the data
TEXT = Field(lower=True, tokenize=tokenize)
train_data, valid_data, test_data = PennTreebank.splits(TEXT)

# Build the vocabulary
TEXT.build_vocab(train_data, min_freq=2)
print("Vocab size: {}".format(len(TEXT.vocab)))

downloading ptb.train.txt


.data\penn-treebank\ptb.train.txt: 5.10MB [00:02, 2.50MB/s]                    


downloading ptb.valid.txt


.data\penn-treebank\ptb.valid.txt: 400kB [00:00, 1.71MB/s]                     


downloading ptb.test.txt


.data\penn-treebank\ptb.test.txt: 450kB [00:00, 2.43MB/s]                      


Vocab size: 9703


In [3]:
batch_size = 40
bptt_len = 35

train_loader, valid_loader, test_loader = BPTTIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=batch_size, bptt_len=bptt_len, 
    device=device, 
    repeat=False
)

In [6]:
for i in train_loader:
    break

In [7]:
i


[torchtext.data.batch.Batch of size 40]
	[.text]:[torch.cuda.LongTensor of size 35x40 (GPU 0)]
	[.target]:[torch.cuda.LongTensor of size 35x40 (GPU 0)]

In [9]:
i.text

tensor([[   6,    7,    5,  ...,  538,   98,    8],
        [   0,    6,    4,  ...,    2,  204,  633],
        [   0,   12,   82,  ...,  110,    2,  555],
        ...,
        [   3, 1148, 3450,  ...,    8,    5,   36],
        [   5,   13, 3446,  ...,    7,    4,  100],
        [   4,  151,  660,  ...,    6,   11,  371]], device='cuda:0')

In [10]:
i.target

tensor([[   0,    6,    4,  ...,    2,  204,  633],
        [   0,   12,   82,  ...,  110,    2,  555],
        [   0,  503,    7,  ...,   10,  542,  130],
        ...,
        [   5,   13, 3446,  ...,    7,    4,  100],
        [   4,  151,  660,  ...,    6,   11,  371],
        [   8, 3148,   63,  ...,    2, 4954,  142]], device='cuda:0')

In [11]:
import matplotlib.pyplot as plt
%matplotlib inline

In [19]:
(i.text[1:] - i.target[:-1]).sum()

tensor(0, device='cuda:0')

In [119]:
p = 0.9
i.text.new(5, 10).bernoulli_(1-p).float().div_(1-p)

tensor([[ 0., 10.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., 10.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]], device='cuda:0')

In [120]:
class RNNDropout(nn.Module):
    
    def __init__(self, p=0.5):
        super().__init__()
        self.p = p
        
    def forward(self, x):
        if not self.training or self.p == 0.:
            return x
        shape = (x.size(0), 1, x.size(2))
        m = self.dropout_mask(x.data, shape, self.p)
        return x * m
    
    @staticmethod
    def dropout_mask(x, sz, p):
        return x.new(*sz).bernoulli_(1-p).float().div_(1-p)

In [123]:
class HighwayBlock(nn.Module):
    
    def __init__(self,
                 in_features,
                 out_features,
                 first=False,
                 couple=False,
                 dropout_p=0.0):
        super().__init__()
        self.first = first
        self.couple = couple
        if first:
            self.W_H = nn.Linear(in_features, out_features, bias=False)
            self.W_T = nn.Linear(in_features, out_features, bias=False)
            if not couple:
                self.W_C = nn.Linear(in_features, out_features, bias=False)
        self.R_H = nn.Linear(in_features, out_features, bias=True)
        self.R_T = nn.Linear(in_features, out_features, bias=True)
        if not couple:
            self.R_C = nn.Linear(in_features, out_features, bias=True)
        self.dropout = RNNDropout(dropout_p)
        
    def forward(self, x, s):
        if self.first:
            h = torch.tanh(self.W_H(x) + self.R_H(x))
            t = torch.sigmoid(self.W_T(x) + self.R_T(x))
            if self.couple:
                c = 1 - t
            else:
                c = torch.sigmoid(self.W_C(x) + self.R_C(x))
        else:
            h = torch.tanh(self.R_H(x))
            t = torch.sigmoid(self.R_T(x))
            if self.couple:
                c = 1 - t
            else:
                c = torch.sigmoid(self.R_C(x))
        t = self.dropout(t.unsqueeze(0)).squeeze(0)
        
        return h * t + s * c

In [125]:
class RecurrentHighway(nn.Module):
    
    def __init__(self, 
                 in_features, 
                 out_features, 
                 recurrence_depth=5, 
                 couple=False, 
                 dropout_p=0.):
        super().__init__()
        highways = [
            HighwayBlock(in_features, out_features,
                         first=True if l == 0 else False,
                         couple=couple, dropout_p=dropout_p)
            for l in range(recurrence_depth)
        ]
        self.highways = nn.ModuleList(highways)
        self.recurrence_depth = recurrence_depth
        self.hidden_dim = out_features
        
    def forward(self, input, hidden):
        # Expects input dimensions [seq_len, bsz, input_dim]
        outputs = []
        for x in input:
            for block in self.highways:
                hidden = block(x, hidden)
            outputs.append(hidden)
        outputs = torch.stack(outputs)
        return outputs, hidden

In [157]:
class RHNLM(nn.Module):
    
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 hidden_dim,
                 recurrence_depth,
                 num_layers=1,
                 hidden_dp=0.65,
                 recur_dp=0.3,
                 tie_weights=True,
                 couple=False):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        rnns = [
            RecurrentHighway(
                embedding_dim if l == 0 else hidden_dim,
                embedding_dim if tie_weights and l != num_layers else hidden_dim,
                recurrence_depth=recurrence_depth,
                couple=couple,
                dropout_p=recur_dp)
            for l in range(num_layers)
        ]
        self.rnns = nn.ModuleList(rnns)
        self.fc1 = nn.Linear(embedding_dim if tie_weights else hidden_dim, vocab_size)
        self.hidden_dropout = RNNDropout(hidden_dp)
        if tie_weights:
            self.fc1.weight = self.embedding.weight
            
    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        hidden = [weight.new(bsz, rnn.hidden_dim).zero_() for rnn in self.rnns]
        return hidden
    
    def forward(self, x):
        bptt_len, bsz = x.shape
        vocab_size = self.embedding.num_embeddings
        
        out = self.embedding(x)
        
        hiddens = self.init_hidden(bsz)
        
        for i, rnn in enumerate(self.rnns):
            out, hidden = rnn(out, hiddens[i])
            out = self.hidden_dropout(out)
            
        out = self.fc1(out.flatten(0, 1))
        out = out.view(bptt_len, bsz, vocab_size)
        return out

In [158]:
model = RHNLM(
    vocab_size=len(TEXT.vocab),
    embedding_dim=300,
    hidden_dim=650,
    recurrence_depth=5,
    num_layers=2,
    recur_dp=0.3,
    hidden_dp=0.65,
    tie_weights=True,
    couple=True).to(device)

In [159]:
x = i.text

In [162]:
bptt_len, bsz = x.shape
bptt_len, bsz

(35, 40)

In [163]:
out = model.embedding(x)

In [167]:
hiddens = model.init_hidden(bsz)

In [178]:
hiddens[0].shape, hiddens[1].shape

(torch.Size([40, 300]), torch.Size([40, 300]))

In [174]:
model.rnns[0]

RecurrentHighway(
  (highways): ModuleList(
    (0): HighwayBlock(
      (W_H): Linear(in_features=300, out_features=300, bias=False)
      (W_T): Linear(in_features=300, out_features=300, bias=False)
      (R_H): Linear(in_features=300, out_features=300, bias=True)
      (R_T): Linear(in_features=300, out_features=300, bias=True)
      (dropout): RNNDropout()
    )
    (1): HighwayBlock(
      (R_H): Linear(in_features=300, out_features=300, bias=True)
      (R_T): Linear(in_features=300, out_features=300, bias=True)
      (dropout): RNNDropout()
    )
    (2): HighwayBlock(
      (R_H): Linear(in_features=300, out_features=300, bias=True)
      (R_T): Linear(in_features=300, out_features=300, bias=True)
      (dropout): RNNDropout()
    )
    (3): HighwayBlock(
      (R_H): Linear(in_features=300, out_features=300, bias=True)
      (R_T): Linear(in_features=300, out_features=300, bias=True)
      (dropout): RNNDropout()
    )
    (4): HighwayBlock(
      (R_H): Linear(in_features=300