In [1]:
import torch
import pandas
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import json, itertools
from datetime import datetime
# Fix the PyTorch RNG seed so the randomly generated sequences and the weight
# initialisation further down are repeatable between runs.
torch.manual_seed(0)
# NOTE(review): replaces the `locks` list on the lock object returned by
# tqdm.get_lock() with an empty list — presumably a workaround for progress-bar
# locking inside the notebook; confirm it is still required.
tqdm.get_lock().locks = []



As a warm-up exercise, let's confirm that a quadratic-sized recurrent neural network is capable of reversing a simple sequence.
For simplicity, the sequence is one-hot encoded and passed through a network with 2 recurrent layers of $seq\_length^2$ neurons each, followed by a linear output layer that predicts the next sequence item.

TODO:
- [ ] enable teacher forcing randomization
- [ ] perform validation during the training procedure
- [ ] improve the code quality
- [x] work in batches
- [x] check border conditions
- [ ] improve progress reporting
- [x] try tensorboard output
- [x] switch to DVC-based experimentation
- [ ] implement sequence padding to enable variable sequence length capability in the model

In [2]:
# Read the experiment hyper-parameters from the JSON file next to the notebook.
config = {}
with open("00-reversal-config.json", 'r') as ch:
    config = json.loads(ch.read())

# Draw random token ids from [0, VOCAB_SIZE - 2); the two highest ids are used
# as the SOS / EOS markers later in the notebook. The training target is the
# same sequence flipped along the time axis.
src_sequences = torch.randint(
    high=config['VOCAB_SIZE'] - 2,
    size=(config['SAMPLES'], config['SEQ_LENGTH']),
)
reversed_sequences = torch.flip(src_sequences, dims=(1,))

# Preview the first ten source sequences, then their reversed counterparts.
for preview in (src_sequences, reversed_sequences):
    print(preview[:10])

tensor([[2, 0, 2, 0, 1, 0, 1, 1, 1, 0],
        [2, 2, 0, 0, 1, 2, 0, 0, 2, 0],
        [2, 2, 2, 2, 2, 2, 1, 2, 0, 0],
        [1, 0, 2, 0, 1, 0, 1, 2, 2, 1],
        [2, 0, 1, 1, 1, 2, 1, 1, 2, 1],
        [1, 2, 2, 1, 2, 2, 2, 1, 2, 2],
        [1, 1, 0, 0, 1, 2, 1, 1, 2, 1],
        [2, 2, 0, 1, 0, 2, 0, 1, 1, 2],
        [2, 2, 2, 1, 0, 2, 1, 1, 1, 2],
        [1, 1, 1, 1, 1, 2, 1, 1, 2, 1]])
tensor([[0, 1, 1, 1, 0, 1, 0, 2, 0, 2],
        [0, 2, 0, 0, 2, 1, 0, 0, 2, 2],
        [0, 0, 2, 1, 2, 2, 2, 2, 2, 2],
        [1, 2, 2, 1, 0, 1, 0, 2, 0, 1],
        [1, 2, 1, 1, 2, 1, 1, 1, 0, 2],
        [2, 2, 1, 2, 2, 2, 1, 2, 2, 1],
        [1, 2, 1, 1, 2, 1, 0, 0, 1, 1],
        [2, 1, 1, 0, 2, 0, 1, 0, 2, 2],
        [2, 1, 1, 1, 2, 0, 1, 2, 2, 2],
        [1, 2, 1, 1, 2, 1, 1, 1, 1, 1]])


In [3]:
# One-hot encode every token: start from an all-zero (SAMPLES, SEQ_LENGTH, VOCAB_SIZE)
# tensor and write 1.0 at the vocabulary position named by each token id.
one_hot_shape = (config['SAMPLES'], config['SEQ_LENGTH'], config['VOCAB_SIZE'])
token_positions = src_sequences.unsqueeze(2)
src_sequences_one_hot = torch.zeros(one_hot_shape).scatter(2, token_positions, 1.0)

In [4]:
class ReverseEncoder(nn.Module):
    """Recurrent encoder that reads a one-hot encoded sequence.

    The encoder is a stack of ``rec_layers_count`` tanh RNN layers, each with
    ``seq_length ** 2`` hidden units, operating on batch-first input.

    Args:
        seq_length: length of the sequences; determines the hidden size.
        vocab_size: size of the one-hot vectors fed to the RNN.
        rec_layers_count: number of stacked recurrent layers.
    """

    def __init__(self, seq_length, vocab_size, rec_layers_count):
        super().__init__()
        self.seq_length = seq_length
        self.vocab_size = vocab_size
        self.rec_layers_count = rec_layers_count
        self.rec_layers = nn.RNN(
            input_size = vocab_size,
            hidden_size = seq_length**2,
            nonlinearity = "tanh",
            num_layers = rec_layers_count,
            batch_first = True
        )

    def init_hidden_state(self, input_sequence):
        """Draw a random initial hidden state sized for ``input_sequence``.

        The state has shape ``(rec_layers_count, batch, seq_length ** 2)``
        where ``batch`` is ``input_sequence.size(0)``. It is created with the
        dtype and device of the input so the encoder keeps working after the
        model and data are moved (e.g. ``.double()`` or ``.to(device)``).

        Note that the state is sampled from a normal distribution on every
        call, so two encodings of the same input generally differ.
        """
        return torch.randn(
            (self.rec_layers_count, input_sequence.size(0), self.seq_length**2),
            dtype = input_sequence.dtype,
            device = input_sequence.device,
        )

    def forward(self, input_sequence):
        """Run the RNN over ``input_sequence``.

        Returns:
            A ``(post_recurrent, hidden)`` tuple: the per-step outputs of the
            last recurrent layer and the final hidden state of every layer.
        """
        post_recurrent, hidden = self.rec_layers(input_sequence, self.init_hidden_state(input_sequence))

        return post_recurrent, hidden
    
class ReverseDecoder(nn.Module):
    """Recurrent decoder that emits log-probabilities over the vocabulary.

    A stack of ``rec_layers_count`` tanh RNN layers with ``seq_length ** 2``
    hidden units each is followed by a linear projection onto ``vocab_size``
    scores, which are normalised with a log-softmax.
    """

    def __init__(self, seq_length, vocab_size, rec_layers_count):
        super(ReverseDecoder, self).__init__()
        hidden_units = seq_length * seq_length

        self.seq_length = seq_length
        self.vocab_size = vocab_size
        self.rec_layers_count = rec_layers_count

        # Recurrent stack first, projection second (same creation order as before,
        # so seeded weight initialisation is unaffected).
        self.rec_layers = nn.RNN(
            vocab_size,
            hidden_units,
            num_layers=rec_layers_count,
            nonlinearity="tanh",
            batch_first=True,
        )
        self.output = nn.Linear(hidden_units, vocab_size)

    def forward(self, input_sequence, hidden_state):
        """Advance the decoder from ``hidden_state`` over ``input_sequence``.

        Returns:
            ``(item_probs, hidden)`` — log-softmax scores taken over dimension 2
            of the projected RNN outputs, and the updated hidden state.
        """
        recurrent_out, next_hidden = self.rec_layers(input_sequence, hidden_state)
        scores = self.output(recurrent_out)
        return F.log_softmax(scores, dim=2), next_hidden

In [5]:
# Build the encoder / decoder pair; both use two stacked recurrent layers.
enc_model = ReverseEncoder(
    seq_length=config['SEQ_LENGTH'],
    vocab_size=config['VOCAB_SIZE'],
    rec_layers_count=2,
)
dec_model = ReverseDecoder(
    seq_length=config['SEQ_LENGTH'],
    vocab_size=config['VOCAB_SIZE'],
    rec_layers_count=2,
)

In [6]:
# Print the module structure of the encoder and the decoder.
print(enc_model)
print(dec_model)

ReverseEncoder(
  (rec_layers): RNN(5, 100, num_layers=2, batch_first=True)
)
ReverseDecoder(
  (rec_layers): RNN(5, 100, num_layers=2, batch_first=True)
  (output): Linear(in_features=100, out_features=5, bias=True)
)


In [7]:
# The decoder emits log-probabilities, hence the negative log-likelihood loss.
loss_function = nn.NLLLoss()

# A single Adam optimizer updates the parameters of both models.
optimizer = torch.optim.Adam(
    [param for module in (enc_model, dec_model) for param in module.parameters()],
    lr = config['LEARNING_RATE']
)

# TensorBoard summary writer (default log directory).
writer = SummaryWriter()

In [8]:
# The two highest vocabulary ids are reserved as start / end-of-sequence markers.
SOS = torch.tensor(config['VOCAB_SIZE'] - 2)
EOS = torch.tensor(config['VOCAB_SIZE'] - 1)

# One-hot vectors for the two markers.
SOS_filler = torch.zeros(config['VOCAB_SIZE'])
SOS_filler[config['VOCAB_SIZE'] - 2] = 1.0
EOS_filler = torch.zeros(config['VOCAB_SIZE'])
EOS_filler[config['VOCAB_SIZE'] - 1] = 1.0

In [9]:
def chunks(l, n):
    """Lazily split ``l`` into consecutive slices of at most ``n`` items.

    Every slice has ``n`` items except possibly the last one, which holds
    whatever remains.
    """
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)

In [10]:
# Train the encoder/decoder pair and record the per-batch loss of every epoch
# in `losses` (epoch index -> list of floats). `tdiff` holds the wall-clock
# duration of the whole training run.
losses = {}
tstart = datetime.now()
for epoch in range(config['EPOCHS']):
    losses[epoch] = []
    chunked_X = chunks(src_sequences_one_hot, config['BATCH_SIZE'])
    chunked_y = chunks(reversed_sequences, config['BATCH_SIZE'])
    input_chunks = zip(chunked_X, chunked_y)
    with tqdm(list(input_chunks)) as cit:
        for chunk_X, chunk_y in cit:
            batch_size = chunk_X.size(0)

            # Targets: the reversed sequence followed by an EOS column. The decoding
            # loop below runs chunk_y.size(1) steps, so the EOS column is not reached.
            y_eos = EOS.unsqueeze(0).repeat(batch_size).view(-1, 1)
            yss = torch.cat((chunk_y, y_eos), dim=1)

            # Decoder inputs: an SOS one-hot step followed by the source sequence.
            # The batch axis must be kept as is: squeezing it would collapse a
            # single-sample chunk (SAMPLES % BATCH_SIZE == 1) and break the cat.
            x_sos = SOS_filler.unsqueeze(0).repeat(batch_size, 1).unsqueeze(1)
            Xss = torch.cat((x_sos, chunk_X), dim=1)

            # Only the encoder's final hidden state is handed to the decoder.
            _, hidden = enc_model(chunk_X)

            # Decode one step at a time and accumulate the loss over all steps.
            loss = 0
            for slice_id in range(chunk_y.size(1)):
                y_pred, hidden = dec_model(Xss[:, slice_id].unsqueeze(1), hidden)
                cur_loss = loss_function(y_pred.squeeze(1), yss[:, slice_id])
                loss += cur_loss

            enc_model.zero_grad()
            dec_model.zero_grad()
            loss.backward()
            optimizer.step()

            losses[epoch].append(loss.item())
            cit.set_postfix({
                'epoch': f"{epoch+1}/{config['EPOCHS']}", 
                'mean_loss': sum(losses[epoch])/len(losses[epoch]),
                'last_loss': losses[epoch][-1]
            })

tend = datetime.now()
tdiff = tend - tstart

100%|██████████| 16/16 [00:00<00:00, 30.21it/s, epoch=1/35, mean_loss=12.7, last_loss=11.2]
100%|██████████| 16/16 [00:00<00:00, 35.61it/s, epoch=2/35, mean_loss=11, last_loss=11]   
100%|██████████| 16/16 [00:00<00:00, 43.78it/s, epoch=3/35, mean_loss=10.9, last_loss=10.7]
100%|██████████| 16/16 [00:00<00:00, 41.44it/s, epoch=4/35, mean_loss=10.3, last_loss=9.78]
100%|██████████| 16/16 [00:00<00:00, 25.98it/s, epoch=5/35, mean_loss=9, last_loss=7.87]   
100%|██████████| 16/16 [00:00<00:00, 33.75it/s, epoch=6/35, mean_loss=6.84, last_loss=5.48]
100%|██████████| 16/16 [00:00<00:00, 40.92it/s, epoch=7/35, mean_loss=4.05, last_loss=2.86]
100%|██████████| 16/16 [00:00<00:00, 43.48it/s, epoch=8/35, mean_loss=1.64, last_loss=0.985]
100%|██████████| 16/16 [00:00<00:00, 44.21it/s, epoch=9/35, mean_loss=0.562, last_loss=0.38] 
100%|██████████| 16/16 [00:00<00:00, 44.38it/s, epoch=10/35, mean_loss=0.254, last_loss=0.173]
100%|██████████| 16/16 [00:00<00:00, 43.95it/s, epoch=11/35, mean_loss=0.15

In [11]:
def reverse_sequence(seq, model):
    """Predict the reversal of ``seq`` with an (encoder, decoder) pair.

    Args:
        seq: sequence of integer token ids, each below ``config['VOCAB_SIZE']``.
        model: indexable pair; ``model[0]`` is the encoder, ``model[1]`` the decoder.

    Returns:
        A list of ``len(seq)`` predicted token ids (Python ints).

    The decoder is fed the SOS one-hot vector first and then the source tokens
    one by one; the prediction made after the last source token is discarded.
    The one-hot buffer is sized from ``seq`` itself, so inputs whose length
    differs from ``config['SEQ_LENGTH']`` are neither padded with all-zero
    steps nor rejected by ``scatter``.
    """
    token_ids = torch.tensor(seq).unsqueeze(0).unsqueeze(-1)
    seq_one_hot = torch.zeros((1, token_ids.size(1), config['VOCAB_SIZE'])).scatter(2, token_ids, 1.0)
    result = []
    # Pure inference: no gradients are needed, so skip building the autograd graph.
    with torch.no_grad():
        (_, hidden) = model[0](seq_one_hot)
        out, hidden = model[1](SOS_filler.unsqueeze(0).unsqueeze(0), hidden)
        result.append(torch.argmax(out, dim=2).item())
        for seq_char in seq_one_hot.squeeze(0):
            out, hidden = model[1](seq_char.unsqueeze(0).unsqueeze(0), hidden)
            result.append(torch.argmax(out, dim=2).item())
    return result[:-1]

In [12]:
def const_generator(character_to_generate):
    """Yield ``character_to_generate`` endlessly."""
    yield from itertools.repeat(character_to_generate)

def up_stairs_generator(cap = config['VOCAB_SIZE'] - 2, cur_character=0):
    """Endlessly count upwards, wrapping to 0 after ``cap`` has been yielded.

    The first value produced is ``cur_character + 1`` (or 0 if that equals
    ``cap + 1``); the starting value itself is not yielded.
    """
    level = cur_character
    while True:
        following = level + 1
        level = 0 if following == cap + 1 else following
        yield level
        
def down_stairs_generator(cap = config['VOCAB_SIZE'] - 2, cur_character=0):
    """Endlessly count downwards, jumping back to ``cap`` once below zero.

    The first value produced is ``cur_character - 1`` (or ``cap`` if that is
    negative); the starting value itself is not yielded.
    """
    level = cur_character
    while True:
        following = level - 1
        level = cap if following < 0 else following
        yield level
        
def two_way_stairs_generator(cap = config['VOCAB_SIZE'] - 2, cur_character=0):
    """Endlessly walk up to ``cap`` and back down, one step at a time.

    The walk starts by moving upwards from ``cur_character``. Values at or
    above ``cap`` are clamped to ``cap`` and flip the direction downwards;
    values at or below zero flip the direction upwards again.
    """
    position, step = cur_character, 1
    while True:
        position = position + step
        if position >= cap:
            # Hit the ceiling: clamp and head down.
            position, step = cap, -1
        if position <= 0:
            # Hit the floor: head up.
            step = 1
        yield position



# Build hand-crafted evaluation sequences of SEQ_LENGTH items each.
test_cases = []

# For every pattern family, one sequence per value in range(VOCAB_SIZE); the value
# is passed as the first argument of the generator (the constant or the cap).
for make_generator in (
    const_generator,
    up_stairs_generator,
    down_stairs_generator,
    two_way_stairs_generator,
):
    for el in range(config['VOCAB_SIZE']):
        test_cases.append(list(itertools.islice(make_generator(el), config['SEQ_LENGTH'])))

# Two-way stairs that start at their own cap, for caps VOCAB_SIZE - 2 down to 1.
for el in range(config['VOCAB_SIZE'] - 2, 0, -1):
    stairs = two_way_stairs_generator(el, cur_character = el)
    test_cases.append(list(itertools.islice(stairs, config['SEQ_LENGTH'])))


In [13]:
from difflib import SequenceMatcher

# Score each test case by how closely the model's output, flipped back, matches
# the original sequence (SequenceMatcher ratio), then average over all cases.
scores = []
for test_case in test_cases:
    predicted_reversal = reverse_sequence(test_case, (enc_model, dec_model))
    restored = predicted_reversal[::-1]
    scores.append(SequenceMatcher(a = test_case, b = restored).ratio())
score = sum(scores)/len(scores)

In [14]:
# Persist the evaluation score and the training duration (in seconds) as JSON.
# json.dump returns None, so its result must not be assigned to `config`:
# doing so would wipe the loaded configuration and break any cell re-run afterwards.
with open("./00-seq-reversal-score.json", 'w') as ch:
    json.dump({"score": score, "training_time": tdiff.total_seconds()}, ch)