# Sequence to Sequence

## Loading Libraries

In [1]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib
import matplotlib_inline
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
import matplotlib.patches as patches

# Dataset's Iteration Performance
from tqdm import tqdm

# Time
import time

# OS
import re
import sys
import json
import string
import unicodedata
from glob import glob
from io import BytesIO
from imageio import imread
from zipfile import ZipFile
import requests, zipfile, io
from urllib.request import urlopen



# SciPy
from scipy.signal import convolve

# PyTorch
import torch
import torchvision
import torch.nn as nn
from torch.utils.data import *
from torchvision.ops import nms
import torch.nn.functional as F
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator


# IDLMAM Libraries
from idlmam import moveTo, run_epoch, set_seed, View, pad_and_pack
from idlmam import train_simple_network, set_seed, Flatten, weight_reset, train_network
from idlmam import LanguageNameDataset, pad_and_pack, EmbeddingPackable, LastTimeStep, LambdaLayer
from idlmam import AttentionAvg, GeneralScore, DotScore, AdditiveAttentionScore, ApplyAttention, getMaskByFill


# Scikit-Learn
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

#  IPython Display
from IPython.display import Latex
from IPython.display import display_pdf
from IPython.display import set_matplotlib_formats

  from .autonotebook import tqdm as notebook_tqdm


### Visualization Set-Up

In [2]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

# Request both PNG and PDF renderings for inline figures
matplotlib_inline.backend_inline.set_matplotlib_formats('png', 'pdf')

### Setting Seeds

In [3]:
# Ask cuDNN to select deterministic algorithms
torch.backends.cudnn.deterministic=True

# Seed through the idlmam helper (which generators it seeds is not visible here)
set_seed(42)

In [4]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Sequence-to-Sequence as a Kind of Denoising AutoEncoder

### Loading Data: A Small English-French Dataset

In [5]:
# Batch Size (passed as batch_size to both DataLoaders)
B = 128

# Epochs (reassigned to 20 in the cell that builds the model)
epochs = 10

In [6]:
# Download the tutorial archive and collect (source, target) sentence pairs.
all_data = []

# Read the whole archive into memory and close the HTTP response right away.
with urlopen("https://download.pytorch.org/tutorial/data.zip") as resp:
    archive_bytes = BytesIO(resp.read())

# The archive gets its own name so the imported `zipfile` module is not shadowed,
# and both the archive and the member file are closed when the block exits.
with ZipFile(archive_bytes) as archive, archive.open("data/eng-fra.txt") as pairs_file:
    for line in pairs_file:
        line = line.decode('utf-8').lower()
        # Replace every run of '-', '.', '!' or '?' with a single space
        line = re.sub(r"[-.!?]+", r" ", line)
        # Fields are tab-separated; only the first two (the sentence pair) are kept
        source_lang, target_lang = line.split("\t")[0:2]
        all_data.append((source_lang.strip(), target_lang.strip()))

In [7]:
# Peek at the first ten (source, target) pairs
for i in range(10):
    print(all_data[i])

('go', 'va')
('run', 'cours')
('run', 'courez')
('wow', 'ça alors')
('fire', 'au feu')
('help', "à l'aide")
('jump', 'saute')
('stop', 'ça suffit')
('stop', 'stop')
('stop', 'arrête toi')


In [8]:
# Largest number of space-separated tokens allowed in either sentence of a pair
MAX_LEN = 6

# Keep a pair only when both of its sentences fit within the token budget
short_subset = [
    (src, tgt)
    for (src, tgt) in all_data
    if len(src.split(" ")) <= MAX_LEN and len(tgt.split(" ")) <= MAX_LEN
]

print("Using ", len(short_subset), "/", len(all_data))

Using  66251 / 135842


Building the Vocabulary (word-to-index mapping):

In [9]:
# Special tokens: start-of-sentence, end-of-sentence and padding
SOS_token = "<SOS>"
EOS_token = "<EOS>"
PAD_token = "_PADDING_"

# The special tokens take ids 0-2; every other word receives the next free id
# the first time it is encountered (source sentence first, then target).
word2indx = {PAD_token: 0, SOS_token: 1, EOS_token: 2}
for src, tgt in short_subset:
    for sentence in (src, tgt):
        for token in sentence.split(" "):
            word2indx.setdefault(token, len(word2indx))

print("Size of Vocab: ", len(word2indx))

# Reverse lookup: id -> word
indx2word = {idx: token for token, idx in word2indx.items()}

Size of Vocab:  24577


Implementing a Translation Dataset:

In [10]:
class TranslationDataset(Dataset):
    """Sentence pairs served as tensors of word ids.

    Args:
        lang_pairs: indexable collection of (source, target) sentence strings.
        word2indx: mapping from each space-separated word to its integer id.
    """

    def __init__(self, lang_pairs, word2indx):
        self.lang_pairs = lang_pairs
        self.word2indx = word2indx

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.lang_pairs)

    def _to_ids(self, words):
        """Look up every word and return the ids as an int64 tensor."""
        return torch.tensor([self.word2indx[w] for w in words], dtype=torch.int64)

    def __getitem__(self, idx):
        """Return (source_ids, target_ids) for pair `idx`.

        The source is wrapped as SOS ... EOS; the target only gets a trailing EOS.
        """
        source, target = self.lang_pairs[idx]
        source_words = f"{SOS_token} {source} {EOS_token}".split(" ")
        target_words = f"{target} {EOS_token}".split(" ")
        return self._to_ids(source_words), self._to_ids(target_words)

# Wrap the length-filtered sentence pairs in a dataset that encodes words as ids
bigdataset = TranslationDataset(short_subset, word2indx)

Implementing a Collate Function for Translation Data:

In [11]:
# Re-seed right before the random train/test split
set_seed(42)

In [12]:
# 90% of the pairs go to training; the remainder forms the test set
train_size = round(0.9 * len(bigdataset))
test_size = len(bigdataset) - train_size

train_dataset, test_dataset = torch.utils.data.random_split(
    bigdataset, lengths=[train_size, test_size]
)

In [13]:
def pad_batch(batch):
    """Collate (source, target) id tensors into right-padded batch tensors.

    Args:
        batch: list of (x, y) pairs of 1-D tensors, as produced by the dataset.

    Returns:
        ((X, Y), Y): X and Y are the sources and targets, each padded on the
        right with the padding id to the longest sequence in the batch and
        stacked along a new first dimension. Y appears both as part of the
        model input and as the label.
    """
    sources = [pair[0] for pair in batch]
    targets = [pair[1] for pair in batch]
    longest_source = max(seq.size(0) for seq in sources)
    longest_target = max(seq.size(0) for seq in targets)

    pad_id = word2indx[PAD_token]

    X = torch.stack([F.pad(seq, (0, longest_source - seq.size(0)), value=pad_id) for seq in sources])
    Y = torch.stack([F.pad(seq, (0, longest_target - seq.size(0)), value=pad_id) for seq in targets])

    return (X, Y), Y

In [14]:
# Data Loaders: both collate with pad_batch; only the training loader shuffles
train_loader = DataLoader(train_dataset, batch_size=B, shuffle=True, collate_fn=pad_batch)

test_loader = DataLoader(test_dataset, batch_size=B, collate_fn=pad_batch)

## Seq2Seq with Attention

### Implementing Seq2Seq

In [15]:
class Seq2SeqAttention(nn.Module):
    """Sequence-to-sequence model: bidirectional GRU encoder, attention GRUCell decoder.

    Args:
        num_embeddings: vocabulary size; rows of the embedding table (shared by
            encoder and decoder) and width of the output logits.
        embd_size: embedding dimension.
        hidden_size: decoder hidden size. Each encoder direction uses
            ``hidden_size // 2`` units, so an even value is expected.
        padding_idx: id of the padding token, forwarded to ``nn.Embedding``.
        layers: number of encoder GRU layers and of decoder GRUCell layers.
        max_decode_length: upper bound on decoding steps when no target is given.
    """

    def __init__(self, num_embeddings, embd_size, hidden_size, padding_idx=None, layers=1, max_decode_length=20):
        super(Seq2SeqAttention, self).__init__()
        self.padding_idx = padding_idx
        self.hidden_size = hidden_size
        self.embd = nn.Embedding(num_embeddings, embd_size, padding_idx=padding_idx)

        # Two directions of hidden_size//2 concatenate to hidden_size features per step
        self.encode_layers = nn.GRU(input_size=embd_size, hidden_size=hidden_size//2,
                                       num_layers=layers, bidirectional=True)

        # First decoder cell consumes an embedding; the remaining ones consume hidden states
        self.decode_layers = nn.ModuleList([nn.GRUCell(embd_size, hidden_size)] +
                                     [nn.GRUCell(hidden_size, hidden_size) for i in range(layers-1)])
        self.score_net = DotScore(hidden_size)

        # Maps [attention context ; decoder state] to a score per vocabulary entry
        self.predict_word = nn.Sequential(
            nn.Linear(2*hidden_size, hidden_size),
            nn.LeakyReLU(),
            nn.LayerNorm(hidden_size),
            nn.Linear(hidden_size, hidden_size),
            nn.LeakyReLU(),
            nn.LayerNorm(hidden_size),
            nn.Linear(hidden_size, num_embeddings)
        )
        self.max_decode_length = max_decode_length
        self.apply_attn = ApplyAttention()

    def forward(self, input):
        """Encode the source batch and decode one word per step.

        Args:
            input: either a (B, T) tensor of source word ids, or a tuple
                ``(source_ids, target_ids)``. When a target is supplied, the
                number of decoding steps is ``target.size(1)``; otherwise it is
                ``min(max_decode_length, T)``.

        Returns:
            In training mode: logits stacked over steps, (B, steps, num_embeddings).
            In eval mode: ``(logits, attention)`` where ``attention`` is the
            per-step attention weights stacked along dim 1 and then passed
            through ``squeeze()`` (all size-1 dimensions are dropped, including
            a batch dimension of one).
        """
        if isinstance(input, tuple):
            input, target = input
        else:
            target = None
        B = input.size(0)
        T = input.size(1)

        x = self.embd(input)  # (B, T, embd_size)

        device = x.device

        # NOTE(review): presumably a (B, T) boolean mask of non-padding steps
        # (relies on padding embeddings being all-zero) -- confirm against idlmam.
        mask = getMaskByFill(x)

        # Summing the mask over time is used as the per-example sequence length
        seq_lengths = mask.sum(dim=1).view(-1)
        x_packed = pack_padded_sequence(x, seq_lengths.cpu(), batch_first=True, enforce_sorted=False)
        h_encoded, h_last = self.encode_layers(x_packed)
        # Unpack batch-first so the result is (B, T, D). The default layout is
        # time-major (T, B, D); reinterpreting that with view(B, T, -1) would mix
        # different examples together whenever B > 1. total_length keeps the time
        # axis aligned with `mask`.
        h_encoded, _ = pad_packed_sequence(h_encoded, batch_first=True, total_length=T)

        hidden_size = h_encoded.size(2)
        # Keep the last encoder layer, then join its two directions: (B, hidden_size)
        h_last = h_last.view(-1, 2, B, hidden_size//2)[-1,:,:,:]
        h_last = h_last.permute(1, 0, 2).reshape(B, -1)
        # Every decoder layer starts from the encoder's final state
        h_prevs = [h_last for l in range(len(self.decode_layers))]

        all_attentions = []
        all_predictions = []

        # The first decoder input is the embedding of each source's last real token
        decoder_input = self.embd(input.gather(1,seq_lengths.view(-1,1)-1).flatten())

        steps = min(self.max_decode_length, T)
        if target is not None:
            steps = target.size(1)

        # One coin flip per forward pass decides whether teacher forcing is used
        teacher_forcing = np.random.choice((True,False))
        for t in range(steps):
            x_in = decoder_input

            # Run the stacked decoder cells, updating each layer's hidden state
            for l in range(len(self.decode_layers)):
                h_prev = h_prevs[l]
                h = self.decode_layers[l](x_in, h_prev)

                h_prevs[l] = h
                x_in = h
            h_decoder = x_in

            # Attend over the encoder states using the top decoder state
            scores = self.score_net(h_encoded, h_decoder)
            context, weights = self.apply_attn(h_encoded, scores, mask=mask)

            all_attentions.append( weights.detach() )
            word_pred = torch.cat((context, h_decoder), dim=1)
            word_pred = self.predict_word(word_pred)
            all_predictions.append(word_pred)

            # Choosing the next input word is not part of the differentiable graph
            with torch.no_grad():
                if self.training:
                    if target is not None and teacher_forcing:
                        # target[:, t] is already (B,); squeezing it would produce
                        # a 0-d tensor for a batch of one.
                        next_words = target[:, t]
                    else:
                        # Sample the next word from the predicted distribution
                        next_words = torch.multinomial(F.softmax(word_pred, dim=1), 1)[:,-1]
                else:
                    # Greedy decoding at evaluation time
                    next_words = torch.argmax(word_pred, dim=1)

            decoder_input = self.embd(next_words.to(device))

        if self.training:
            return torch.stack(all_predictions, dim=1)
        else:
            return torch.stack(all_predictions, dim=1), torch.stack(all_attentions, dim=1).squeeze()

### Training & Evaluation 

In [16]:
# Re-seed right before the model is constructed
set_seed(42)

In [17]:
# Number of training epochs for the attention model
epochs = 20

# 64-d embeddings, 256-d hidden state, three layers; decoding may run two
# steps past MAX_LEN when no target is supplied.
seq2seq = Seq2SeqAttention(
    num_embeddings=len(word2indx),
    embd_size=64,
    hidden_size=256,
    padding_idx=word2indx[PAD_token],
    layers=3,
    max_decode_length=MAX_LEN + 2,
)

# Clamp every gradient element to [-10, 10] as it is computed
for param in seq2seq.parameters():
    param.register_hook(lambda grad: torch.clamp(grad, -10, 10))

Loss Function:

In [18]:
def CrossEntLossTime(x, y):
    """Sum of per-time-step cross-entropy losses, ignoring padding targets.

    Args:
        x: logits of shape (B, T, V), or a 2-tuple whose first element is the
            logits (the second element is discarded).
        y: target ids of shape (B, T').

    Returns:
        The sum over the first min(T, T') steps of the mean cross-entropy at
        that step, with padding targets excluded. Returns the integer 0 when
        there are no steps.
    """
    if isinstance(x, tuple):
        logits, _ = x
    else:
        logits = x

    step_loss = nn.CrossEntropyLoss(ignore_index=word2indx[PAD_token])
    n_steps = min(logits.size(1), y.size(1))

    return sum(step_loss(logits[:, t, :], y[:, t]) for t in range(n_steps))

In [20]:
# Train the model. The returned results are what the next cell plots, so this
# call must run for `seq2seq_results` to exist on a fresh kernel.
seq2seq_results = train_network(
    seq2seq,
    CrossEntLossTime,
    train_loader,
    epochs=epochs,
    device=device,
)

In [21]:
# Plot the 'train loss' column of the training results against 'epoch'
sns.lineplot(x='epoch', y='train loss', data=seq2seq_results, label='Seq2Seq')

plt.grid(True)
plt.show()

Visualizing Attention Score Maps:

In [22]:
def plot_heatmap(src, trg, scores):
    """Draw a grayscale heatmap of attention scores.

    Args:
        src: labels for the rows (y axis), one per row of `scores`.
        trg: labels for the columns (x axis), one per column of `scores`.
        scores: 2-D array whose `shape` is (number of rows, number of columns).
    """
    fig, ax = plt.subplots()
    heatmap = ax.pcolor(scores, cmap='gray')

    # Fix the tick positions at the cell centres before labelling them: tick
    # labels are only well defined once the tick locations are fixed.
    ax.set_xticks(np.arange(scores.shape[1]) + 0.5, minor=False)
    ax.set_yticks(np.arange(scores.shape[0]) + 0.5, minor=False)
    ax.set_xticklabels(trg, minor=False, rotation='vertical')
    ax.set_yticklabels(src, minor=False)

    # Column labels go on top, and the first row is drawn at the top
    ax.xaxis.tick_top()
    ax.invert_yaxis()

    plt.colorbar(heatmap)
    plt.show()

Sequence to Sequence Model Evaluation:

In [23]:
# Switch to evaluation mode and move the model to the CPU for the single-example demos
seq2seq = seq2seq.eval().cpu()

def results(indx):
    """Translate test example `indx`, print the outcome and plot its attention map.

    Prints the input sentence, the reference target and the model's prediction
    (argmax over the output scores at every step), then passes the transposed
    attention weights to `plot_heatmap` with the input words as row labels and
    the predicted words as column labels.
    """
    def decode(ids):
        # Turn a tensor of word ids back into a space-joined sentence
        return " ".join([indx2word[i] for i in ids.cpu().numpy()])

    source_ids, target_ids = test_dataset[indx]
    source_text = decode(source_ids)
    target_text = decode(target_ids)
    print("Input:     ", source_text)
    print("Target:    ", target_text)

    # A batch dimension of one is added because the model expects batched input
    with torch.no_grad():
        scores, attention = seq2seq(source_ids.unsqueeze(0))
        predicted_ids = torch.argmax(scores, dim=2)
    predicted_text = decode(predicted_ids[0, :])
    print("Predicted: ", predicted_text)

    plot_heatmap(source_text.split(" "), predicted_text.split(" "), attention.T.cpu().numpy())

In [24]:
# Translate test example 12 and plot its attention map
results(12) 

In [25]:
# Translate test example 13 and plot its attention map
results(13) 

In [26]:
# Translate test example 16 and plot its attention map
results(16) 

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=5ce069ec-7808-4ddd-a8a7-20abad0ac4e2' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>