In [1]:
!pip install -q torchdata==0.3.0 torchtext==0.12 spacy==3.2 altair GPUtil
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm

[K     |████████████████████████████████| 47 kB 4.0 MB/s 
[K     |████████████████████████████████| 10.4 MB 35.4 MB/s 
[K     |████████████████████████████████| 6.0 MB 44.3 MB/s 
[K     |████████████████████████████████| 140 kB 54.5 MB/s 
[K     |████████████████████████████████| 750.6 MB 11 kB/s 
[K     |████████████████████████████████| 10.1 MB 56.0 MB/s 
[K     |████████████████████████████████| 660 kB 58.3 MB/s 
[K     |████████████████████████████████| 127 kB 62.6 MB/s 
[?25h  Building wheel for GPUtil (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.13.1+cu113 requires torch==1.12.1, but you have torch 1.11.0 which is incompatible.
torchaudio 0.12.1+cu113 requires torch==1.12.1, but you have torch 1.11.0 which is incompatible.
en-core-web-sm 3.4.0 requires spacy<3.5.0,>=3.4.0, but you have spacy 3

In [2]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP

# Suppress all warnings raised through the `warnings` module.
warnings.filterwarnings("ignore")
# Set to False to skip notebook execution (e.g. for debugging)
RUN_EXAMPLES = True

In [3]:
def is_interactive_notebook():
  """Return True when this code is executing as the ``__main__`` module."""
  running_as_main = "__main__" == __name__
  return running_as_main

def show_example(fn, args = []):
  """Call ``fn(*args)`` and return its result, if examples are enabled.

  The call only happens when the module runs as ``__main__`` and
  ``RUN_EXAMPLES`` is truthy; otherwise nothing is called and None is
  returned.
  """
  if __name__ != "__main__" or not RUN_EXAMPLES:
    return None
  return fn(*args)

def excute_example(fn, args = []):
  """Call ``fn(*args)`` for its side effects, if examples are enabled.

  The call only happens when the module runs as ``__main__`` and
  ``RUN_EXAMPLES`` is truthy. The result of ``fn`` is discarded; this
  function always returns None.
  """
  if __name__ != "__main__" or not RUN_EXAMPLES:
    return
  fn(*args)

class DummyOptimizer(torch.optim.Optimizer):
  """No-op stand-in for an optimizer.

  Exposes a single parameter group with a learning rate of 0 and does
  nothing on ``step`` / ``zero_grad``. ``torch.optim.Optimizer.__init__``
  is not called.
  """

  def __init__(self):
    # Single placeholder parameter group with learning rate 0.
    self.param_groups = [dict(lr = 0)]

  def step(self):
    """Do nothing and return None."""
    return None

  def zero_grad(self, set_to_none = False):
    """Do nothing and return None; ``set_to_none`` is accepted but ignored."""
    return None

class DummyScheduler:
  """No-op stand-in for a learning-rate scheduler."""

  def step(self):
    """Do nothing and return None."""
    return None

In [4]:
class EncoderDecoder(nn.Module):
  """Container that wires an encoder and a decoder together.

  Args:
    encoder: callable invoked as ``encoder(embedded_src, src_mask)``.
    decoder: callable invoked as
      ``decoder(embedded_tgt, memory, src_mask, tgt_mask)``.
    src_embed: callable applied to ``src`` before encoding.
    tgt_embed: callable applied to ``tgt`` before decoding.
    generator: stored on the module; not applied by ``forward``.
  """

  def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
    super(EncoderDecoder, self).__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.src_embed = src_embed
    self.tgt_embed = tgt_embed
    self.generator = generator

  def forward(self, src, tgt, src_mask, tgt_mask):
    """Encode ``src`` and decode ``tgt`` against it.

    Returns the decoder output; ``self.generator`` is not applied here.
    """
    # Go through ``encode`` (not ``self.encoder`` directly) so the source
    # is passed through ``src_embed`` before reaching the encoder.
    memory = self.encode(src, src_mask)
    return self.decode(memory, src_mask, tgt, tgt_mask)

  def encode(self, src, src_mask):
    """Embed ``src`` with ``src_embed`` and run the encoder on it."""
    return self.encoder(self.src_embed(src), src_mask)

  def decode(self, memory, src_mask, tgt, tgt_mask):
    """Embed ``tgt`` with ``tgt_embed`` and run the decoder over ``memory``."""
    return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

In [5]:
class Generator(nn.Module):
  """Linear projection followed by a log-softmax over the last dimension.

  Args:
    d_model: size of the last dimension of the input.
    vocab: size of the last dimension of the output.
  """

  def __init__(self, d_model, vocab):
    super().__init__()
    self.proj = nn.Linear(d_model, vocab)

  def forward(self, x):
    """Return log-probabilities computed from ``x`` along the last axis."""
    logits = self.proj(x)
    return log_softmax(logits, dim = -1)

In [6]:
def clones(module, N):
  """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
  replicas = nn.ModuleList()
  for _ in range(N):
    replicas.append(copy.deepcopy(module))
  return replicas

In [7]:
class Encoder(nn.Module):
  """Stack of ``N`` copies of ``layer`` followed by a final ``LayerNorm``.

  Args:
    layer: module to replicate; must expose a ``size`` attribute, which is
      passed to ``LayerNorm``.
    N: number of copies to stack.
  """

  def __init__(self, layer, N):
    super().__init__()
    self.layers = clones(layer, N)
    self.norm = LayerNorm(layer.size)

  def forward(self, x, mask):
    """Feed ``x`` (with ``mask``) through every layer in order, then normalize."""
    hidden = x
    for block in self.layers:
      hidden = block(hidden, mask)
    return self.norm(hidden)

In [8]:
class LayerNorm(nn.Module):
  """Normalization over the last dimension with a learnable scale and shift.

  Args:
    features: length of the scale (``a_2``) and shift (``b_2``) vectors.
    eps: constant added to the standard deviation before dividing.
  """

  def __init__(self, features, eps = 1e-6):
    super().__init__()
    self.a_2 = nn.Parameter(torch.ones(features))
    self.b_2 = nn.Parameter(torch.zeros(features))
    self.eps = eps

  def forward(self, x):
    """Return ``a_2 * (x - mean) / (std + eps) + b_2`` along the last axis.

    ``std`` is ``Tensor.std`` with its default settings.
    """
    centered = x - x.mean(-1, keepdim = True)
    spread = x.std(-1, keepdim = True) + self.eps
    return self.a_2 * centered / spread + self.b_2

In [9]:
class SublayerConnection(nn.Module):
  """Residual connection wrapped around a sublayer.

  The input is normalized with ``LayerNorm`` before the sublayer runs;
  dropout is applied to the sublayer output before it is added back to
  the original input.

  Args:
    size: feature size handed to ``LayerNorm``.
    dropout: probability handed to ``nn.Dropout``.
  """

  def __init__(self, size, dropout):
    super().__init__()
    self.norm = LayerNorm(size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, sublayer):
    """Return ``x + dropout(sublayer(norm(x)))``; ``sublayer`` is any callable."""
    normed = self.norm(x)
    update = self.dropout(sublayer(normed))
    return x + update

In [None]:
class EncoderLayer(nn.Module):

  def __init__(self, size, self_attn, feed_forward, dropout):
    super(EncoderLayer, self).__init__()
    self.self_attn = self_attn
    self.feed_forward = feed_forward
    self.sublayer = clones(SublayerConnection(size, dropout), 2)