In [12]:
# coding: utf-8

# In[1]:

# !pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl numpy matplotlib torchtext

# In[2]:

# Standard PyTorch imports
import numpy as np

from tensorflow.contrib.layers import layer_norm
import nn_utils

# For plots
#get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '3'
import tensorflow as tf
real = 1
BATCH_SIZE = 64
#!conda install torchtext spacy
# !python -m spacy download en
# !python -m spacy download de
def detect_end(choice, eos_token=None):
    return choice.flatten()[0] == eos_token

if real:
  from torchtext import data
  from torchtext import datasets
  import tqdm
  import re
  import spacy

  spacy_de = spacy.load('de')
  spacy_en = spacy.load('en')

  url = re.compile('(<url>.*</url>)')

  def tokenize_de(text):
    return [tok.text.lower() for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

  def tokenize_en(text):
    return [tok.text.lower() for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

  # Testing IWSLT
  DE = data.Field(
      tokenize=tokenize_de,
      init_token='<bos>',
      eos_token='<eos>',
      include_lengths=True)
  EN = data.Field(
      tokenize=tokenize_en,
      init_token='<bos>',
      eos_token='<eos>',
      include_lengths=True)

  train, val, test = datasets.IWSLT.splits(
      exts=('.de', '.en'), fields=(DE, EN))

  train_it = data.Iterator(
      train,
      batch_size=BATCH_SIZE,
      sort_within_batch=True,
      train=True,
      repeat=False,
      shuffle=True)
  MIN_WORD_FREQ = 10
  MAX_NUM_WORDS = 10000
  DE.build_vocab(train.src, min_freq=MIN_WORD_FREQ, max_size=MAX_NUM_WORDS)
  EN.build_vocab(train.trg, min_freq=MIN_WORD_FREQ, max_size=MAX_NUM_WORDS)

  num_wds_input = len(DE.vocab.itos)
  num_wds_output = len(EN.vocab.itos)
else:
  num_wds_input = 1004


class masked_softmax:
  def __init__(self, v, mask, dim=2):
    #bs, query dimension, key dimension
    v_mask = v * mask
    v_max = tf.reduce_max(v_mask, dim, keep_dims=True)
    v_stable = v_mask - v_max

    v_exp = tf.exp(v_stable) * mask
    v_exp_sum = tf.reduce_sum(v_exp, dim, keep_dims=True)
    self.v_mask, self.v_max, self.v_stable, self.v_exp, self.v_exp_sum = \
        v_mask, v_max, v_stable, v_exp, v_exp_sum
    self.output = v_exp / (v_exp_sum + 1e-20)


class Encoder:
  def __init__(self, num_wds, wd_ind, mask, ndims=64, n_layers=6):
    self.num_wds = num_wds
    self.wd_ind = wd_ind
    self.mask = mask
    self.length = tf.shape(self.wd_ind)[1]
    self.wd_emb = tf.Variable(
        tf.random_uniform([self.num_wds, ndims], minval=-1, maxval=1.))
    self.wd_vec = tf.nn.embedding_lookup(self.wd_emb, wd_ind)
    self.pos = tf.reshape(
        tf.range(tf.cast(self.length, tf.float32), dtype=tf.float32),
        (1, -1, 1))
    self.divider_exponent = tf.reshape(
        tf.range(tf.cast(ndims // 2, tf.float32)),
        (1, 1, -1)) * 2. / tf.cast(ndims, tf.float32)
    self.divider = tf.pow(10000., self.divider_exponent)
    self.input_to_sinusoids = self.pos / self.divider
    self.pos_sin = tf.sin(self.input_to_sinusoids)
    self.pos_cos = tf.cos(self.input_to_sinusoids)
    # self.position = tf.reshape(
    #     tf.range(tf.cast(self.length, tf.float32), dtype=tf.float32) / 10000,
    #     (1, -1, 1))
    self.position = tf.concat((self.pos_sin, self.pos_cos), -1)
    self.w_tilde = embedding = self.wd_vec + self.position
    self.encoding = []
    self.attentionLayers = []
    for _ in range(n_layers):
      attentionLayer = AttentionLayer(embedding, mask)
      embedding = attentionLayer.output
      self.encoding.append(embedding)
      self.attentionLayers.append(attentionLayer)

class AttentionLayer:
    def __init__(self, X, mask, X_decode=None, decode_mask=None, ff_layer=True):
        bs, length, ndim = [v.value for v in X.shape]
        self.layers = layers = [
            AttentionSubLayer(
                X, mask, X_decode=X_decode, decode_mask=decode_mask, ff_layer=ff_layer)
            for _ in range(4)]
        self.sub_outputs = [l.a_compressed for l in layers]
        self.a_compressed = tf.concat((self.sub_outputs), -1)
        self.linearafterattn = tf.layers.dense(self.a_compressed, ndim)
        if X_decode is None:
          self.e = layer_norm(self.linearafterattn + X)
        else:
          self.e = layer_norm(self.linearafterattn + X_decode)
        if ff_layer:
          self.output = layer_norm(tf.layers.dense(tf.nn.leaky_relu(
              tf.layers.dense(self.e, ndim)),ndim) + self.e)
        else:
          self.output = self.e
    
class AttentionSubLayer:
  def __init__(self, X, mask, X_decode=None, decode_mask=None, ff_layer=True):
    bs, length, ndim = [v.value for v in X.shape]
    ndim = ndim // 4
    self.X = X
    if X_decode is None:
      self.q, self.k, self.v = [
          tf.tanh(tf.layers.dense(X, ndim)) for _ in range(3)
      ]
      decode_mask = mask
    else:
      self.q = tf.tanh(tf.layers.dense(X_decode, ndim))
      self.k, self.v = [tf.tanh(tf.layers.dense(X, ndim)) for _ in range(2)]
    #batch, attention queries, attention keys, embeddings
    self.q_expanded = tf.expand_dims(self.q, 2)
    self.k_expanded = tf.expand_dims(self.k, 1)
    self.v_expanded = tf.expand_dims(self.v, 1)
    self.s_raw = tf.reduce_sum(self.q_expanded * self.k_expanded, -1)/np.sqrt(ndim)
    self.mask = tf.expand_dims(decode_mask, 2) * tf.expand_dims(mask, 1)
    self.masked_softmax = masked_softmax(self.s_raw, self.mask)
    self.s = self.masked_softmax.output
    self.a = tf.expand_dims(self.s * self.mask, -1) * self.v_expanded
    #A is shape bs, query, key, emb
    self.a_compressed = tf.reduce_sum(self.a, 2)


class Decoder:
  def __init__(self, num_wds, wd_ind, mask, encoder, ndims=20, n_layers=6):
    self.num_wds = num_wds
    self.wd_ind = wd_ind
    self.mask = mask
    self.encoder = encoder
    self.length = tf.shape(self.wd_ind)[1]
    self.wd_emb = tf.Variable(
        tf.random_uniform([self.num_wds, ndims], minval=-1, maxval=1.))
    self.wd_vec = tf.nn.embedding_lookup(self.wd_emb, wd_ind)
    self.pos = tf.reshape(
        tf.range(tf.cast(self.length, tf.float32), dtype=tf.float32),
        (1, -1, 1))
    self.divider_exponent = tf.reshape(
        tf.range(tf.cast(ndims // 2, tf.float32)),
        (1, 1, -1)) * 2. / tf.cast(ndims, tf.float32)
    self.divider = tf.pow(10000., self.divider_exponent)
    self.input_to_sinusoids = self.pos / self.divider
    self.pos_sin = tf.sin(self.input_to_sinusoids)
    self.pos_cos = tf.cos(self.input_to_sinusoids)
    self.position = tf.concat((self.pos_sin, self.pos_cos), -1)
    self.w_tilde = embedding = self.wd_vec + self.position
    self.decoding = []
    self.self_attentions = []
    self.encoder_attentions = []
    self.early_outputs = []
    for l_idx in range(n_layers):
      attn = AttentionLayer(embedding, mask, ff_layer=False)
      self.self_attentions.append(attn)
      encode_attn = AttentionLayer(encoder.encoding[l_idx], encoder.mask,
                                   attn.output, mask)
      self.encoder_attentions.append(encode_attn)
      embedding = encode_attn.output
      if l_idx < n_layers - 1:
        early_output = tf.layers.dense(embedding, num_wds)
        self.early_outputs.append(early_output)

    self.output_raw = tf.layers.dense(embedding, num_wds)

    self.outsoftmax = masked_softmax(self.output_raw, tf.expand_dims(mask, -1), dim=2)
    self.output = self.outsoftmax.output

class Transformer:
  def __init__(self, num_wds):
    self.num_wds = num_wds
    n_layers = 6
    ndims = 256
    self.learning_rate = tf.placeholder(tf.float32, None)
    self.wd_ind_src = wd_ind_src = tf.placeholder(tf.int32, (None, None))
    self.wd_ind_trg = wd_ind_trg = tf.placeholder(tf.int32, (None, None))
    self.input_lengths = tf.placeholder(tf.int32, [None])
    self.output_lengths = tf.placeholder(tf.int32, [None])
    self.input_mask = tf.sequence_mask(
        self.input_lengths,
        maxlen=tf.shape(self.wd_ind_src)[-1],
        dtype=tf.float32)
    self.output_mask = tf.sequence_mask(
        self.output_lengths,
        maxlen=tf.shape(self.wd_ind_trg)[-1],
        dtype=tf.float32)
    self.encoder = Encoder(
        num_wds, wd_ind_src, self.input_mask, n_layers=n_layers, ndims=ndims)
    self.decoder = Decoder(
        num_wds,
        wd_ind_trg,
        self.output_mask,
        self.encoder,
        n_layers=n_layers,
        ndims=ndims)
    opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
    self.prediction_mask = tf.concat((tf.zeros(
        (tf.shape(self.output_mask)[0], 1)), self.output_mask[:, :-1] - self.output_mask[:, 1:]),
                                     1)
    self.losses = tf.reduce_mean([tf.reduce_mean(
        tf.square(
            tf.reduce_max(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.wd_ind_trg, logits=logits) *
                self.prediction_mask, 1))) for logits in self.decoder.early_outputs + [self.decoder.output_raw]])
    self.loss = tf.reduce_mean(
        tf.square(
            tf.reduce_max(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.wd_ind_trg, logits=self.decoder.output_raw) *
                self.prediction_mask, 1)))
    self.optimizer, self.grad_norm_total = nn_utils.apply_clipped_optimizer(
        opt, self.losses)


# In[71]:

transformer = Transformer(num_wds_input)
MAX_LEN = 20


def predict_one(src_tensor, src_len, trg_tensor):
    NUM_MAX_DECODE = 8 #MAX_LEN
    src_len_decode = src_len[0:1]
    src_decode = src_tensor[0:1, :src_len_decode[0]]
    autoregressive = trg_tensor[0:1, 0:1]
    for dec_idx in range(NUM_MAX_DECODE):
        pred = sess.run(
            transformer.decoder.outsoftmax.output[:,-1,:], {
                transformer.wd_ind_src: src_decode,
                transformer.input_lengths: src_len_decode,
                transformer.wd_ind_trg: autoregressive,
                transformer.output_lengths: np.ones(1)*autoregressive.shape[1],
            })
        choice = pred.argmax(1)
        autoregressive = np.concatenate((autoregressive, np.expand_dims(choice, 0)), -1)
        if detect_end(choice, None):
            break
    translation = [EN.vocab.itos[a] for a in autoregressive.flatten()]
    print(translation)

def beamsearch(src_tensor, src_len, trg_tensor):
    NUM_MAX_DECODE = 8 #MAX_LEN
    src_len_decode = src_len[0:1]
    src_decode = src_tensor[0:1, :src_len_decode[0]]
    init_token = trg_tensor[0:1, 0:1]
    for dec_idx in range(NUM_MAX_DECODE):
        pred = sess.run(
            transformer.decoder.outsoftmax.output[:,-1,:], {
                transformer.wd_ind_src: src_decode,
                transformer.input_lengths: src_len_decode,
                transformer.wd_ind_trg: autoregressive,
                transformer.output_lengths: np.ones(1)*autoregressive.shape[1],
            })
        choice = pred.argmax(1)
        autoregressive = np.concatenate((autoregressive, np.expand_dims(choice, 0)), -1)
        if detect_end(choice, None):
            break
    translation = [EN.vocab.itos[a] for a in autoregressive.flatten()]
    print(translation)
    
def print_src(src_tensor, src_len, vocab):
    src_len_decode = src_len[0:1]
    src_decode = src_tensor[0:1, :src_len_decode[0]]
    translation = [vocab.vocab.itos[a] for a in src_decode.flatten()]
    print(translation)
    
    
sess = tf.Session()
sess.run(tf.global_variables_initializer())
itr = 0
print_freq = 100
running_losses = []
if real:
  #for itr, train_batch in enumerate(tqdm.tqdm_notebook(train_it)):
  for ep in range(100):
      for train_batch in tqdm.tqdm_notebook(train_it):
        itr += 1
        src_tensor = train_batch.src[0].data.cpu().numpy().transpose()
        src_len = train_batch.src[1].cpu().numpy()
        trg_tensor = train_batch.trg[0].data.cpu().numpy().transpose()
        trg_len = train_batch.trg[1].cpu().numpy()
        src_tensor, trg_tensor = [t[:, :MAX_LEN] for t in [src_tensor, trg_tensor]]
        src_len, trg_len = [np.clip(t, 0, MAX_LEN) for t in [src_len, trg_len]]
        trg_len = np.ceil(
            np.random.uniform(size=trg_len.shape[0]) * (trg_len - 1)).astype(int)
        trn_feed_dict = {
            transformer.wd_ind_src: src_tensor,
            transformer.input_lengths: src_len,
            transformer.wd_ind_trg: trg_tensor,
            transformer.output_lengths: trg_len,
            transformer.learning_rate: 1e-1 / (np.sqrt(itr + 3))
        }
        _, loss = sess.run([transformer.optimizer, transformer.loss],
                           trn_feed_dict)
        running_losses.append(loss)
        if itr % print_freq == 0 or (itr < 5) or (itr < 100 and itr % 10 == 0):
          running_losses = np.array(running_losses)
          print(itr, 'loss_mean', running_losses.mean(), 'loss_std', running_losses.std(), 'loss_min', running_losses.min(),
              'loss_max', running_losses.max())
          running_losses = []
          if itr % 100 == 0:
            print_src(src_tensor, src_len, DE)
            print_src(trg_tensor, trg_len, EN)
            predict_one(src_tensor, src_len, trg_tensor)
        if itr > 2000000:
          break
else:

  src_tensor = np.random.randint(low=0, high=num_wds_input, size=(BATCH_SIZE, 81))
  src_len = np.random.randint(2, 81, BATCH_SIZE)
  trg_tensor = np.random.randint(low=0, high=num_wds_input, size=(BATCH_SIZE, 84))
  trg_len = np.random.randint(2, 84, BATCH_SIZE)

  fd = {
      transformer.wd_ind_src: src_tensor,
      transformer.wd_ind_trg: trg_tensor,
      transformer.input_lengths: src_len,
      transformer.output_lengths: trg_len,
      transformer.learning_rate: 1e-2}
  sess.run([transformer.optimizer, transformer.loss], fd)
# In[75]:


A Jupyter Widget

1 loss_mean 84.57846 loss_std 0.0 loss_min 84.57846 loss_max 84.57846
2 loss_mean 82.0095 loss_std 0.0 loss_min 82.0095 loss_max 82.0095
3 loss_mean 103.72043 loss_std 0.0 loss_min 103.72043 loss_max 103.72043
4 loss_mean 52.54789 loss_std 0.0 loss_min 52.54789 loss_max 52.54789
10 loss_mean 60.01445 loss_std 6.029169 loss_min 52.2023 loss_max 70.08912
20 loss_mean 51.405678 loss_std 4.52098 loss_min 41.711365 loss_max 59.912785
30 loss_mean 46.558437 loss_std 2.2536125 loss_min 43.153755 loss_max 51.485588
40 loss_mean 47.571346 loss_std 4.1877704 loss_min 41.815277 loss_max 55.278114
50 loss_mean 45.561466 loss_std 4.256003 loss_min 40.166565 loss_max 53.251667
60 loss_mean 46.226063 loss_std 5.6653423 loss_min 37.126755 loss_max 54.988613
70 loss_mean 44.585926 loss_std 6.752393 loss_min 31.409367 loss_max 55.0481
80 loss_mean 44.872345 loss_std 2.6572459 loss_min 40.076492 loss_max 50.08348
90 loss_mean 44.51847 loss_std 5.3046503 loss_min 34.225582 loss_max 50.843494
100 loss_mean

KeyboardInterrupt: 

In [112]:
def beam_search():
    NUM_MAX_DECODE = 8 #MAX_LEN
    src_len_decode = src_len[0:1]
    src_decode = src_tensor[0:1, :src_len_decode[0]]
    init_token = trg_tensor[0:1, 0:1]
    pred = sess.run(
            transformer.decoder.outsoftmax.output[:,-1,:], {
                transformer.wd_ind_src: src_decode,
                transformer.input_lengths: src_len_decode,
                transformer.wd_ind_trg: init_token,
                transformer.output_lengths: np.ones(1)*init_token.shape[1],
            })
    n = 2
    index = pred.argsort()[:,-n:]
    probabilities = pred[:,index]
    active_sentences = [np.concatenate((init_token, np.expand_dims(i, 0)), -1) for i in index.T]
    active_probabilities = probabilities
    for i in range(2):
        active_sentences, active_probabilities = step_beam(active_sentences, active_probabilities)
        print(active_probabilities, active_sentences)
    translation = [EN.vocab.itos[a] for a in active_sentences[-1].flatten()]
    print(translation)

beam_search()

[0.9976023 1.9868424] [array([[ 2, 16, 15]]), array([[ 2, 15, 15]])]
[1.9910235 2.9802637] [array([[ 2, 16, 15, 15]]), array([[ 2, 15, 15, 15]])]
['<bos>', 'you', 'you', 'you']


In [93]:

sum_prob = []
sentences = []
for choice, prob in zip(active_sentences, active_probabilities.ravel()):
    print(choice, prob)
    pred = sess.run(
        transformer.decoder.outsoftmax.output[:,-1,:], {
            transformer.wd_ind_src: src_decode,
            transformer.input_lengths: src_len_decode,
            transformer.wd_ind_trg: choice,
            transformer.output_lengths: np.ones(1)*choice.shape[1],
        })
    index = pred.argsort()[:,-n:]
    probabilities = pred[:,index]
    sum_prob.append(probabilities + prob)
    sub_sentences = [np.concatenate((choice, np.expand_dims(i, 0)), -1) for i in index.T]
    sentences.append(sub_sentences)

[[ 2 16]] 0.004180922
[[ 2 15]] 0.99342126


In [87]:
sum_prob = np.concatenate([v.ravel() for v in sum_prob], 0)

In [101]:
import itertools
sentences = list(itertools.chain.from_iterable(sentences))

In [111]:
def step_beam(active_sentences, active_probabilities):
    sum_prob = []
    sentences = []
    for choice, prob in zip(active_sentences, active_probabilities.ravel()):
#     print(choice, prob)
        pred = sess.run(
            transformer.decoder.outsoftmax.output[:,-1,:], {
                transformer.wd_ind_src: src_decode,
                transformer.input_lengths: src_len_decode,
                transformer.wd_ind_trg: choice,
                transformer.output_lengths: np.ones(1)*choice.shape[1],
            })
        index = pred.argsort()[:,-n:]
        probabilities = pred[:,index]
        sum_prob.append(probabilities + prob)
        sub_sentences = [np.concatenate((choice, np.expand_dims(i, 0)), -1) for i in index.T]
        sentences.append(sub_sentences)
    sum_prob = np.concatenate([v.ravel() for v in sum_prob], 0)
    sentences = list(itertools.chain.from_iterable(sentences))
    index = sum_prob.argsort()[-n:]
    return [sentences[i] for i in index], np.array([sum_prob[i] for i in index])

In [102]:
sentences

[array([[ 2, 16, 16]]),
 array([[ 2, 16, 15]]),
 array([[ 2, 15, 16]]),
 array([[ 2, 15, 15]])]

In [96]:
sentences
np.concatenate([np.array(v).ravel() for v in senteces], 0)

array([ 2, 16, 16,  2, 16, 15,  2, 15, 16,  2, 15, 15])

In [79]:
sum_prob

[array([[[0.00277754, 0.00556968, 0.99481004]]], dtype=float32),
 array([[[0.00556969, 0.00836184, 0.99760216]]], dtype=float32),
 array([[[0.99481004, 0.99760216, 1.9868425 ]]], dtype=float32)]

In [68]:
active_sentences

[array([[ 2, 12]]), array([[ 2, 16]]), array([[ 2, 15]])]

In [63]:
active_probabilities, active_sentences

(array([[[0.00138879, 0.004181  , 0.99342114]]], dtype=float32),
 [array([[ 2, 12]]), array([[ 2, 16]]), array([[ 2, 15]])])

In [48]:
for i in index:
    print(i)

[12 16 15]


In [59]:
index.shape

(1, 3)

In [58]:
np.expand_dims(index[0,0], 0).shape

(1,)

In [53]:
np.concatenate((init_token, np.expand_dims(index[0,0], 0)), -1)

ValueError: all the input arrays must have same number of dimensions

In [None]:

for dec_idx in range(NUM_MAX_DECODE):
    pred = sess.run(
        transformer.decoder.outsoftmax.output[:,-1,:], {
            transformer.wd_ind_src: src_decode,
            transformer.input_lengths: src_len_decode,
            transformer.wd_ind_trg: autoregressive,
            transformer.output_lengths: np.ones(1)*autoregressive.shape[1],
        })
    choice = pred.argmax(1)
    autoregressive = np.concatenate((autoregressive, np.expand_dims(choice, 0)), -1)
    if detect_end(choice, None):
        break
translation = [EN.vocab.itos[a] for a in autoregressive.flatten()]
print(translation)

In [6]:

print_src(src_tensor, src_len, DE)
print_src(trg_tensor, trg_len, EN)
predict_one(src_tensor, src_len, trg_tensor)

['<bos>', 'sie', 'werden', 'auch', 'unter', 'hohen', 'temperaturen', 'betrieben', 'und', 'somit', 'kann', 'der', '<unk>', '<unk>', 'keine', '<unk>', 'erzeugen', ',', 'aber', 'falls']
['<bos>', 'also', ',', 'they', 'operate']
['<bos>', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',']


In [None]:
How how

In [None]:
NUM_MAX_DECODE = MAX_LEN

In [None]:

src_len_decode = src_len[0:1]
src_decode = src_tensor[0:1, :src_len_decode[0]]
autoregressive = trg_tensor[0:1, 0:1]


In [None]:
for dec_idx in range(10):
    pred = sess.run(
        transformer.decoder.outsoftmax.output[:,-1,:], {
            transformer.wd_ind_src: src_decode,
            transformer.input_lengths: src_len_decode,
            transformer.wd_ind_trg: autoregressive,
            transformer.output_lengths: np.ones(1)*autoregressive.shape[1],
        })
    choice = pred.argmax(1)
    autoregressive = np.concatenate((autoregressive, np.expand_dims(choice, 0)), -1)
    if detect_end(choice, None):
        break

In [None]:
pred.argmax(1)

In [None]:
pred

In [None]:
translation = [EN.vocab.itos[a] for a in autoregressive.flatten()]

In [None]:
translation

In [None]:
pred[:,12:20]