In [1]:
# coding: utf-8

# In[1]:

# !pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl numpy matplotlib torchtext

# In[2]:

# Standard PyTorch imports
import numpy as np

from tensorflow.contrib.layers import layer_norm
import nn_utils

# For plots
#get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

import tensorflow as tf
real = 1
BATCH_SIZE = 16
#!conda install torchtext spacy
# !python -m spacy download en
# !python -m spacy download de
if real:
  from torchtext import data
  from torchtext import datasets
  import tqdm
  import re
  import spacy

  spacy_de = spacy.load('de')
  spacy_en = spacy.load('en')

  url = re.compile('(<url>.*</url>)')

  def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

  def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

  # Testing IWSLT
  DE = data.Field(
      tokenize=tokenize_de,
      init_token='<bos>',
      eos_token='<eos>',
      include_lengths=True)
  EN = data.Field(
      tokenize=tokenize_en,
      init_token='<bos>',
      eos_token='<eos>',
      include_lengths=True)

  train, val, test = datasets.IWSLT.splits(
      exts=('.de', '.en'), fields=(DE, EN))

  train_it = data.Iterator(
      train,
      batch_size=BATCH_SIZE,
      sort_within_batch=True,
      train=True,
      repeat=False,
      shuffle=True)
  MIN_WORD_FREQ = 10
  MAX_NUM_WORDS = 1000
  DE.build_vocab(train.src, min_freq=MIN_WORD_FREQ, max_size=MAX_NUM_WORDS)
  EN.build_vocab(train.trg, min_freq=MIN_WORD_FREQ, max_size=MAX_NUM_WORDS)

  num_wds_input = len(DE.vocab.itos)
  num_wds_output = len(EN.vocab.itos)
else:
  num_wds_input = 1004


class masked_softmax:
  def __init__(self, v, mask, dim=2):
    #bs, query dimension, key dimension
    v_mask = v * mask
    v_max = tf.reduce_max(v_mask, dim, keep_dims=True)
    v_stable = v_mask - v_max

    v_exp = tf.exp(v_stable) * mask
    v_exp_sum = tf.reduce_sum(v_exp, dim, keep_dims=True)
    self.v_mask, self.v_max, self.v_stable, self.v_exp, self.v_exp_sum = \
        v_mask, v_max, v_stable, v_exp, v_exp_sum
    self.output = v_exp / (v_exp_sum + 1e-20)


class Encoder:
  def __init__(self, num_wds, wd_ind, mask, ndims=64, n_layers=6):
    self.num_wds = num_wds
    self.wd_ind = wd_ind
    self.mask = mask
    self.length = tf.shape(self.wd_ind)[1]
    self.wd_emb = tf.Variable(
        tf.random_uniform([self.num_wds, ndims], minval=-1, maxval=1.))
    self.wd_vec = tf.nn.embedding_lookup(self.wd_emb, wd_ind)
    self.pos = tf.reshape(
        tf.range(tf.cast(self.length, tf.float32), dtype=tf.float32),
        (1, -1, 1))
    self.divider_exponent = tf.reshape(
        tf.range(tf.cast(ndims // 2, tf.float32)),
        (1, 1, -1)) * 2. / tf.cast(ndims, tf.float32)
    self.divider = tf.pow(10000., self.divider_exponent)
    self.input_to_sinusoids = self.pos / self.divider
    self.pos_sin = tf.sin(self.input_to_sinusoids)
    self.pos_cos = tf.cos(self.input_to_sinusoids)
    # self.position = tf.reshape(
    #     tf.range(tf.cast(self.length, tf.float32), dtype=tf.float32) / 10000,
    #     (1, -1, 1))
    self.position = tf.concat((self.pos_sin, self.pos_cos), -1)
    self.w_tilde = embedding = self.wd_vec + self.position
    self.encoding = []
    self.attentionLayers = []
    for _ in range(n_layers):
      attentionLayer = AttentionLayer(embedding, mask)
      embedding = attentionLayer.output
      self.encoding.append(embedding)
      self.attentionLayers.append(attentionLayer)


class AttentionLayer:
  def __init__(self, X, mask, X_decode=None, decode_mask=None, ff_layer=True):
    bs, length, ndim = [v.value for v in X.shape]
    self.X = X
    if X_decode is None:
      self.q, self.k, self.v = [
          tf.tanh(tf.layers.dense(X, ndim)) for _ in range(3)
      ]
      decode_mask = mask
    else:
      self.q = tf.tanh(tf.layers.dense(X_decode, ndim))
      self.k, self.v = [tf.tanh(tf.layers.dense(X, ndim)) for _ in range(2)]
    #batch, attention queries, attention keys, embeddings
    self.q_expanded = tf.expand_dims(self.q, 2)
    self.k_expanded = tf.expand_dims(self.k, 1)
    self.v_expanded = tf.expand_dims(self.v, 1)
    self.s_raw = tf.reduce_sum(self.q_expanded * self.k_expanded, -1)
    self.mask = tf.expand_dims(decode_mask, 2) * tf.expand_dims(mask, 1)
    self.masked_softmax = masked_softmax(self.s_raw, self.mask)
    self.s = self.masked_softmax.output
    self.a = tf.expand_dims(self.s * self.mask, -1) * self.v_expanded
    #A is shape bs, query, key, emb
    self.a_compressed = tf.reduce_sum(self.a, 2)
    if X_decode is None:
      self.e = layer_norm(self.a_compressed + X)
    else:
      self.e = layer_norm(self.a_compressed + X_decode)
    if ff_layer:
      self.output = layer_norm(tf.layers.dense(self.e, ndim) + self.e)
    else:
      self.output = self.e


class Decoder:
  def __init__(self, num_wds, wd_ind, mask, encoder, ndims=20, n_layers=6):
    self.num_wds = num_wds
    self.wd_ind = wd_ind
    self.mask = mask
    self.encoder = encoder
    self.length = tf.shape(self.wd_ind)[1]
    self.wd_emb = tf.Variable(
        tf.random_uniform([self.num_wds, ndims], minval=-1, maxval=1.))
    self.wd_vec = tf.nn.embedding_lookup(self.wd_emb, wd_ind)
    self.pos = tf.reshape(
        tf.range(tf.cast(self.length, tf.float32), dtype=tf.float32),
        (1, -1, 1))
    self.divider_exponent = tf.reshape(
        tf.range(tf.cast(ndims // 2, tf.float32)),
        (1, 1, -1)) * 2. / tf.cast(ndims, tf.float32)
    self.divider = tf.pow(10000., self.divider_exponent)
    self.input_to_sinusoids = self.pos / self.divider
    self.pos_sin = tf.sin(self.input_to_sinusoids)
    self.pos_cos = tf.cos(self.input_to_sinusoids)
    # self.position = tf.reshape(
    #     tf.range(tf.cast(self.length, tf.float32), dtype=tf.float32) / 10000,
    #     (1, -1, 1))
    self.position = tf.concat((self.pos_sin, self.pos_cos), -1)
    self.w_tilde = embedding = self.wd_vec + self.position
    self.decoding = []
    self.self_attentions = []
    self.encoder_attentions = []
    for l_idx in range(n_layers):
      attn = AttentionLayer(embedding, mask, ff_layer=False)
      self.self_attentions.append(attn)
      encode_attn = AttentionLayer(encoder.encoding[l_idx], encoder.mask,
                                   attn.output, mask)
      self.encoder_attentions.append(encode_attn)
      embedding = encode_attn.output

    self.output_raw = tf.layers.dense(embedding, num_wds)
    #bs, word in sentence of target, embedding

    self.outsoftmax = masked_softmax(self.output_raw, tf.expand_dims(mask, -1), dim=2)
    self.output = self.outsoftmax.output

class Transformer:
  def __init__(self, num_wds):
    self.num_wds = num_wds
    n_layers = 6
    ndims = 256
    self.learning_rate = tf.placeholder(tf.float32, None)
    self.wd_ind_src = wd_ind_src = tf.placeholder(tf.int32, (None, None))
    self.wd_ind_trg = wd_ind_trg = tf.placeholder(tf.int32, (None, None))
    self.input_lengths = tf.placeholder(tf.int32, [None])
    self.output_lengths = tf.placeholder(tf.int32, [None])
    self.input_mask = tf.sequence_mask(
        self.input_lengths,
        maxlen=tf.shape(self.wd_ind_src)[-1],
        dtype=tf.float32)
    self.output_mask = tf.sequence_mask(
        self.output_lengths,
        maxlen=tf.shape(self.wd_ind_trg)[-1],
        dtype=tf.float32)
    self.encoder = Encoder(
        num_wds, wd_ind_src, self.input_mask, n_layers=n_layers, ndims=ndims)
    self.decoder = Decoder(
        num_wds,
        wd_ind_trg,
        self.output_mask,
        self.encoder,
        n_layers=n_layers,
        ndims=ndims)
    opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
    self.prediction_mask = tf.concat((tf.zeros(
        (BATCH_SIZE, 1)), self.output_mask[:, :-1] - self.output_mask[:, 1:]),
                                     1)
    self.loss = tf.reduce_mean(
        tf.square(
            tf.reduce_max(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.wd_ind_trg, logits=self.decoder.output_raw) *
                self.prediction_mask, 1)))
    self.optimizer, self.grad_norm_total = nn_utils.apply_clipped_optimizer(
        opt, self.loss)


# In[71]:

transformer = Transformer(num_wds_input)
MAX_LEN = 20

sess = tf.Session()
sess.run(tf.global_variables_initializer())
if real:
  for itr, train_batch in enumerate(tqdm.tqdm_notebook(train_it)):
    src_tensor = train_batch.src[0].data.cpu().numpy().transpose()
    src_len = train_batch.src[1].cpu().numpy()
    trg_tensor = train_batch.trg[0].data.cpu().numpy().transpose()
    trg_len = train_batch.trg[1].cpu().numpy()
    src_tensor, trg_tensor = [t[:, :MAX_LEN] for t in [src_tensor, trg_tensor]]
    src_len, trg_len = [np.clip(t, 0, MAX_LEN) for t in [src_len, trg_len]]
    trg_len = np.ceil(
        np.random.uniform(size=BATCH_SIZE) * (trg_len - 1)).astype(int)
    trn_feed_dict = {
        transformer.wd_ind_src: src_tensor,
        transformer.input_lengths: src_len,
        transformer.wd_ind_trg: trg_tensor,
        transformer.output_lengths: trg_len,
        transformer.learning_rate: 1e-2 / (np.sqrt(itr + 3))
    }
    _, loss = sess.run([transformer.optimizer, transformer.loss],
                       trn_feed_dict)
    if itr % 1000 == 0:
      print(itr, loss)
    if itr > 20000:
      break
else:

  src_tensor = np.random.randint(low=0, high=num_wds_input, size=(BATCH_SIZE, 81))
  src_len = np.random.randint(2, 81, BATCH_SIZE)
  trg_tensor = np.random.randint(low=0, high=num_wds_input, size=(BATCH_SIZE, 84))
  trg_len = np.random.randint(2, 84, BATCH_SIZE)

  fd = {
      transformer.wd_ind_src: src_tensor,
      transformer.wd_ind_trg: trg_tensor,
      transformer.input_lengths: src_len,
      transformer.output_lengths: trg_len,
      transformer.learning_rate: 1e-2}
  sess.run([transformer.optimizer, transformer.loss], fd)
# In[75]:


Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


A Jupyter Widget

47.930622
33.614502
14.216359
18.626587
17.525478
24.581465
25.968618
14.263522
8.074388
11.152814
18.03378
15.32723
5.608718
11.72887
26.028437
14.476787
14.712259
10.991021
8.647961
10.481396
8.405534
12.700581
14.166832
9.821992
7.910573
8.485151
11.860352
4.2346144
8.631447
2.6892684
9.653509
18.76727
9.368668
11.056267
13.100941
8.037134
15.125847
10.582115
16.906372
2.3313324
5.1178827
7.538783
8.966838
12.657561
7.702527
6.9716105
4.049512
2.5055974
6.291495
10.37319
10.581086
0.46457887
6.902739
5.689904
1.2526518
10.877995
3.411272
9.721558
8.425287
5.111947
8.381946
6.4392414
0.18951607
4.838208
3.969192
14.225658
7.702443
11.590886
11.159121
6.676215
3.6075
7.276211
5.249755
7.1061506
1.9857178
10.056345
2.1317787
3.2015173
0.12055974
1.039646
7.937421
6.1146593
2.1444962
6.244997
0.515703
6.101731
0.32915562
10.993107
4.8111
4.4215097
11.3398075
4.270981
6.364236
0.06271805
6.178832
9.667063
2.6471107
4.4717617
5.0477414
4.650506
2.4390576
8.842626
5.2960935
3.2419555
2.460

In [2]:
NUM_MAX_DECODE = MAX_LEN

In [14]:

src_len_decode = src_len[0:1]
src_decode = src_tensor[0:1, :src_len_decode[0]]
autoregressive = trg_tensor[0:1, 0:1]


In [15]:
def detect_end(choice, eos_token=None):
    return choice.flatten()[0] == eos_token

In [16]:
np.ones(autoregressive.shape[1])

array([1.])

In [17]:
for dec_idx in range(1):
    pred = sess.run(
        transformer.decoder.outsoftmax.output[:,-1,:], {
            transformer.wd_ind_src: src_decode,
            transformer.input_lengths: src_len_decode,
            transformer.wd_ind_trg: autoregressive,
            transformer.output_lengths: np.ones(1)*autoregressive.shape[1],
        })
    choice = pred.argmax(1)
    autoregressive = np.concatenate((autoregressive, np.expand_dims(choice, 0)), -1)
    if detect_end(choice, None):
        break

In [18]:
translation = [EN.vocab.itos[a] for a in autoregressive.flatten()]

In [19]:
pred[:,4:9]

array([[0.00026378, 0.00058247, 0.00080827, 0.00556759, 0.00058463]],
      dtype=float32)

In [9]:
np.ones(autoregressive.shape[1]).shape

(6,)