In [1]:
# !pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl numpy matplotlib torchtext 

In [3]:
# Standard PyTorch imports
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy
from torch.autograd import Variable

# For plots
%matplotlib inline
import matplotlib.pyplot as plt


import tensorflow as tf

#!conda install torchtext spacy
# !python -m spacy download en
# !python -m spacy download de

from torchtext import data
from torchtext import datasets

import re
import spacy

spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

url = re.compile('(<url>.*</url>)')


def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]


def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]


# Testing IWSLT
DE = data.Field(tokenize=tokenize_de, init_token='<bos>', eos_token='<eos>', include_lengths=True)
EN = data.Field(tokenize=tokenize_en, init_token='<bos>', eos_token='<eos>', include_lengths=True)

train, val, test = datasets.IWSLT.splits(exts=('.de', '.en'), fields=(DE, EN))


train_it = data.Iterator(train, batch_size=4, sort_within_batch=True, train=True, repeat=False, shuffle=True)
MIN_WORD_FREQ = 10
MAX_NUM_WORDS = 1000
DE.build_vocab(train.src, min_freq=MIN_WORD_FREQ, max_size=MAX_NUM_WORDS)
EN.build_vocab(train.trg, min_freq=MIN_WORD_FREQ, max_size=MAX_NUM_WORDS)

num_wds_input = len(DE.vocab.itos)
num_wds_output = len(EN.vocab.itos)

num_wds_input, num_wds_output



(1004, 1004)

In [7]:

from tensorflow.contrib.layers import layer_norm
import nn_utils

In [185]:

class masked_softmax:
    def __init__(self, v, mask, dim=1):
        #bs, query dimension, key dimension
        v_mask = v * mask
        v_max = tf.reduce_max(v_mask, dim, keep_dims=True)
        v_stable = v_mask - v_max

        v_exp = tf.exp(v_stable) * mask
        v_exp_sum = tf.reduce_sum(v_exp, dim, keep_dims=True)
        self.v_mask, self.v_max, self.v_stable, self.v_exp, self.v_exp_sum = \
            v_mask, v_max, v_stable, v_exp, v_exp_sum
        self.output =  v_exp / (v_exp_sum + 1e-20)


class Encoder:
  def __init__(self, num_wds, wd_ind, mask, ndims=20, n_layers=1):
    self.num_wds = num_wds
    self.wd_ind = wd_ind
    self.mask = mask
    self.length = tf.shape(self.wd_ind)[1]
    self.wd_emb = tf.Variable(
        tf.random_uniform([self.num_wds, ndims], minval=-1, maxval=1.))
    self.wd_vec = tf.nn.embedding_lookup(self.wd_emb, wd_ind)
    self.position = tf.reshape(
        tf.range(tf.cast(self.length, tf.float32), dtype=tf.float32) / 10000,
        (1, -1, 1))
    self.w_tilde = embedding = self.wd_vec + self.position
    self.encoding = []
    self.attentionLayers = []
    for _ in range(n_layers):
      attentionLayer = AttentionLayer(embedding, mask)
      embedding = attentionLayer.output
      self.encoding.append(embedding)
      self.attentionLayers.append(attentionLayer)


class AttentionLayer:
  def __init__(self, X, mask, X_decode=None, decode_mask=None, ff_layer=True):
    bs, length, ndim = [v.value for v in X.shape]
    self.X = X
    if X_decode is None:
      self.q, self.k, self.v = [
          tf.tanh(tf.layers.dense(X, ndim)) for _ in range(3)
      ]
      decode_mask = mask
    else:
      self.q = tf.tanh(tf.layers.dense(X_decode, ndim))
      self.k, self.v = [tf.tanh(tf.layers.dense(X, ndim)) for _ in range(2)]
    #batch, attention queries, attention keys, embeddings
    self.q_expanded = tf.expand_dims(self.q, 2)
    self.k_expanded = tf.expand_dims(self.k, 1)
    self.v_expanded = tf.expand_dims(self.v, 1)
    self.s_raw = tf.reduce_sum(self.q_expanded * self.k_expanded, -1)
    self.mask = tf.expand_dims(decode_mask, 2) * tf.expand_dims(mask, 1)
    self.masked_softmax = masked_softmax(self.s_raw, self.mask)
    self.s = self.masked_softmax.output
    self.a = tf.expand_dims(self.s * self.mask, -1) * self.v_expanded
    #A is shape bs, query, key, emb
    self.a_compressed = tf.reduce_sum(self.a, 2)
    if X_decode is None:
        self.e = layer_norm(self.a_compressed + X)
    else:
        self.e = layer_norm(self.a_compressed + X_decode)
    if ff_layer:
      self.output = layer_norm(tf.layers.dense(self.e, ndim) + self.e)
    else:
      self.output = self.e


class Decoder:
  def __init__(self, num_wds, wd_ind, mask, encoder, ndims=20, n_layers=1):
    self.num_wds = num_wds
    self.wd_ind = wd_ind
    self.mask = mask
    self.encoder = encoder
    self.length = tf.shape(self.wd_ind)[1]
    self.wd_emb = tf.Variable(
        tf.random_uniform([self.num_wds, ndims], minval=-1, maxval=1.))
    self.wd_vec = tf.nn.embedding_lookup(self.wd_emb, wd_ind)
    self.position = tf.reshape(
        tf.range(tf.cast(self.length, tf.float32), dtype=tf.float32) / 10000,
        (1, -1, 1))
    self.w_tilde = embedding = self.wd_vec + self.position
    self.decoding = []
    self.self_attentions = []
    self.encoder_attentions = []
    for l_idx in range(n_layers):
      attn = AttentionLayer(embedding, mask, ff_layer=False)
      self.self_attentions.append(attn)
      encode_attn = AttentionLayer(encoder.encoding[l_idx], encoder.mask,
                                   attn.output, mask)
      self.encoder_attentions.append(encode_attn)
      embedding = encode_attn.output

    self.output_raw = tf.layers.dense(embedding, num_wds)
    #bs, word in sentence of target, embedding
    
    self.masked_softmax = masked_softmax(self.output_raw, mask)
    self.output = self.masked_softmax.output


class Transformer:
  def __init__(self, num_wds):
    self.num_wds = num_wds
    self.learning_rate = tf.placeholder(tf.float32, None)
    self.wd_ind_src = wd_ind_src = tf.placeholder(tf.int32, (None, None))
    self.wd_ind_trg = wd_ind_trg = tf.placeholder(tf.int32, (None, None))
    self.input_lengths = tf.placeholder(tf.int32, [None])
    self.output_lengths = tf.placeholder(tf.int32, [None])
    self.input_mask = tf.sequence_mask(
        self.input_lengths,
        maxlen=tf.shape(self.wd_ind_src)[-1],
        dtype=tf.float32)
    self.output_mask = tf.sequence_mask(
        self.output_lengths,
        maxlen=tf.shape(self.wd_ind_trg)[-1],
        dtype=tf.float32)
    self.encoder = Encoder(num_wds, wd_ind_src, self.input_mask)
    self.decoder = Decoder(num_wds, wd_ind_trg, self.output_mask, self.encoder)
    opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
    self.prediction_mask = tf.concat((tf.zeros((4, 1)), self.output_mask[:,:-1] - self.output_mask[:,1:]), 1)
    self.loss = tf.reduce_mean(tf.reduce_max(tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=self.wd_ind_trg, logits=self.decoder.output_raw) * self.prediction_mask
      , 1))
    self.optimizer, self.grad_norm_total = nn_utils.apply_clipped_optimizer(
        opt, self.loss)



In [186]:
trg_len

[1, 2, 3, 4]

In [187]:


transformer = Transformer(num_wds_input)


sess = tf.Session()
sess.run(tf.global_variables_initializer())
for itr, train_batch in enumerate(train_it):
    src_tensor  = train_batch.src[0].data.cpu().numpy().transpose()
    src_len = train_batch.src[1].cpu().numpy()
    trg_tensor  = train_batch.trg[0].data.cpu().numpy().transpose()
    trg_len = train_batch.trg[1].cpu().numpy()
    trg_len = np.ceil(np.random.uniform(size=4)*(trg_len-1)).astype(int)
    trg_len = [1, 2, 3, 4]
#     print(src_tensor.shape, src_len.shape, trg_tensor.shape, trg_len.shape)
#     print(src_tensor, src_len, trg_tensor, trg_len)
    trn_feed_dict = {transformer.wd_ind_src : src_tensor, transformer.input_lengths : src_len,
                    transformer.wd_ind_trg : trg_tensor, transformer.output_lengths : trg_len,
                    transformer.learning_rate : 1e-2}
    _,loss = sess.run([transformer.optimizer, transformer.loss], trn_feed_dict)
    if itr % 100 == 0:
        print(loss)


6.860581
6.8824887
6.848736
6.8816223
6.452512
6.4300604
6.627413
6.5195527
6.6538987
6.570581
6.151683
5.1754093
5.641946
6.77978
3.501072
6.6031303
4.5910125
6.240979
6.0886965
6.618112
5.8758736
4.112165
5.871326
4.4997735
3.837925
7.826671
7.142421
3.970683
6.320285
6.3783855
4.25441
7.2489214
0.8430574
6.6169786
7.1854715
3.38065
4.4561653
4.8446665
4.2707424
4.2078886
3.0726619
6.6337357
2.455357
6.7401123
7.748409
3.767892
5.216974
6.6054926
2.642487
5.8652964
3.8745844
3.2137039
5.069334
5.487368
3.9541762
3.7334604
5.7096324
2.7235742
5.393903
6.703521
5.0366616
4.7399964
4.2854013
2.4378104
3.316927
2.82351
6.2732267
2.7403042
4.5424943
3.274647
3.0037863
5.1188736
3.0483043
4.6863527
4.045103
5.707782
1.4244723
6.1881843
3.3271418
3.8717227
5.4050965
3.4128418
1.3608876
0.69700176
2.3570175
5.038023
1.9138532
2.313255
3.6385365
4.1894193
7.2299967
5.0545893
4.1882067
5.243911
5.8925
2.3702826
4.5857315
4.4238787
5.096237
0.8655406
2.8627644
3.630424
1.1195713
2.9816017
3.211

KeyboardInterrupt: 

In [182]:
trg_len

[1, 2, 3, 4]

In [183]:
sess.run(transformer.prediction_mask, trn_feed_dict)

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)

In [156]:
trg_len

array([14, 20, 13, 14])

In [133]:
self = transformer

In [134]:
s, m = sess.run([self.encoder.attentionLayers[0].s, self.encoder.attentionLayers[0].mask], trn_feed_dict)

In [129]:
s[-1,-1,-1]

nan

In [130]:
m[-1,-1,-1]

0.0

In [128]:
(s * m)[-1, -1, -1]

nan

In [83]:
sess.run(self.wd_ind_src, trn_feed_dict).shape

(4, 33)

In [81]:
sess.run(self.encoder.attentionLayers[0].masked_softmax.v_exp_sum, trn_feed_dict).shape

(4, 1, 33)

In [90]:
sess.run(self.encoder.attentionLayers[0].masked_softmax.output, trn_feed_dict).shape

(4, 33, 33)

In [36]:
sess.run(tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=self.wd_ind_trg, logits=self.decoder.output_raw) * self.output_mask, trn_feed_dict)

array([[6.8806496, 7.2538576, 6.836245 , 6.8354425, 6.8945065, 7.029778 ,
        7.261009 , 7.1071267, 7.1146455, 6.8747187, 6.6536813, 7.0499544,
        7.261133 , 7.0428286, 7.2384424, 6.873402 , 7.1073694, 7.1073966,
        6.6534758, 7.0587964, 6.6961355, 6.5816846, 6.873296 , 7.0611773,
        6.934389 , 7.4266753, 6.873235 , 7.107667 , 7.1076937, 7.2384634,
        6.8879128, 7.261527 , 6.8138814, 7.107829 , 7.107856 , 7.2384715,
        6.873083 , 7.107937 , 7.107964 , 7.238476 , 7.0586863, 7.1080456,
        6.871643 , 6.872977 , 7.1081266, 6.7329335, 7.104479 ],
       [      nan,       nan,       nan,       nan,       nan,       nan,
              nan,       nan,       nan,       nan,       nan,       nan,
              nan,       nan,       nan,       nan,       nan,       nan,
              nan,       nan,       nan,       nan,       nan,       nan,
              nan,       nan,       nan,       nan,       nan,       nan,
              nan,       nan,       nan,       n