In [1]:
# !pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl numpy matplotlib torchtext 

In [3]:
# Standard PyTorch imports
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy
from torch.autograd import Variable

# For plots
%matplotlib inline
import matplotlib.pyplot as plt


import tensorflow as tf

#!conda install torchtext spacy
# !python -m spacy download en
# !python -m spacy download de

from torchtext import data
from torchtext import datasets

import re
import spacy

spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

url = re.compile('(<url>.*</url>)')


def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]


def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]


# Testing IWSLT
DE = data.Field(tokenize=tokenize_de, init_token='<bos>', eos_token='<eos>', include_lengths=True)
EN = data.Field(tokenize=tokenize_en, init_token='<bos>', eos_token='<eos>', include_lengths=True)

train, val, test = datasets.IWSLT.splits(exts=('.de', '.en'), fields=(DE, EN))


train_it = data.Iterator(train, batch_size=4, sort_within_batch=True, train=True, repeat=False, shuffle=True)
MIN_WORD_FREQ = 10
MAX_NUM_WORDS = 1000
DE.build_vocab(train.src, min_freq=MIN_WORD_FREQ, max_size=MAX_NUM_WORDS)
EN.build_vocab(train.trg, min_freq=MIN_WORD_FREQ, max_size=MAX_NUM_WORDS)

num_wds_input = len(DE.vocab.itos)
num_wds_output = len(EN.vocab.itos)

num_wds_input, num_wds_output



(1004, 1004)

In [7]:

from tensorflow.contrib.layers import layer_norm
import nn_utils

In [196]:

class masked_softmax:
    def __init__(self, v, mask, dim=1):
        #bs, query dimension, key dimension
        v_mask = v * mask
        v_max = tf.reduce_max(v_mask, dim, keep_dims=True)
        v_stable = v_mask - v_max

        v_exp = tf.exp(v_stable) * mask
        v_exp_sum = tf.reduce_sum(v_exp, dim, keep_dims=True)
        self.v_mask, self.v_max, self.v_stable, self.v_exp, self.v_exp_sum = \
            v_mask, v_max, v_stable, v_exp, v_exp_sum
        self.output =  v_exp / (v_exp_sum + 1e-20)


class Encoder:
  def __init__(self, num_wds, wd_ind, mask, ndims=20, n_layers=6):
    self.num_wds = num_wds
    self.wd_ind = wd_ind
    self.mask = mask
    self.length = tf.shape(self.wd_ind)[1]
    self.wd_emb = tf.Variable(
        tf.random_uniform([self.num_wds, ndims], minval=-1, maxval=1.))
    self.wd_vec = tf.nn.embedding_lookup(self.wd_emb, wd_ind)
    self.pos = tf.reshape(
        tf.range(tf.cast(self.length, tf.float32), dtype=tf.float32),
        (1, -1, 1))
    self.divider_exponent = tf.reshape(
        tf.range(tf.cast(ndims // 2, tf.float32)),
        (1, 1, -1)) * 2. / tf.cast(ndims, tf.float32)
    self.divider = tf.pow(10000., self.divider_exponent)
    self.input_to_sinusoids = self.pos / self.divider
    self.pos_sin = tf.sin(self.input_to_sinusoids)
    self.pos_cos = tf.cos(self.input_to_sinusoids)
    # self.position = tf.reshape(
    #     tf.range(tf.cast(self.length, tf.float32), dtype=tf.float32) / 10000,
    #     (1, -1, 1))
    self.position = tf.concat((self.pos_sin, self.pos_cos), -1)
    self.w_tilde = embedding = self.wd_vec + self.position
    self.encoding = []
    self.attentionLayers = []
    for _ in range(n_layers):
      attentionLayer = AttentionLayer(embedding, mask)
      embedding = attentionLayer.output
      self.encoding.append(embedding)
      self.attentionLayers.append(attentionLayer)


class AttentionLayer:
  def __init__(self, X, mask, X_decode=None, decode_mask=None, ff_layer=True):
    bs, length, ndim = [v.value for v in X.shape]
    self.X = X
    if X_decode is None:
      self.q, self.k, self.v = [
          tf.tanh(tf.layers.dense(X, ndim)) for _ in range(3)
      ]
      decode_mask = mask
    else:
      self.q = tf.tanh(tf.layers.dense(X_decode, ndim))
      self.k, self.v = [tf.tanh(tf.layers.dense(X, ndim)) for _ in range(2)]
    #batch, attention queries, attention keys, embeddings
    self.q_expanded = tf.expand_dims(self.q, 2)
    self.k_expanded = tf.expand_dims(self.k, 1)
    self.v_expanded = tf.expand_dims(self.v, 1)
    self.s_raw = tf.reduce_sum(self.q_expanded * self.k_expanded, -1)
    self.mask = tf.expand_dims(decode_mask, 2) * tf.expand_dims(mask, 1)
    self.masked_softmax = masked_softmax(self.s_raw, self.mask)
    self.s = self.masked_softmax.output
    self.a = tf.expand_dims(self.s * self.mask, -1) * self.v_expanded
    #A is shape bs, query, key, emb
    self.a_compressed = tf.reduce_sum(self.a, 2)
    if X_decode is None:
        self.e = layer_norm(self.a_compressed + X)
    else:
        self.e = layer_norm(self.a_compressed + X_decode)
    if ff_layer:
      self.output = layer_norm(tf.layers.dense(self.e, ndim) + self.e)
    else:
      self.output = self.e


class Decoder:
  def __init__(self, num_wds, wd_ind, mask, encoder, ndims=20, n_layers=6):
    self.num_wds = num_wds
    self.wd_ind = wd_ind
    self.mask = mask
    self.encoder = encoder
    self.length = tf.shape(self.wd_ind)[1]
    self.wd_emb = tf.Variable(
        tf.random_uniform([self.num_wds, ndims], minval=-1, maxval=1.))
    self.wd_vec = tf.nn.embedding_lookup(self.wd_emb, wd_ind)
    self.pos = tf.reshape(
        tf.range(tf.cast(self.length, tf.float32), dtype=tf.float32),
        (1, -1, 1))
    self.divider_exponent = tf.reshape(
        tf.range(tf.cast(ndims // 2, tf.float32)),
        (1, 1, -1)) * 2. / tf.cast(ndims, tf.float32)
    self.divider = tf.pow(10000., self.divider_exponent)
    self.input_to_sinusoids = self.pos / self.divider
    self.pos_sin = tf.sin(self.input_to_sinusoids)
    self.pos_cos = tf.cos(self.input_to_sinusoids)
    # self.position = tf.reshape(
    #     tf.range(tf.cast(self.length, tf.float32), dtype=tf.float32) / 10000,
    #     (1, -1, 1))
    self.position = tf.concat((self.pos_sin, self.pos_cos), -1)
    self.w_tilde = embedding = self.wd_vec + self.position
    self.decoding = []
    self.self_attentions = []
    self.encoder_attentions = []
    for l_idx in range(n_layers):
      attn = AttentionLayer(embedding, mask, ff_layer=False)
      self.self_attentions.append(attn)
      encode_attn = AttentionLayer(encoder.encoding[l_idx], encoder.mask,
                                   attn.output, mask)
      self.encoder_attentions.append(encode_attn)
      embedding = encode_attn.output

    self.output_raw = tf.layers.dense(embedding, num_wds)
    #bs, word in sentence of target, embedding
    
    self.masked_softmax = masked_softmax(self.output_raw, mask)
    self.output = self.masked_softmax.output


class Transformer:
  def __init__(self, num_wds):
    self.num_wds = num_wds
    n_layers = 6
    ndims = 256
    self.learning_rate = tf.placeholder(tf.float32, None)
    self.wd_ind_src = wd_ind_src = tf.placeholder(tf.int32, (None, None))
    self.wd_ind_trg = wd_ind_trg = tf.placeholder(tf.int32, (None, None))
    self.input_lengths = tf.placeholder(tf.int32, [None])
    self.output_lengths = tf.placeholder(tf.int32, [None])
    self.input_mask = tf.sequence_mask(
        self.input_lengths,
        maxlen=tf.shape(self.wd_ind_src)[-1],
        dtype=tf.float32)
    self.output_mask = tf.sequence_mask(
        self.output_lengths,
        maxlen=tf.shape(self.wd_ind_trg)[-1],
        dtype=tf.float32)
    self.encoder = Encoder(num_wds, wd_ind_src, self.input_mask, n_layers = n_layers, ndims=ndims)
    self.decoder = Decoder(num_wds, wd_ind_trg, self.output_mask, self.encoder, n_layers = n_layers, ndims=ndims)
    opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
    self.prediction_mask = tf.concat((tf.zeros((4, 1)), self.output_mask[:,:-1] - self.output_mask[:,1:]), 1)
    self.loss = tf.reduce_mean(tf.reduce_max(tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=self.wd_ind_trg, logits=self.decoder.output_raw) * self.prediction_mask
      , 1))
    self.optimizer, self.grad_norm_total = nn_utils.apply_clipped_optimizer(
        opt, self.loss)



In [197]:
trg_len

array([48, 40,  5,  8])

In [199]:


transformer = Transformer(num_wds_input)


sess = tf.Session()
sess.run(tf.global_variables_initializer())
for itr, train_batch in enumerate(train_it):
    src_tensor  = train_batch.src[0].data.cpu().numpy().transpose()
    src_len = train_batch.src[1].cpu().numpy()
    trg_tensor  = train_batch.trg[0].data.cpu().numpy().transpose()
    trg_len = train_batch.trg[1].cpu().numpy()
    trg_len = np.ceil(np.random.uniform(size=4)*(trg_len-1)).astype(int)
#     print(src_tensor.shape, src_len.shape, trg_tensor.shape, trg_len.shape)
#     print(src_tensor, src_len, trg_tensor, trg_len)
    trn_feed_dict = {transformer.wd_ind_src : src_tensor, transformer.input_lengths : src_len,
                    transformer.wd_ind_trg : trg_tensor, transformer.output_lengths : trg_len,
                    transformer.learning_rate : 1e-1/(np.sqrt(itr+10))}
    _,loss = sess.run([transformer.optimizer, transformer.loss], trn_feed_dict)
    if itr % 100 == 0:
        print(loss)


7.0972548
5.9469333
4.9471283
7.047677
4.637475
6.303015
5.666936
6.516633
7.1590414
5.4799085
3.4238195
4.277014
6.4966764
3.2230341
3.9685812
6.272038
9.040979
4.326608
6.0201874
4.604782
4.29444
4.8202276
4.115734
3.4810133
3.1874223
4.9142246
4.766762
5.8186984
2.026723
3.9846857
3.6307006
5.0194387
6.3504825
4.1456685
8.385155
4.372853
2.950831
4.0811396
3.2376623
4.899851
3.9161477
4.7154045
3.677544
3.348478
2.8299334
4.38752
2.5894117
3.9824686
3.0956092
3.8206644
3.0641854
4.551424
2.7172935
4.2081766
8.51865
7.632625
3.4116654
2.8299136
5.4199424
3.0794282
3.5242367
6.071144
3.2373786
5.4268823
4.7229548
3.1467752
2.0591443
4.6689434
7.292213
4.0677
2.1823807
4.8974495
4.9532795
1.698641
4.3734727
3.5681353
2.4968057
5.6583047
5.79937
4.279877
3.3856957
4.1305046
3.8604875
3.7507594
3.163558
3.2907314
2.750214
1.7517648
2.8162913
2.6895542
5.3631077
7.093939
2.7522635
3.5479999
3.337389
4.8492737
5.5955076
5.0379634
4.9813604
4.575586
4.8848042
3.5414953


ResourceExhaustedError: OOM when allocating tensor with shape[4,727,727,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: gradients_28/mul_911_grad/Mul_1 = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](ExpandDims_869, gradients_28/Sum_1943_grad/Tile)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused by op 'gradients_28/mul_911_grad/Mul_1', defined at:
  File "/home/lee/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/lee/anaconda3/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/lee/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/lee/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 478, in start
    self.io_loop.start()
  File "/home/lee/anaconda3/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/lee/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/lee/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-199-98d662d7a611>", line 3, in <module>
    transformer = Transformer(num_wds_input)
  File "<ipython-input-196-845265493e7b>", line 128, in __init__
    opt, self.loss)
  File "/home/lee/workspace/transformer/nn_utils.py", line 35, in apply_clipped_optimizer
    gvs = opt_fcn.compute_gradients(loss)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/optimizer.py", line 514, in compute_gradients
    colocate_gradients_with_ops=colocate_gradients_with_ops)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py", line 596, in gradients
    gate_gradients, aggregation_method, stop_gradients)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py", line 779, in _GradientsHelper
    lambda: grad_fn(op, *out_grads))
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py", line 398, in _MaybeCompile
    return grad_fn()  # Exit early
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py", line 779, in <lambda>
    lambda: grad_fn(op, *out_grads))
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py", line 912, in _MulGrad
    math_ops.reduce_sum(gen_math_ops.mul(x, grad), ry), sy))
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gen_math_ops.py", line 4936, in mul
    "Mul", x=x, y=y, name=name)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 454, in new_func
    return func(*args, **kwargs)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3155, in create_op
    op_def=op_def)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1717, in __init__
    self._traceback = tf_stack.extract_stack()

...which was originally created as op 'mul_911', defined at:
  File "/home/lee/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
[elided 19 identical lines from previous traceback]
  File "<ipython-input-199-98d662d7a611>", line 3, in <module>
    transformer = Transformer(num_wds_input)
  File "<ipython-input-196-845265493e7b>", line 121, in __init__
    self.decoder = Decoder(num_wds, wd_ind_trg, self.output_mask, self.encoder, n_layers = n_layers, ndims=ndims)
  File "<ipython-input-196-845265493e7b>", line 88, in __init__
    attn = AttentionLayer(embedding, mask, ff_layer=False)
  File "<ipython-input-196-845265493e7b>", line 57, in __init__
    self.a = tf.expand_dims(self.s * self.mask, -1) * self.v_expanded
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 850, in binary_op_wrapper
    return func(x, y, name=name)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 1094, in _mul_dispatch
    return gen_math_ops.mul(x, y, name=name)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gen_math_ops.py", line 4936, in mul
    "Mul", x=x, y=y, name=name)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 454, in new_func
    return func(*args, **kwargs)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3155, in create_op
    op_def=op_def)
  File "/home/lee/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1717, in __init__
    self._traceback = tf_stack.extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[4,727,727,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: gradients_28/mul_911_grad/Mul_1 = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](ExpandDims_869, gradients_28/Sum_1943_grad/Tile)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.



In [202]:
sess.run(tf.sin(180.))

-0.80115265

In [182]:
trg_len

[1, 2, 3, 4]

In [183]:
sess.run(transformer.prediction_mask, trn_feed_dict)

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)

In [156]:
trg_len

array([14, 20, 13, 14])

In [133]:
self = transformer

In [134]:
s, m = sess.run([self.encoder.attentionLayers[0].s, self.encoder.attentionLayers[0].mask], trn_feed_dict)

In [129]:
s[-1,-1,-1]

nan

In [130]:
m[-1,-1,-1]

0.0

In [128]:
(s * m)[-1, -1, -1]

nan

In [83]:
sess.run(self.wd_ind_src, trn_feed_dict).shape

(4, 33)

In [81]:
sess.run(self.encoder.attentionLayers[0].masked_softmax.v_exp_sum, trn_feed_dict).shape

(4, 1, 33)

In [90]:
sess.run(self.encoder.attentionLayers[0].masked_softmax.output, trn_feed_dict).shape

(4, 33, 33)

In [36]:
sess.run(tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=self.wd_ind_trg, logits=self.decoder.output_raw) * self.output_mask, trn_feed_dict)

array([[6.8806496, 7.2538576, 6.836245 , 6.8354425, 6.8945065, 7.029778 ,
        7.261009 , 7.1071267, 7.1146455, 6.8747187, 6.6536813, 7.0499544,
        7.261133 , 7.0428286, 7.2384424, 6.873402 , 7.1073694, 7.1073966,
        6.6534758, 7.0587964, 6.6961355, 6.5816846, 6.873296 , 7.0611773,
        6.934389 , 7.4266753, 6.873235 , 7.107667 , 7.1076937, 7.2384634,
        6.8879128, 7.261527 , 6.8138814, 7.107829 , 7.107856 , 7.2384715,
        6.873083 , 7.107937 , 7.107964 , 7.238476 , 7.0586863, 7.1080456,
        6.871643 , 6.872977 , 7.1081266, 6.7329335, 7.104479 ],
       [      nan,       nan,       nan,       nan,       nan,       nan,
              nan,       nan,       nan,       nan,       nan,       nan,
              nan,       nan,       nan,       nan,       nan,       nan,
              nan,       nan,       nan,       nan,       nan,       nan,
              nan,       nan,       nan,       nan,       nan,       nan,
              nan,       nan,       nan,       n