In [1]:
# !pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl numpy matplotlib torchtext 

In [3]:
# Standard PyTorch imports
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy
from torch.autograd import Variable

# For plots
%matplotlib inline
import matplotlib.pyplot as plt


import tensorflow as tf

#!conda install torchtext spacy
!python -m spacy download en
!python -m spacy download de

from torchtext import data
from torchtext import datasets

import re
import spacy

# Load the German and English spaCy pipelines through the 'de' / 'en' shortcut
# names that the `python -m spacy download` commands earlier in this cell link.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

# Matches a single <url>...</url> span. The quantifier is non-greedy so that two
# URL tags on one line are replaced separately and the text between them survives
# (a greedy `.*` would swallow everything from the first <url> to the last </url>).
url = re.compile(r'(<url>.*?</url>)')


def tokenize_de(text):
    """Split German `text` into token strings with spaCy.

    Anything matched by the module-level `url` pattern is replaced by the
    placeholder '@URL@' before the text reaches the tokenizer.
    """
    cleaned = url.sub('@URL@', text)
    tokens = []
    for tok in spacy_de.tokenizer(cleaned):
        tokens.append(tok.text)
    return tokens


def tokenize_en(text):
    """Split English `text` into token strings with spaCy.

    Anything matched by the module-level `url` pattern is replaced by the
    placeholder '@URL@' before the text reaches the tokenizer.
    """
    doc = spacy_en.tokenizer(url.sub('@URL@', text))
    return list(map(lambda tok: tok.text, doc))


# Testing IWSLT: German is the source side (.de -> DE), English the target (.en -> EN).
# Both fields wrap every sentence in <bos>/<eos>. include_lengths=True is what lets
# later cells read batch.src / batch.trg as a (tensor, lengths) pair.
DE = data.Field(tokenize=tokenize_de, init_token='<bos>', eos_token='<eos>', include_lengths=True)
EN = data.Field(tokenize=tokenize_en, init_token='<bos>', eos_token='<eos>', include_lengths=True)

train, val, test = datasets.IWSLT.splits(exts=('.de', '.en'), fields=(DE, EN))


# Small shuffled batches over the training split; repeat=False stops after one pass.
train_it = data.Iterator(train, batch_size=4, sort_within_batch=True, train=True, repeat=False, shuffle=True)
# Vocabulary limits: drop words seen fewer than MIN_WORD_FREQ times and keep at most
# MAX_NUM_WORDS words (the recorded vocab size of 1004 is this cap plus the four
# special tokens <unk>, <pad>, <bos>, <eos>).
MIN_WORD_FREQ = 10
MAX_NUM_WORDS = 1000
DE.build_vocab(train.src, min_freq=MIN_WORD_FREQ, max_size=MAX_NUM_WORDS)
EN.build_vocab(train.trg, min_freq=MIN_WORD_FREQ, max_size=MAX_NUM_WORDS)

# Peek at the first few entries of the German vocabulary.
DE.vocab.itos[:7]


[93m    Linking successful[0m
    /home/lee/anaconda3/lib/python3.6/site-packages/en_core_web_sm -->
    /home/lee/anaconda3/lib/python3.6/site-packages/spacy/data/en

    You can now load the model via spacy.load('en')

Collecting de_core_news_sm==2.0.0 from https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.0.0/de_core_news_sm-2.0.0.tar.gz#egg=de_core_news_sm==2.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.0.0/de_core_news_sm-2.0.0.tar.gz (38.2MB)
[K    100% |████████████████████████████████| 38.2MB 1.0MB/s ta 0:00:0101 15% |████▉                           | 5.8MB 1.2MB/s eta 0:00:28
[?25hInstalling collected packages: de-core-news-sm
  Running setup.py install for de-core-news-sm ... [?25ldone
[?25hSuccessfully installed de-core-news-sm-2.0.0

[93m    Linking successful[0m
    /home/lee/anaconda3/lib/python3.6/site-packages/de_core_news_sm -->
    /home/lee/anaconda3/lib/python3.6/site-packa

['<unk>', '<pad>', '<bos>', '<eos>', ',', '.', 'die']

In [4]:
len(DE.vocab.itos)

1004

In [5]:
X = tf.ones((5, 4, 3))

In [6]:
xflat = tf.reshape(X, (-1, 3))

In [7]:
xflat

<tf.Tensor 'Reshape:0' shape=(20, 3) dtype=float32>

In [8]:
xflat.shape[-1].value

3

In [9]:
bs, length, ndims = [v.value for v in X.shape]


In [10]:
bs

5

In [11]:
q, k, v = [tf.layers.dense(X, 3) for _ in range(3)]

In [12]:
# Insert singleton axes so q and k broadcast against each other: for the (5, 4, 3)
# toy input, q_expanded is (5, 1, 4, 3) and k_expanded is (5, 4, 1, 3).
q_expanded = tf.expand_dims(q, 1)
k_expanded = tf.expand_dims(k, 2)

In [13]:
q_expanded.shape, k_expanded.shape

(TensorShape([Dimension(5), Dimension(1), Dimension(4), Dimension(3)]),
 TensorShape([Dimension(5), Dimension(4), Dimension(1), Dimension(3)]))

In [14]:
tf.reduce_sum(q_expanded * k_expanded, -1).shape

TensorShape([Dimension(5), Dimension(4), Dimension(4)])

In [15]:
s_raw = tf.reduce_sum(q_expanded * k_expanded, -1)

In [16]:
# Softmax along axis 1 of the (5, 4, 4) score tensor, then append a trailing
# singleton axis so the weights can broadcast against a 4-D value tensor.
s = tf.expand_dims(tf.nn.softmax(s_raw, 1), -1)

In [17]:
# Values get their singleton on axis 1, giving (5, 1, 4, 3).
# NOTE(review): in `v_expanded * s` this lines the value positions up with axis 2
# of s, while the softmax normalises (and the later reduce_sum collapses) axis 1.
# The weights then sum to 1 for every value row, so `a` looks equal to `v` itself;
# expanding on axis 2 instead may be what was intended -- confirm.
v_expanded = tf.expand_dims(v, 1)

In [18]:
v_expanded.shape

TensorShape([Dimension(5), Dimension(1), Dimension(4), Dimension(3)])

In [19]:
s.shape

TensorShape([Dimension(5), Dimension(4), Dimension(4), Dimension(1)])

In [20]:
# Weighted sum over axis 1 (the axis the softmax normalised); the result is (5, 4, 3).
a = tf.reduce_sum(v_expanded * s, 1)

In [21]:
a.shape

TensorShape([Dimension(5), Dimension(4), Dimension(3)])

In [22]:
from tensorflow.contrib.layers import layer_norm

In [23]:
tf.reduce_max(a, (1, 2))

<tf.Tensor 'Max:0' shape=(5,) dtype=float32>

In [38]:
import pdb
from nn_utils import *

def masked_softmax(v, mask, dim=1):
    """Softmax of `v` along `dim`, restricted to the entries kept by `mask`.

    Args:
        v: rank-3 tensor of raw scores. The stabilising maximum is taken over
            axes 1 and 2 and reshaped to (-1, 1, 1), so other ranks are not supported.
        mask: multiplicative mask broadcastable against `v` (expected to hold
            1.0 for entries to keep and 0.0 for entries to drop).
        dim: axis along which the kept entries are normalised.

    Returns:
        Tensor shaped like `v * mask`. Kept entries sum to 1 along `dim` and
        dropped entries are 0. A slice without any kept entry comes back as all
        zeros instead of the NaN that 0 / 0 would produce.
    """
    v_masked = v * mask
    # Per-example maximum, subtracted purely for the numerical stability of exp().
    v_max = tf.reshape(tf.reduce_max(v_masked, (1, 2)), (-1, 1, 1))
    v_exp_masked = tf.exp(v_masked - v_max) * mask
    v_exp_summed = tf.expand_dims(tf.reduce_sum(v_exp_masked, dim), dim)
    # A fully masked slice sums to exactly 0. Swap that 0 for 1 so the division
    # gives 0 rather than NaN; slices with a positive sum are left untouched.
    is_empty = tf.cast(tf.equal(v_exp_summed, 0.), v_exp_summed.dtype)

    return v_exp_masked / (v_exp_summed + is_empty)
    
    
class AttentionLayer:
    """Single dot-product attention block with residual connection and layer norm.

    With only `X` the block is self-attention. When `X_decode` is given it is
    encoder-decoder attention: queries come from `X_decode`, keys and values from `X`.

    Args:
        X: (batch, length, ndims) tensor supplying keys and values (and the queries
            in the self-attention case). The last dimension must be statically known.
        mask: (batch, length) float mask for the positions of `X`.
        X_decode: optional tensor supplying the queries.
        decode_mask: optional mask for the positions of `X_decode`; `mask` is used
            for the queries when it is omitted.
        ff_layer: when True, a dense layer with its own residual connection and
            layer norm is applied on top of the attention result.

    The graph tensors are kept as attributes for inspection; `output` is the
    tensor callers should consume.
    """
    def __init__(self, X, mask, X_decode = None, decode_mask = None, ff_layer = True):
        ndims = X.shape[-1].value
        if X_decode is None:
            self.q, self.k, self.v = [tf.tanh(tf.layers.dense(X, ndims)) for _ in range(3)]
        else:
            self.k, self.v = [tf.tanh(tf.layers.dense(X, ndims)) for _ in range(2)]
            self.q = tf.tanh(tf.layers.dense(X_decode, ndims))
        # Score axes are (batch, key position, query position):
        # s_raw[b, i, j] is the dot product of key i with query j.
        self.q_expanded = tf.expand_dims(self.q, 1)
        self.k_expanded = tf.expand_dims(self.k, 2)
        self.s_raw = tf.reduce_sum(self.q_expanded * self.k_expanded, -1)
        # The mask has to follow the same axis order as the scores: keys (always
        # taken from X, hence `mask`) on axis 1, queries on axis 2. The transposed
        # layout only happens to work when both sides have the same length.
        key_mask = tf.expand_dims(mask, 2)
        query_mask = tf.expand_dims(mask if decode_mask is None else decode_mask, 1)
        self.combined_mask = combined_mask = key_mask * query_mask
        # Normalise over the key axis so each query's weights sum to 1.
        self.s = masked_softmax(self.s_raw, combined_mask, dim=1)
        # Values belong to key positions, so they get their singleton on the query
        # axis; the weights get a trailing axis to broadcast over the features.
        self.v_expanded = tf.expand_dims(self.v, 2)
        self.a = tf.reduce_sum(self.v_expanded * tf.expand_dims(self.s, -1), 1)
        # Residual connection around the attention, on the query-side input.
        residual_identity = X if X_decode is None else X_decode
        self.e_raw = self.a + residual_identity
        # NOTE(review): contrib layer_norm is called with its defaults; check that
        # its normalisation axes are the ones wanted for padded sequences.
        self.e = layer_norm(self.e_raw)
        if ff_layer:
            # Feed the attention result through the dense sub-layer. Using the raw
            # input X here would discard everything computed above.
            self.e_tilde_raw = tf.layers.dense(self.e, ndims) + self.e
            self.e_tilde = layer_norm(self.e_tilde_raw)
            self.output = self.e_tilde
        else:
            self.output = self.e
class Encoder:
    """Embeds source word indices and runs them through stacked attention layers.

    Args:
        num_wds: number of rows in the embedding table.
        wd_ind: integer tensor of word indices; axis 1 is read as the sequence length.
        mask: mask handed unchanged to every AttentionLayer.
        ndims: embedding width.
        n_layers: number of AttentionLayer blocks to stack.

    `encodings` collects the output of every layer, in order.
    """
    def __init__(self, num_wds, wd_ind, mask, ndims = 20, n_layers = 2):
        self.wd_ind, self.num_wds, self.mask = wd_ind, num_wds, mask

        # Embedding table, uniformly initialised in [-1, 1).
        initial_emb = tf.random_uniform([num_wds, ndims], minval = -1, maxval = 1.)
        self.wd_emb = tf.Variable(initial_emb)
        self.length = tf.shape(wd_ind)[1]
        self.wd_vec = tf.nn.embedding_lookup(self.wd_emb, wd_ind)

        # Position signal: the position index divided by 10000, shaped (1, length, 1)
        # so that it broadcasts over the batch and over the embedding width.
        float_length = tf.cast(self.length, tf.float32)
        scaled_index = tf.range(float_length, dtype=tf.float32) / 10000
        self.position = tf.reshape(scaled_index, (1, -1, 1))

        # Each layer consumes the previous layer's output; every output is kept.
        self.encodings = []
        layer_input = self.wd_vec + self.position
        for _ in range(n_layers):
            layer_input = AttentionLayer(layer_input, mask).output
            self.encodings.append(layer_input)
        
        
class Decoder:
    """Embeds target word indices and decodes them against an Encoder's outputs.

    Each layer applies self-attention over the target sequence (without the dense
    sub-layer) and then attention against the encoder output of the same depth.

    Args:
        num_wds: number of rows in the embedding table and width of the output logits.
        wd_ind: integer tensor of target word indices; axis 1 is the sequence length.
        mask: (batch, length) float mask for the target positions.
        encoder: Encoder instance; its `encodings` and `mask` are read.
        ndims: embedding width.
        n_layers: number of decoder layers. Layer `idx` reads `encoder.encodings[idx]`,
            so the encoder needs at least this many layers.

    Attributes of note: `presoftmax_output` holds the logits and `output` the
    per-position word distribution, zeroed at masked positions.
    """
    def __init__(self, num_wds, wd_ind, mask, encoder, ndims = 20, n_layers = 2):
        self.encoder = encoder
        self.encodings = encoder.encodings
        input_mask = encoder.mask
        self.num_wds = num_wds
        self.wd_ind = wd_ind
        self.length = tf.shape(self.wd_ind)[1]
        self.wd_emb = tf.Variable(
            tf.random_uniform([self.num_wds, ndims],minval = -1, maxval = 1.))
        self.wd_vec = tf.nn.embedding_lookup(self.wd_emb, wd_ind)
        # Position signal: index / 10000, broadcast over batch and embedding width.
        self.position = tf.reshape(tf.range(tf.cast(self.length, tf.float32), dtype=tf.float32)/10000, (1, -1, 1))
        last_encoding = self.wd_vec + self.position
        self.self_attentions = []
        self.second_attentions = []
        # NOTE(review): the self-attention receives only the padding mask, so every
        # target position can attend to later positions -- confirm this is intended.
        for idx in range(n_layers):
            encodings = self.encodings[idx]
            self_attention = AttentionLayer(last_encoding, mask, ff_layer = False).output
            second_attention = AttentionLayer(encodings, input_mask, X_decode = self_attention,
                                             decode_mask = mask).output
            last_encoding = second_attention
            self.self_attentions.append(self_attention)
            self.second_attentions.append(second_attention)

        self.presoftmax_output = tf.layers.dense(last_encoding, num_wds)
        # Distribution over the vocabulary (last axis) for every position. The mask
        # is (batch, length), so it needs a trailing axis to zero out padded rows.
        self.output = tf.nn.softmax(self.presoftmax_output) * tf.expand_dims(mask, -1)



class Transformer:
    """Builds the full graph: placeholders, Encoder, Decoder, loss and train op.

    Args:
        num_wds: vocabulary size, used for both the encoder and the decoder.

    Placeholders to feed: `wd_ind_src` / `wd_ind_trg` (int32 word indices, batch by
    length), `input_lengths` / `output_lengths` (int32, one length per example) and
    `learning_rate`. `loss` is the mean cross-entropy over unmasked target tokens.
    """
    def __init__(self, num_wds):
        self.num_wds = num_wds
        self.learning_rate = tf.placeholder(tf.float32, ())
        self.wd_ind_src = wd_ind_src = tf.placeholder(tf.int32, (None, None))
        self.wd_ind_trg = wd_ind_trg = tf.placeholder(tf.int32, (None, None))
        self.input_lengths = tf.placeholder(tf.int32, [None])
        self.output_lengths = tf.placeholder(tf.int32, [None])
        # Float 0/1 masks derived from the lengths, as wide as the fed index tensors.
        self.input_mask = tf.sequence_mask(
            self.input_lengths, maxlen = tf.shape(self.wd_ind_src)[-1], dtype = tf.float32)
        self.output_mask = tf.sequence_mask(
            self.output_lengths, maxlen = tf.shape(self.wd_ind_trg)[-1], dtype = tf.float32)
        self.encoder = Encoder(num_wds, wd_ind_src, self.input_mask)
        self.decoder = Decoder(num_wds, wd_ind_trg, self.output_mask, self.encoder)
        self.presoftmax_output = self.decoder.presoftmax_output
        self.output = self.decoder.output
        opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        # The labels are integer word indices, not distributions over the vocabulary,
        # so the sparse cross-entropy is the matching op.
        # NOTE(review): the same placeholder feeds the decoder input and the labels,
        # with no one-step shift between them -- confirm this is intended.
        token_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels = self.wd_ind_trg, logits = self.presoftmax_output)
        # Average over real tokens only; the maximum() guards an all-padding batch.
        num_tokens = tf.maximum(tf.reduce_sum(self.output_mask), 1.)
        self.loss = loss = tf.reduce_sum(token_loss * self.output_mask) / num_tokens
        # apply_clipped_optimizer comes from nn_utils; presumably it returns the
        # train op and the total gradient norm -- verify against that module.
        self.optimizer, self.grad_norm_total = apply_clipped_optimizer(opt, loss)

In [39]:
transfomer = Transformer(20)

> <ipython-input-38-38e0ca99934a>(114)__init__()
-> self.input_mask = tf.sequence_mask(
(Pdb) l
109  	        self.wd_ind_src = wd_ind_src = tf.placeholder(tf.int32, (None, None))
110  	        self.wd_ind_trg = wd_ind_trg = tf.placeholder(tf.int32, (None, None))
111  	        self.input_lengths = tf.placeholder(tf.int32, (None))
112  	        self.output_lengths = tf.placeholder(tf.int32, (None))
113  	        pdb.set_trace()
114  ->	        self.input_mask = tf.sequence_mask(
115  	            self.input_lengths, maxlen = tf.shape(self.wd_ind_src)[-1], dtype = tf.float32)
116  	        self.output_mask = tf.sequence_mask(
117  	            self.output_lengths, maxlen = tf.shape(self.wd_ind_trg)[-1], dtype = tf.float32)
118  	        self.encoder = Encoder(num_wds, wd_ind_src, self.input_mask)
119  	        self.decoder = Decoder(num_wds, wd_ind_trg, self.output_mask, self.encoder)
(Pdb) tf.sequence_mask(self.input_lengths, maxlen = tf.shape(self.wd_ind_src)[-1], dtype = tf.float32)
<

BdbQuit: 

In [None]:
enc.encodings

In [None]:
# Pull one batch from the training iterator and show its source/target index
# tensors and length vectors as numpy arrays.
for train_batch in train_it:
    src_tensor  = train_batch.src[0].data.cpu().numpy()
    src_len = train_batch.src[1].cpu().numpy()
    trg_tensor  = train_batch.trg[0].data.cpu().numpy()
    trg_len = train_batch.trg[1].cpu().numpy()
    print(src_tensor.shape, src_len.shape, trg_tensor.shape, trg_len.shape)
    print(src_tensor, src_len, trg_tensor, trg_len)
    # `return` is a SyntaxError outside a function; `break` stops after the first batch.
    break

In [None]:
train_batch

In [None]:
train_batch.src[0].data

In [None]:
from torchtext.datasets import WMT14

In [None]:
WMT14('data/', ('.en', '.de'))

In [None]:
# Load words from IWSLT

#!pip install torchtext spacy
#!python -m spacy download en
#!python -m spacy download de
from torchtext import data, datasets
import spacy
# Rebinds spacy_de / spacy_en, which an earlier cell of this notebook also creates.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text):
    """Return the spaCy German tokenization of `text` as a list of token strings."""
    pieces = []
    for token in spacy_de.tokenizer(text):
        pieces.append(token.text)
    return pieces

def tokenize_en(text):
    """Return the spaCy English tokenization of `text` as a list of token strings."""
    doc = spacy_en.tokenizer(text)
    return list(map(lambda token: token.text, doc))

# Special tokens: sentence start / end markers for the target side and the
# padding token shared by both fields.
BOS_WORD = '<s>'
EOS_WORD = '</s>'
BLANK_WORD = "<blank>"
# German source field (no start/end markers) and English target field.
SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD)
TGT = data.Field(tokenize=tokenize_en, init_token = BOS_WORD, 
                 eos_token = EOS_WORD, pad_token=BLANK_WORD)

# Keep only sentence pairs where both sides have at most MAX_LEN tokens.
MAX_LEN = 100
train, val, test = datasets.IWSLT.splits(exts=('.de', '.en'), fields=(SRC, TGT), 
                                         filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and 
                                         len(vars(x)['trg']) <= MAX_LEN)
# min_freq of 1 keeps every word seen in the training split. Note that this cell
# rebinds train / val / test from the earlier IWSLT cell.
MIN_FREQ = 1
SRC.build_vocab(train.src, min_freq=MIN_FREQ)
TGT.build_vocab(train.trg, min_freq=MIN_FREQ)

In [None]:
dir(train)

In [None]:
!pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl numpy matplotlib spacy torchtext seaborn 