In [1]:
#!pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl numpy matplotlib torchtext 

In [2]:
# Standard PyTorch imports
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy
from torch.autograd import Variable

# For plots
%matplotlib inline
import matplotlib.pyplot as plt


import tensorflow as tf

# !conda install torchtext spacy
# !python -m spacy download en
# !python -m spacy download fr

from torchtext import data
from torchtext import datasets

import re
import spacy

spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

url = re.compile('(<url>.*</url>)')


def tokenize_de(text):
    """Split German ``text`` into a list of token strings.

    Every match of the module-level ``url`` pattern is replaced by the
    placeholder ``@URL@`` before the text is handed to the ``spacy_de``
    tokenizer.
    """
    cleaned = url.sub('@URL@', text)
    tokens = []
    for tok in spacy_de.tokenizer(cleaned):
        tokens.append(tok.text)
    return tokens


def tokenize_en(text):
    """Split English ``text`` into a list of token strings.

    Every match of the module-level ``url`` pattern is replaced by the
    placeholder ``@URL@`` before the text is handed to the ``spacy_en``
    tokenizer.
    """
    cleaned = url.sub('@URL@', text)
    tokens = []
    for tok in spacy_en.tokenizer(cleaned):
        tokens.append(tok.text)
    return tokens


# Testing IWSLT
# One Field per language: German uses tokenize_de, English uses tokenize_en.
# Both wrap sentences in <bos>/<eos>. include_lengths=True is set on both;
# the batch-inspection cell further down indexes each batch field as
# [0] (token ids) and [1] (lengths).
DE = data.Field(tokenize=tokenize_de, init_token='<bos>', eos_token='<eos>', include_lengths=True)
EN = data.Field(tokenize=tokenize_en, init_token='<bos>', eos_token='<eos>', include_lengths=True)

# German files ('.de') are bound to DE and English files ('.en') to EN.
train, val, test = datasets.IWSLT.splits(exts=('.de', '.en'), fields=(DE, EN))


# Small shuffled, non-repeating iterator over the training split (4 examples per batch).
train_it = data.Iterator(train, batch_size=4, sort_within_batch=True, train=True, repeat=False, shuffle=True)
# Vocabulary limits passed to build_vocab as min_freq / max_size.
MIN_WORD_FREQ = 10
MAX_NUM_WORDS = 1000
DE.build_vocab(train.src, min_freq=MIN_WORD_FREQ, max_size=MAX_NUM_WORDS)
EN.build_vocab(train.trg, min_freq=MIN_WORD_FREQ, max_size=MAX_NUM_WORDS)

# Show the first vocabulary entries; the recorded output lists
# <unk>, <pad>, <bos>, <eos> ahead of the regular tokens.
DE.vocab.itos[:7]

  from ._conv import register_converters as _register_converters


['<unk>', '<pad>', '<bos>', '<eos>', ',', '.', 'die']

In [3]:
len(DE.vocab.itos)

1004

In [4]:
X = tf.ones((5, 4, 3))

In [5]:
xflat = tf.reshape(X, (-1, 3))

In [6]:
xflat

<tf.Tensor 'Reshape:0' shape=(20, 3) dtype=float32>

In [7]:
xflat.shape[-1].value

3

In [8]:
bs, length, ndims = [v.value for v in X.shape]


In [9]:
bs

5

In [10]:
q, k, v = [tf.layers.dense(X, 3) for _ in range(3)]

In [11]:
q_expanded = tf.expand_dims(q, 1)
k_expanded = tf.expand_dims(k, 2)

In [12]:
q_expanded.shape, k_expanded.shape

(TensorShape([Dimension(5), Dimension(1), Dimension(4), Dimension(3)]),
 TensorShape([Dimension(5), Dimension(4), Dimension(1), Dimension(3)]))

In [13]:
tf.reduce_sum(q_expanded * k_expanded, -1).shape

TensorShape([Dimension(5), Dimension(4), Dimension(4)])

In [14]:
s_raw = tf.reduce_sum(q_expanded * k_expanded, -1)

In [15]:
s = tf.expand_dims(tf.nn.softmax(s_raw, 1), -1)

In [16]:
v_expanded = tf.expand_dims(v, 1)

In [17]:
v_expanded.shape

TensorShape([Dimension(5), Dimension(1), Dimension(4), Dimension(3)])

In [18]:
s.shape

TensorShape([Dimension(5), Dimension(4), Dimension(4), Dimension(1)])

In [19]:
a = tf.reduce_sum(v_expanded * s, 1)

In [20]:
a.shape

TensorShape([Dimension(5), Dimension(4), Dimension(3)])

In [21]:
from tensorflow.contrib.layers import layer_norm

In [28]:
import pdb
from nn_utils import *

def apply_clipped_optimizer(opt_fcn,
                            loss,
                            clip_norm=.1,
                            clip_single=.03,
                            clip_global_norm=False):
    """Create an update op that clips gradients before applying them.

    Args:
        opt_fcn: optimizer object providing ``compute_gradients`` and
            ``apply_gradients``.
        loss: tensor whose gradients are computed.
        clip_norm: norm threshold, given to ``tf.clip_by_global_norm`` or
            ``tf.clip_by_norm`` depending on ``clip_global_norm``.
        clip_single: element-wise bound; each gradient entry is limited to
            ``[-clip_single, clip_single]``. Only used when
            ``clip_global_norm`` is False.
        clip_global_norm: if True, clip all gradients jointly with
            ``tf.clip_by_global_norm``; otherwise clip every gradient
            element-wise and then by its own norm.

    Returns:
        Tuple ``(update_op, grad_norm_total)``. ``grad_norm_total`` is the
        norm reported by ``tf.clip_by_global_norm`` in the global branch,
        and the square root of the summed squares of the unclipped
        gradients otherwise.
    """
    grads_and_vars = opt_fcn.compute_gradients(loss)
    # Pairs whose gradient is None are left out of every later step.
    usable = [(grad, var) for grad, var in grads_and_vars if grad is not None]

    if clip_global_norm:
        grads, variables = zip(*usable)
        clipped, grad_norm_total = tf.clip_by_global_norm(list(grads),
                                                          clip_norm)
        capped_gvs = list(zip(clipped, variables))
    else:
        squared_sums = [tf.reduce_sum(tf.square(grad)) for grad, _ in usable]
        grad_norm_total = tf.sqrt(tf.reduce_sum(squared_sums))
        # First pass: bound each entry; second pass: bound each tensor's norm.
        bounded = [(tf.clip_by_value(grad, -1 * clip_single, clip_single), var)
                   for grad, var in usable]
        capped_gvs = [(tf.clip_by_norm(grad, clip_norm), var)
                      for grad, var in bounded]

    update_op = opt_fcn.apply_gradients(
        capped_gvs, global_step=tf.train.get_global_step())

    return update_op, grad_norm_total


# def LinearResNorm(X):
#     X = tf.layers.dense(X, X.shape[-1].value) + X
#     return layer_norm(X)
    
class AttentionLayer:
    """Single-head dot-product attention with residual connection and layer norm.

    With ``X_decode`` left as None the layer attends from ``X`` to itself.
    When ``X_decode`` is given, the queries are projected from ``X_decode``
    and the keys and values from ``X``, so the result has one row per
    position of ``X_decode`` and the two inputs may differ in length.

    Args:
        X: rank-3 tensor; its last static dimension is used as the
            projection width. Assumed to be (batch, length, emb).
        input_mask: optional tensor with 1 at real positions of ``X`` and 0
            at padding, assumed to be (batch, length). Masked key positions
            receive (near) zero attention weight. None disables masking.
        X_decode: optional rank-3 tensor supplying the queries and the
            residual branch; its last dimension must equal that of ``X``.
        ff_layer: if True, a dense + residual + layer-norm stage is applied
            on top of the attention result.

    Attributes:
        q, k, v: tanh-activated dense projections.
        s_raw: unmasked scores, axes (batch, key position, query position).
        s: attention weights, normalised over the key axis, with a trailing
            singleton axis.
        a: attention result, one vector per query position.
        e: layer-normalised ``a`` plus residual.
        output: ``e_tilde`` when ``ff_layer`` is True, otherwise ``e``.
    """
    def __init__(self, X, input_mask = None, X_decode = None, ff_layer = True):
        #If X_decode is not none, this is the decoder module that takes in two embeddings
        #Otherwise, this is the standard self-attention layer
        ndims = X.shape[-1].value
        if X_decode is None:
            query_source = X
        else:
            query_source = X_decode
        # Queries follow the sequence being updated; keys and values both
        # come from X so the weights and the values share the same axis.
        self.q = tf.tanh(tf.layers.dense(query_source, ndims))
        self.k, self.v = [tf.tanh(tf.layers.dense(X, ndims)) for _ in range(2)]
        #dimensions are batch, key position, query position, emb
        self.q_expanded = tf.expand_dims(self.q, 1)
        self.k_expanded = tf.expand_dims(self.k, 2)
        self.s_raw = tf.reduce_sum(self.q_expanded * self.k_expanded, -1)
        scores = self.s_raw
        if input_mask is not None:
            # Push padded key positions to a large negative score so the
            # softmax gives them (near) zero weight.
            key_mask = tf.expand_dims(tf.cast(input_mask, scores.dtype), -1)
            scores = scores + (1.0 - key_mask) * -1e9
        # Normalise over the key axis (1) ...
        self.s = tf.expand_dims(tf.nn.softmax(scores, 1), -1)
        # ... and sum the values over that same axis, so that each query
        # receives a convex combination of the value vectors.
        self.v_expanded = tf.expand_dims(self.v, 2)
        self.a = tf.reduce_sum(self.v_expanded * self.s, 1)
        self.e_raw = self.a + query_source
        self.e = layer_norm(self.e_raw)
        if ff_layer:
            # The feed-forward stage consumes the attention result, not the
            # layer input; otherwise the attention would be discarded.
            self.e_tilde_raw = tf.layers.dense(self.e, ndims) + self.e
            self.e_tilde = layer_norm(self.e_tilde_raw)
            self.output = self.e_tilde
        else:
            self.output = self.e
class Encoder:
    """Embeds word indices, adds a position signal and stacks attention layers.

    Args:
        num_wds: number of rows in the embedding table.
        wd_ind: integer tensor of word indices; axis 1 is read as the
            sequence length.
        mask: forwarded to every AttentionLayer as ``input_mask``.
        ndims: embedding width.
        n_layers: number of stacked AttentionLayer instances.

    Attributes:
        encodings: list holding the output of each layer, in stacking order.
    """
    def __init__(self, num_wds, wd_ind, mask, ndims = 20, n_layers = 2):
        self.wd_ind = wd_ind
        self.num_wds = num_wds
        # Embedding table initialised uniformly in [-1, 1).
        initial_table = tf.random_uniform(
            [self.num_wds, ndims], minval = -1, maxval = 1.)
        self.wd_emb = tf.Variable(initial_table)
        self.length = tf.shape(self.wd_ind)[1]
        self.wd_vec = tf.nn.embedding_lookup(self.wd_emb, wd_ind)
        # Position signal: index / 10000, shaped (1, length, 1) so that it
        # broadcasts over the batch and embedding axes.
        limit = tf.cast(self.length, tf.float32)
        scaled_steps = tf.range(limit, dtype=tf.float32) / 10000
        self.position = tf.reshape(scaled_steps, (1, -1, 1))

        # Per-layer outputs are kept so that later code can read each of them.
        self.encodings = []
        layer_input = self.wd_vec + self.position
        for _layer in range(n_layers):
            layer_input = AttentionLayer(layer_input, input_mask = mask).output
            self.encodings.append(layer_input)
        
        
class Decoder:
    """Stacks self-attention and encoder-attention layers over target words.

    Args:
        num_wds: number of rows in the embedding table and width of the
            output projection.
        wd_ind: integer tensor of word indices; axis 1 is read as the
            sequence length.
        encoder: object exposing ``encodings``, indexed once per decoder
            layer (layer ``idx`` reads ``encodings[idx]``).
        ndims: embedding width.
        n_layers: number of decoder layers.

    Attributes:
        presoftmax_output: dense projection of the last layer to ``num_wds``.
        output: softmax of ``presoftmax_output``.
    """
    def __init__(self, num_wds, wd_ind, encoder, ndims = 20, n_layers = 2):
        self.encoder = encoder
        self.encodings = encoder.encodings
        self.num_wds = num_wds
        self.wd_ind = wd_ind
        self.length = tf.shape(self.wd_ind)[1]
        # Embedding table initialised uniformly in [-1, 1).
        initial_table = tf.random_uniform(
            [self.num_wds, ndims], minval = -1, maxval = 1.)
        self.wd_emb = tf.Variable(initial_table)
        self.wd_vec = tf.nn.embedding_lookup(self.wd_emb, wd_ind)
        # Position signal: index / 10000, shaped (1, length, 1) for broadcasting.
        limit = tf.cast(self.length, tf.float32)
        scaled_steps = tf.range(limit, dtype=tf.float32) / 10000
        self.position = tf.reshape(scaled_steps, (1, -1, 1))

        layer_input = self.wd_vec + self.position
        for idx in range(n_layers):
            # Self-attention without the feed-forward stage, then an
            # attention layer that also receives encoder layer `idx`.
            attended_self = AttentionLayer(layer_input, ff_layer = False).output
            layer_input = AttentionLayer(
                self.encodings[idx], X_decode = attended_self).output

        self.presoftmax_output = tf.layers.dense(layer_input, num_wds)
        self.output = tf.nn.softmax(self.presoftmax_output)


class Transformer:
    """Wires placeholders, Encoder, Decoder, loss and clipped optimizer together.

    Args:
        num_wds: vocabulary size, used for both the Encoder and the Decoder.

    Attributes:
        learning_rate: scalar float placeholder fed to the Adam optimizer.
        wd_ind_src, wd_ind_trg: int32 placeholders of word indices; the
            last axis is used as the sequence length when building masks.
        input_lengths, output_lengths: rank-1 int32 placeholders holding
            the number of real tokens per sequence.
        input_mask, output_mask: float masks built with tf.sequence_mask.
        presoftmax_output, output: logits and probabilities from the Decoder.
        loss: scalar mean cross-entropy over unmasked target positions.
        optimizer, grad_norm_total: results of apply_clipped_optimizer.
    """
    def __init__(self, num_wds):
        self.num_wds = num_wds
        self.learning_rate = tf.placeholder(tf.float32, ())
        self.wd_ind_src = wd_ind_src = tf.placeholder(tf.int32, (None, None))
        self.wd_ind_trg = wd_ind_trg = tf.placeholder(tf.int32, (None, None))
        # `(None)` is just None (unknown rank); `(None,)` declares a vector.
        self.input_lengths = tf.placeholder(tf.int32, (None,))
        self.output_lengths = tf.placeholder(tf.int32, (None,))
        self.input_mask = tf.sequence_mask(
            self.input_lengths, maxlen = tf.shape(self.wd_ind_src)[-1], dtype = tf.float32)
        self.output_mask = tf.sequence_mask(
            self.output_lengths, maxlen = tf.shape(self.wd_ind_trg)[-1], dtype = tf.float32)
        self.encoder = Encoder(num_wds, wd_ind_src, mask = self.input_mask)
        # NOTE(review): the decoder is fed the same wd_ind_trg that serves
        # as the label below, with no shift visible here - confirm whether
        # the inputs should be offset by one position.
        self.decoder = Decoder(num_wds, wd_ind_trg, self.encoder)
        self.presoftmax_output = self.decoder.presoftmax_output
        self.output = self.decoder.output
        opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        # Labels are integer word ids, so the sparse variant is required;
        # the dense variant expects a distribution shaped like the logits.
        token_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels = self.wd_ind_trg, logits = self.presoftmax_output)
        # Average over real target tokens only; padding contributes nothing.
        n_tokens = tf.maximum(tf.reduce_sum(self.output_mask), 1.)
        self.loss = tf.reduce_sum(token_loss * self.output_mask) / n_tokens
        self.optimizer, self.grad_norm_total = apply_clipped_optimizer(opt, self.loss)

In [29]:
transfomer = Transformer(20)

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [None]:
# Show the per-layer encoder outputs of the model built in the previous cell.
# (`enc` is never defined in this notebook; the encoder lives on the model.)
transfomer.encoder.encodings

In [None]:
# Pull a single batch from the training iterator and print its contents.
for train_batch in train_it:
    # Each batch field is indexed as [0] (token ids) and [1] (lengths).
    src_tensor  = train_batch.src[0].data.cpu().numpy()
    src_len = train_batch.src[1].cpu().numpy()
    trg_tensor  = train_batch.trg[0].data.cpu().numpy()
    trg_len = train_batch.trg[1].cpu().numpy()
    print(src_tensor.shape, src_len.shape, trg_tensor.shape, trg_len.shape)
    print(src_tensor, src_len, trg_tensor, trg_len)
    # `return` is a SyntaxError outside a function; `break` stops after
    # the first batch.
    break

In [None]:
train_batch

In [None]:
train_batch.src[0].data

In [None]:
from torchtext.datasets import WMT14

In [None]:
WMT14('data/', ('.en', '.de'))

In [None]:
# Load words from IWSLT

#!pip install torchtext spacy
#!python -m spacy download en
#!python -m spacy download de
from torchtext import data, datasets
import spacy
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text):
    """Return the token strings that the ``spacy_de`` tokenizer yields for ``text``.

    This redefinition replaces the earlier ``tokenize_de`` from the first
    setup cell and does no URL substitution.
    """
    tokens = []
    for tok in spacy_de.tokenizer(text):
        tokens.append(tok.text)
    return tokens

def tokenize_en(text):
    """Return the token strings that the ``spacy_en`` tokenizer yields for ``text``.

    This redefinition replaces the earlier ``tokenize_en`` from the first
    setup cell and does no URL substitution.
    """
    tokens = []
    for tok in spacy_en.tokenizer(text):
        tokens.append(tok.text)
    return tokens

# Special tokens handed to the Field constructors below.
BOS_WORD = '<s>'
EOS_WORD = '</s>'
BLANK_WORD = "<blank>"
# Source (German) field gets only a pad token; the target (English) field
# additionally gets begin/end-of-sentence tokens.
SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD)
TGT = data.Field(tokenize=tokenize_en, init_token = BOS_WORD, 
                 eos_token = EOS_WORD, pad_token=BLANK_WORD)

# The predicate passed as filter_pred is true only when both the 'src' and
# the 'trg' entry of an example have at most MAX_LEN items.
# Note: this rebinds train / val / test from the earlier DE/EN setup cell.
MAX_LEN = 100
train, val, test = datasets.IWSLT.splits(exts=('.de', '.en'), fields=(SRC, TGT), 
                                         filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and 
                                         len(vars(x)['trg']) <= MAX_LEN)
# Minimum frequency passed to build_vocab for both vocabularies.
MIN_FREQ = 1
SRC.build_vocab(train.src, min_freq=MIN_FREQ)
TGT.build_vocab(train.trg, min_freq=MIN_FREQ)

In [None]:
dir(train)

In [None]:
!pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl numpy matplotlib spacy torchtext seaborn 