# GPT-2 
---

In [1]:
import tensorflow as tf

In [2]:
# Turn on eager execution; the cells below call `.numpy()` on tensors to show their values.
tf.enable_eager_execution()

In [3]:
def nprint(*args):
    """Print ``args`` the way ``print`` does, then a blank line and a dashed rule."""
    trailer = '\n\n-----------------\n'
    print(*args, end=trailer)

---
# Expand tile

Let's look at the example from GPT-2.

In [60]:
# A 2x3 variable to experiment with; swap the shape for [2] to try the 1-D case.
tl = tf.get_variable(name='tl', shape=[2, 3])
print(tl.numpy())

[[ 1.0839658   0.36698222  0.13701785]
 [-0.8101032   0.7864109  -1.0563312 ]]


In [61]:
size = 3                # how many copies to make
ndims = tl.shape.ndims  # rank of `tl`
# Second value is the `multiples` list for tf.tile: `size` for the new
# leading axis followed by a 1 for every existing axis.
print(ndims, [size] + [1] * ndims, sep='\n')

2
[3, 1, 1]


Why is `expand_dims` needed? It adds a new leading axis, so the copies of the 2-D tensor are stacked into a 3-D one. To keep the result 2-D and instead repeat the rows along the existing first axis, tile without `expand_dims`:

In [63]:
# Tile along the existing first axis only (no expand_dims), so the result stays 2-D.
print(tf.tile(tl, multiples=[size] + [1] * (ndims - 1)).numpy())

[[ 1.0839658   0.36698222  0.13701785]
 [-0.8101032   0.7864109  -1.0563312 ]
 [ 1.0839658   0.36698222  0.13701785]
 [-0.8101032   0.7864109  -1.0563312 ]
 [ 1.0839658   0.36698222  0.13701785]
 [-0.8101032   0.7864109  -1.0563312 ]]


In [64]:
# Add a leading axis of length 1, then repeat along it `size` times.
# The tiled tensor is computed once and reused for both prints.
tiled = tf.tile(tf.expand_dims(tl, axis=0), [size] + [1] * ndims)
print(tiled.shape)
print(tiled.numpy())

(3, 2, 3)
[[[ 1.0839658   0.36698222  0.13701785]
  [-0.8101032   0.7864109  -1.0563312 ]]

 [[ 1.0839658   0.36698222  0.13701785]
  [-0.8101032   0.7864109  -1.0563312 ]]

 [[ 1.0839658   0.36698222  0.13701785]
  [-0.8101032   0.7864109  -1.0563312 ]]]


---

The actual function:

In [79]:
def expand_tile(value, size):
    """Add a new axis of given size.

    ``value`` is converted to a tensor, given a new leading axis of length 1,
    and then repeated ``size`` times along that new axis.  Every original
    axis is tiled once, i.e. left as it is.
    """
    tensor = tf.convert_to_tensor(value, name='value')
    rank = tensor.shape.ndims  # static rank: one multiple of 1 per original axis
    with_leading_axis = tf.expand_dims(tensor, axis=0)
    multiples = [size] + [1] * rank
    return tf.tile(with_leading_axis, multiples)

In [84]:
expt = tf.constant([[1, 2, 3], [4, 5, 6]])
nprint(expt.numpy())
# Show the tiled result for two different sizes, each preceded by a separator line.
for copies in (2, 4):
    print('-' * 30)
    nprint(expand_tile(expt, copies).numpy())

[[1 2 3]
 [4 5 6]]

------------------------------
[[[1 2 3]
  [4 5 6]]

 [[1 2 3]
  [4 5 6]]]

------------------------------
[[[1 2 3]
  [4 5 6]]

 [[1 2 3]
  [4 5 6]]

 [[1 2 3]
  [4 5 6]]

 [[1 2 3]
  [4 5 6]]]



---
# Positions_for

`size` is the batch size, so `expand_tile` produces as many copies (tilings) of the vector as there are examples in the batch. E.g. `[1,2,3]` with a batch size of three: `[[1,2,3],[1,2,3],[1,2,3]]`.

In [5]:
def expand_tile(value, size):
    """
    Repeat a tensor `size` times along a new leading axis:
    [x,y] becomes [[x,y],[x,y] .. size times .. [x,y]].

    `value` goes through `tf.convert_to_tensor`, so lists, tuples, etc. are
    accepted as input as well as tensors.
    """
    value = tf.convert_to_tensor(value, name='value')

    # One multiple per axis of the expanded tensor: `size` for the new outer
    # axis and 1 for each original axis (e.g. [3,1,1] for a 2-D input, size=3).
    multiples = [size] + [1] * value.shape.ndims

    # [x,y] -> [[x,y]] -> `size` stacked copies.
    return tf.tile(tf.expand_dims(value, axis=0), multiples)

In [57]:
# Tile the same 1-D input zero, one and two times.
for n_copies in range(3):
    print(expand_tile([0, 1], n_copies).numpy())

[]
[[0 1]]
[[0 1]
 [0 1]]


In [99]:
def positions_for(tokens, past_length):
    """Build position indices for every row of a batch of tokens.

    Prints the intermediate values for inspection (this calls ``.numpy()`` on
    tensors) and returns the tiled result.

    Args:
        tokens: tensor whose first two axes are read as (batch, steps).
        past_length: offset added to every position; either a plain integer
            or a scalar integer tensor.

    Returns:
        ``past_length + [0, 1, ..., nsteps - 1]`` repeated ``batch_size``
        times along a new leading axis.
    """
    token_shape = tf.shape(tokens)
    batch_size = token_shape[0]
    nsteps = token_shape[1]

    # Computed once and reused for both the debug print and the return value.
    steps = tf.range(nsteps)
    positions = past_length + steps

    # Only objects that expose `.numpy()` are converted for display; plain
    # Python or NumPy scalars are printed as they are instead of crashing
    # on a missing `.numpy()` attribute.
    shown_past_length = past_length.numpy() if hasattr(past_length, 'numpy') else past_length

    print('batch_size:', batch_size.numpy(),
          '| nsteps:', nsteps.numpy(),
          '| tf.range(nsteps):', steps.numpy(),
          '| past length:', shown_past_length,
          '\n\npast length + range:\n', positions.numpy(),
          '\n\nresult vector:')

    return expand_tile(positions, batch_size)

In [116]:
# Scale a float variable by 20 and cast to int32 to get small integer "tokens".
tkns_float = tf.get_variable('tkns', [5, 2, 3])
tkns = tf.cast(20 * tkns_float, tf.int32)
print(tkns.numpy(), tkns.shape, sep='\n')

[[[-8 -1  0]
  [ 7  1 -6]]

 [[ 2 -1 -2]
  [ 8  2  8]]

 [[ 9  6  3]
  [-7  0  4]]

 [[ 5  2  4]
  [-9  7 -3]]

 [[-1 -2  6]
  [-3 -8 -6]]]
(5, 2, 3)


In [122]:
# Print each slice taken along axis 1. A plain loop is used because only the
# printing matters; a list comprehension would build and display a list of None.
for unst in tf.unstack(tkns, axis=1):
    nprint(unst.numpy())

[[-8 -1  0]
 [ 2 -1 -2]
 [ 9  6  3]
 [ 5  2  4]
 [-1 -2  6]]

-----------------
[[ 7  1 -6]
 [ 8  2  8]
 [-7  0  4]
 [-9  7 -3]
 [-3 -8 -6]]

-----------------


[None, None]

Here the entries of `tf.shape(tkns)` are used as example values for `past_length`, mirroring how the model derives it from a tensor shape:  
```
past_length = 0 if past is None else tf.shape(past)[-2]
```
The scalar `past_length` is broadcast, i.e. added to every element of the range.

In [102]:
# Try a zero offset first, then each entry of the token tensor's shape as the offset.
tkns_shape = tf.shape(tkns)
for offset in (0, tkns_shape[0], tkns_shape[1], tkns_shape[2]):
    nprint(positions_for(tkns, offset).numpy())

batch_size: 3 | nsteps: 5 | tf.range(nsteps): [0 1 2 3 4] | past length: 0 

past length + range:
 [0 1 2 3 4] 

result vector:
[[0 1 2 3 4]
 [0 1 2 3 4]
 [0 1 2 3 4]]

-----------------
batch_size: 3 | nsteps: 5 | tf.range(nsteps): [0 1 2 3 4] | past length: 3 

past length + range:
 [3 4 5 6 7] 

result vector:
[[3 4 5 6 7]
 [3 4 5 6 7]
 [3 4 5 6 7]]

-----------------
batch_size: 3 | nsteps: 5 | tf.range(nsteps): [0 1 2 3 4] | past length: 5 

past length + range:
 [5 6 7 8 9] 

result vector:
[[5 6 7 8 9]
 [5 6 7 8 9]
 [5 6 7 8 9]]

-----------------
batch_size: 3 | nsteps: 5 | tf.range(nsteps): [0 1 2 3 4] | past length: 1 

past length + range:
 [1 2 3 4 5] 

result vector:
[[1 2 3 4 5]
 [1 2 3 4 5]
 [1 2 3 4 5]]

-----------------
