# GPT-2 
---
## Study notebook

Mostly TF utils & standard functions

In [7]:
import tensorflow as tf
import numpy as np

In [2]:
# Turn on eager execution so the cells below can print concrete tensor values
# and call `.numpy()` directly instead of building a graph and running a session.
tf.enable_eager_execution()

---

## Shape list

Why the dynamic thing in the first place? See [this comment](https://stackoverflow.com/a/34082273). It deals with the difference between dynamic and static shapes: when data flows through the network batch by batch, the shape will be, for instance, `[None, x, y, z]`, so it is not fully defined statically.

In [79]:
def shape_list(x):
    """Return the shape of ``x`` as a Python list, one entry per axis.

    Axes whose static size is known are returned as-is; axes whose static
    size is ``None`` are replaced by the matching element of ``tf.shape(x)``,
    so the result can still be used to build further ops.
    """
    known_dims = x.shape.as_list()
    runtime_dims = tf.shape(x)
    dims = []
    for axis, size in enumerate(known_dims):
        if size is None:
            # Static size unknown: fall back to the runtime value.
            dims.append(runtime_dims[axis])
        else:
            dims.append(size)
    return dims

In [77]:
def shape_list_comm(x):
    """Verbose variant of ``shape_list``: same return value, prints each step.

    Prints the static shape list, the ``tf.shape(x)`` result, and then, for
    every axis, the static entry followed by the dynamic entry. Returns the
    list in which ``None`` static entries are replaced by dynamic ones.
    """
    known_dims = x.shape.as_list()
    print('static:', known_dims)
    runtime_dims = tf.shape(x)
    print('dynamic:', runtime_dims)

    # Show both views of every axis, static first, then dynamic.
    for axis in range(len(known_dims)):
        print(known_dims[axis])
        print(runtime_dims[axis])

    dims = []
    for axis, size in enumerate(known_dims):
        dims.append(runtime_dims[axis] if size is None else size)
    return dims

In [119]:
# Random [4, 3, 2] float tensor used as the running example for shape_list.
# No seed is passed here, so the printed values differ from run to run.
t1 = tf.random.normal([4, 3,2], 
                      mean=0.0, 
                      stddev=20,
                      dtype=tf.float32)
print(tf.cast(t1, tf.int16)) # casting to int for readability

tf.Tensor(
[[[-10 -21]
  [-14   9]
  [  1  -8]]

 [[-26   8]
  [  7 -14]
  [-33 -21]]

 [[ 25  14]
  [ -5  -8]
  [ 48  15]]

 [[ 13  18]
  [-36   8]
  [-45   5]]], shape=(4, 3, 2), dtype=int16)


In [78]:
# Every dim of t1 is known statically, so the returned list holds plain ints.
shape_list_comm(t1)

static: [4, 3, 2]
dynamic: tf.Tensor([4 3 2], shape=(3,), dtype=int32)
4
tf.Tensor(4, shape=(), dtype=int32)
3
tf.Tensor(3, shape=(), dtype=int32)
2
tf.Tensor(2, shape=(), dtype=int32)


[4, 3, 2]

---
### Fun star operator (python)
(used e.g. at line 46 of the original code; in this notebook, see `split_states` and `merge_states` below)

In [85]:
# Starred unpacking: `start` collects every dim except the last, `m` gets the last one.
*start, m = shape_list(t1)
print(start)
print(m)

[4, 3]
2


---

## Reduce_mean, reduce_max 

Play with this & other shape fluff

In [263]:
# 3x2 example matrix; the following cells reuse it for reductions and shape queries.
xmean = tf.constant([[1., 1.],
                     [3.,2.],
                     [5., 9.]])
print(tf.reduce_mean(xmean)) # A scalar
print(tf.reduce_mean(xmean, axis=0)) # 'vertical' mean
print(tf.reduce_mean(xmean, axis=1)) # 'horizontal/internal' mean

tf.Tensor(3.5, shape=(), dtype=float32)
tf.Tensor([3. 4.], shape=(2,), dtype=float32)
tf.Tensor([1.  2.5 7. ], shape=(3,), dtype=float32)


In [182]:
# Without `axis` the max is taken over all elements; with `axis` that axis is
# reduced away. keepdims=True keeps the reduced axis with size 1.
print(tf.reduce_max(xmean))
print(tf.reduce_max(xmean, axis=0))
print(tf.reduce_max(xmean, axis=-1)) # innermost axis
print(tf.reduce_max(xmean, axis=0, keepdims=True))
print(tf.reduce_max(xmean, axis=-1, keepdims=True))

tf.Tensor(9.0, shape=(), dtype=float32)
tf.Tensor([5. 9.], shape=(2,), dtype=float32)
tf.Tensor([1. 3. 9.], shape=(3,), dtype=float32)
tf.Tensor([[5. 9.]], shape=(1, 2), dtype=float32)
tf.Tensor(
[[1.]
 [3.]
 [9.]], shape=(3, 1), dtype=float32)


In [72]:
# Static shape as a plain Python list.
xmean.shape.as_list()

[3, 2]

In [92]:
# Number of axes (rank) of the tensor.
xmean.ndim

2

In [73]:
# Static shape as a TensorShape object (a sequence of Dimension entries).
xmean.shape

TensorShape([Dimension(3), Dimension(2)])

In [138]:
# Indexing a TensorShape yields a Dimension object here, not a plain int.
xmean.shape[1]

Dimension(2)

In [91]:
# `.value` unwraps the Dimension into a plain Python int.
xmean.shape[1].value

2

In [74]:
# Dynamic shape: returned as an int32 tensor rather than a Python object.
tf.shape(xmean)

<tf.Tensor: id=616, shape=(2,), dtype=int32, numpy=array([3, 2], dtype=int32)>

In [103]:
# reshape keeps the row-major element order (1, 1, 3, 2, 5, 9); only the grouping changes.
print(tf.reshape(xmean, [2,3]))
print(tf.reshape(xmean, [1,6]))

tf.Tensor(
[[1. 1. 3.]
 [2. 5. 9.]], shape=(2, 3), dtype=float32)
tf.Tensor([[1. 1. 3. 2. 5. 9.]], shape=(1, 6), dtype=float32)


---

## Splitting & merging states

In [120]:
def split_states(x, n):
    """Split the last axis of ``x`` (size m) into two axes of sizes ``[n, m // n]``.

    All leading axes are left untouched; only the final axis is reshaped.
    """
    *leading, last = shape_list(x)
    target_shape = leading + [n, last // n]
    return tf.reshape(x, target_shape)

In [129]:
# 2x3x4 float32 variable with Glorot-uniform initial values; its last axis
# (size 4) is the one split in the cells below.
splitx = tf.get_variable("splitx",
                         [2,3,4],
                         tf.float32,
                         initializer=tf.glorot_uniform_initializer)

In [130]:
# Show the initial values so they can be compared with the split/merged results.
splitx

<tf.Variable 'splitx:0' shape=(2, 3, 4) dtype=float32, numpy=
array([[[ 0.09519899, -0.5179945 ,  0.60571945, -0.61901444],
        [ 0.5326501 ,  0.6256759 , -0.21402407, -0.07976532],
        [-0.14802265,  0.11726612,  0.35453045,  0.42391026]],

       [[ 0.45691597,  0.24667728,  0.04992068, -0.42152926],
        [ 0.07240939, -0.40548953,  0.5585022 ,  0.50473773],
        [-0.22778407,  0.28633028, -0.44296736, -0.0016287 ]]],
      dtype=float32)>

In [137]:
# Walk through split_states by hand for n = 2: leading dims, last dim, target shape.
*start, m = shape_list(splitx)
n = 2
print(start)
print(m)
print(start + [n, m//n])

[2, 3]
4
[2, 3, 2, 2]


In [131]:
# The last axis of size 4 becomes two axes of size 2 each: (2, 3, 4) -> (2, 3, 2, 2).
splitx_states = split_states(splitx, 2)
splitx_states

<tf.Tensor: id=799, shape=(2, 3, 2, 2), dtype=float32, numpy=
array([[[[ 0.09519899, -0.5179945 ],
         [ 0.60571945, -0.61901444]],

        [[ 0.5326501 ,  0.6256759 ],
         [-0.21402407, -0.07976532]],

        [[-0.14802265,  0.11726612],
         [ 0.35453045,  0.42391026]]],


       [[[ 0.45691597,  0.24667728],
         [ 0.04992068, -0.42152926]],

        [[ 0.07240939, -0.40548953],
         [ 0.5585022 ,  0.50473773]],

        [[-0.22778407,  0.28633028],
         [-0.44296736, -0.0016287 ]]]], dtype=float32)>

In [132]:
def merge_states(x):
    """Collapse the last two axes of ``x`` (sizes a, b) into one axis of size ``a * b``.

    All other axes are left untouched.
    """
    *leading, second_last, last = shape_list(x)
    target_shape = leading + [second_last * last]
    return tf.reshape(x, target_shape)

In [133]:
# merge_states undoes the split: the shape goes back to (2, 3, 4) with the same values.
re_splitx = merge_states(splitx_states)
re_splitx

<tf.Tensor: id=803, shape=(2, 3, 4), dtype=float32, numpy=
array([[[ 0.09519899, -0.5179945 ,  0.60571945, -0.61901444],
        [ 0.5326501 ,  0.6256759 , -0.21402407, -0.07976532],
        [-0.14802265,  0.11726612,  0.35453045,  0.42391026]],

       [[ 0.45691597,  0.24667728,  0.04992068, -0.42152926],
        [ 0.07240939, -0.40548953,  0.5585022 ,  0.50473773],
        [-0.22778407,  0.28633028, -0.44296736, -0.0016287 ]]],
      dtype=float32)>

---

### Convert to tensor

In [23]:
def nprint(*args):
    """Print ``args`` the way ``print`` does, then leave one blank line after them."""
    terminator = '\n\n'
    print(*args, end=terminator)

In [3]:
def myconvertor(arg):
    """Thin wrapper: pass ``arg`` to ``tf.convert_to_tensor`` and return the result."""
    converted = tf.convert_to_tensor(arg)
    return converted

In [25]:
# All of these inputs convert fine. Note in the output that Python ints give
# int32 while the numpy array gives int64.
nprint(myconvertor(tf.constant([10,2,3]))) # tf tensors
nprint(myconvertor(2)) # scalars
nprint(myconvertor([1,2,3])) # lists
nprint(myconvertor((1,2,3))) # tuples
nprint(myconvertor([[1,2,3],[4,5,6]])) # more lists
nprint(myconvertor(np.array([1,2,3]))) # numpy

tf.Tensor([10  2  3], shape=(3,), dtype=int32)

tf.Tensor(2, shape=(), dtype=int32)

tf.Tensor([1 2 3], shape=(3,), dtype=int32)

tf.Tensor([1 2 3], shape=(3,), dtype=int32)

tf.Tensor(
[[1 2 3]
 [4 5 6]], shape=(2, 3), dtype=int32)

tf.Tensor([1 2 3], shape=(3,), dtype=int64)



---

### TF expand dims

Documentation [here](https://www.tensorflow.org/api_docs/python/tf/expand_dims).


> Given a tensor `input`, this operation inserts a dimension of 1 at the dimension index `axis` of `input`'s shape. The dimension index `axis` starts at zero; if you specify a negative number for `axis` it is counted backward from the end.  
>
> This operation is useful if you want to add a batch dimension to a single element. For example, if you have a single image of shape `[height, width, channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`, which will make the shape `[1, height, width, channels]`.

In [32]:
# 4x3x2 variable used for the expand_dims examples; no initializer is passed,
# so get_variable falls back to its default.
dms = tf.get_variable('dms', [4,3,2])
print(dms.shape)
print(dms.numpy())

(4, 3, 2)
[[[-0.4956274   0.49113512]
  [-0.41286486  0.35620266]
  [-0.4334615   0.15532726]]

 [[-0.28003323 -0.22134745]
  [-0.14616278 -0.09085426]
  [-0.46632233  0.31075937]]

 [[-0.5278535   0.41998708]
  [-0.35841405  0.29415405]
  [ 0.02429056 -0.39607567]]

 [[ 0.00269699  0.23218775]
  [ 0.5431715   0.40847325]
  [-0.31780624  0.08248389]]]


In [34]:
# axis=0: new size-1 axis in front -> (1, 4, 3, 2).
nprint(tf.expand_dims(dms, 0).shape)
nprint(tf.expand_dims(dms, 0).numpy())

(1, 4, 3, 2)

[[[[-0.4956274   0.49113512]
   [-0.41286486  0.35620266]
   [-0.4334615   0.15532726]]

  [[-0.28003323 -0.22134745]
   [-0.14616278 -0.09085426]
   [-0.46632233  0.31075937]]

  [[-0.5278535   0.41998708]
   [-0.35841405  0.29415405]
   [ 0.02429056 -0.39607567]]

  [[ 0.00269699  0.23218775]
   [ 0.5431715   0.40847325]
   [-0.31780624  0.08248389]]]]



In [37]:
# axis=1: new size-1 axis after the first one -> (4, 1, 3, 2).
nprint(tf.expand_dims(dms, 1).shape)
nprint(tf.expand_dims(dms, 1).numpy())

(4, 1, 3, 2)

[[[[-0.4956274   0.49113512]
   [-0.41286486  0.35620266]
   [-0.4334615   0.15532726]]]


 [[[-0.28003323 -0.22134745]
   [-0.14616278 -0.09085426]
   [-0.46632233  0.31075937]]]


 [[[-0.5278535   0.41998708]
   [-0.35841405  0.29415405]
   [ 0.02429056 -0.39607567]]]


 [[[ 0.00269699  0.23218775]
   [ 0.5431715   0.40847325]
   [-0.31780624  0.08248389]]]]



In [38]:
# axis=2: new size-1 axis before the last one -> (4, 3, 1, 2).
nprint(tf.expand_dims(dms, 2).shape)
nprint(tf.expand_dims(dms, 2).numpy())

(4, 3, 1, 2)

[[[[-0.4956274   0.49113512]]

  [[-0.41286486  0.35620266]]

  [[-0.4334615   0.15532726]]]


 [[[-0.28003323 -0.22134745]]

  [[-0.14616278 -0.09085426]]

  [[-0.46632233  0.31075937]]]


 [[[-0.5278535   0.41998708]]

  [[-0.35841405  0.29415405]]

  [[ 0.02429056 -0.39607567]]]


 [[[ 0.00269699  0.23218775]]

  [[ 0.5431715   0.40847325]]

  [[-0.31780624  0.08248389]]]]



In [39]:
# axis=3: new size-1 axis at the very end -> (4, 3, 2, 1).
nprint(tf.expand_dims(dms, 3).shape)
nprint(tf.expand_dims(dms, 3).numpy())

(4, 3, 2, 1)

[[[[-0.4956274 ]
   [ 0.49113512]]

  [[-0.41286486]
   [ 0.35620266]]

  [[-0.4334615 ]
   [ 0.15532726]]]


 [[[-0.28003323]
   [-0.22134745]]

  [[-0.14616278]
   [-0.09085426]]

  [[-0.46632233]
   [ 0.31075937]]]


 [[[-0.5278535 ]
   [ 0.41998708]]

  [[-0.35841405]
   [ 0.29415405]]

  [[ 0.02429056]
   [-0.39607567]]]


 [[[ 0.00269699]
   [ 0.23218775]]

  [[ 0.5431715 ]
   [ 0.40847325]]

  [[-0.31780624]
   [ 0.08248389]]]]



---

### TF Tile

Documentation [here](https://www.tensorflow.org/api_docs/python/tf/tile).


> `tf.tile(input, multiples, name=None)`

> This operation creates a new tensor by replicating `input` `multiples` times. The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements, and the values of `input` are replicated `multiples[i]` times along the i'th dimension. For example, tiling `[a b c d]` by `[2]` produces `[a b c d a b c d]`.


In [77]:
# 1-D input: a single multiple, the whole vector is repeated twice.
tt = tf.constant([0,1,2,3,4])
nprint(tf.tile(tt, [2]).numpy())
print('-'*30)
# 2-D input: multiples[0] repeats along rows, multiples[1] along columns.
tt = tf.constant([[0,1,2],[3,4,5]])
nprint(tf.tile(tt, [1, 2]).numpy())
nprint(tf.tile(tt, [2, 1]).numpy())
nprint(tf.tile(tt, [2, 2]).numpy())
print('-'*30)
# 3-D input: one multiple per axis; a multiple of 1 leaves that axis unchanged.
tt = tf.constant([[[0,1],[1,2]],[[3,4],[4,5]]])
nprint(tf.tile(tt, [1, 1, 2]).numpy())
nprint(tf.tile(tt, [1, 2, 1]).numpy())
nprint(tf.tile(tt, [1, 2, 2]).numpy())
nprint(tf.tile(tt, [2, 1, 2]).numpy())
nprint(tf.tile(tt, [2, 2, 1]).numpy())
nprint(tf.tile(tt, [2, 2, 2]).numpy())

[0 1 2 3 4 0 1 2 3 4]

------------------------------
[[0 1 2 0 1 2]
 [3 4 5 3 4 5]]

[[0 1 2]
 [3 4 5]
 [0 1 2]
 [3 4 5]]

[[0 1 2 0 1 2]
 [3 4 5 3 4 5]
 [0 1 2 0 1 2]
 [3 4 5 3 4 5]]

------------------------------
[[[0 1 0 1]
  [1 2 1 2]]

 [[3 4 3 4]
  [4 5 4 5]]]

[[[0 1]
  [1 2]
  [0 1]
  [1 2]]

 [[3 4]
  [4 5]
  [3 4]
  [4 5]]]

[[[0 1 0 1]
  [1 2 1 2]
  [0 1 0 1]
  [1 2 1 2]]

 [[3 4 3 4]
  [4 5 4 5]
  [3 4 3 4]
  [4 5 4 5]]]

[[[0 1 0 1]
  [1 2 1 2]]

 [[3 4 3 4]
  [4 5 4 5]]

 [[0 1 0 1]
  [1 2 1 2]]

 [[3 4 3 4]
  [4 5 4 5]]]

[[[0 1]
  [1 2]
  [0 1]
  [1 2]]

 [[3 4]
  [4 5]
  [3 4]
  [4 5]]

 [[0 1]
  [1 2]
  [0 1]
  [1 2]]

 [[3 4]
  [4 5]
  [3 4]
  [4 5]]]

[[[0 1 0 1]
  [1 2 1 2]
  [0 1 0 1]
  [1 2 1 2]]

 [[3 4 3 4]
  [4 5 4 5]
  [3 4 3 4]
  [4 5 4 5]]

 [[0 1 0 1]
  [1 2 1 2]
  [0 1 0 1]
  [1 2 1 2]]

 [[3 4 3 4]
  [4 5 4 5]
  [3 4 3 4]
  [4 5 4 5]]]



Let's look at the example from GPT-2.

In [60]:
# 2x3 variable used for the tiling examples below.
# Alternative 1-D example, left disabled: tl = tf.get_variable('tl', [2])
tl = tf.get_variable('tl', [2,3])
print(tl.numpy())

[[ 1.0839658   0.36698222  0.13701785]
 [-0.8101032   0.7864109  -1.0563312 ]]


In [61]:
# Build the `multiples` list for tiling a tensor that has gained one leading axis:
# `size` copies along the new axis, 1 (no repetition) along each original axis.
size = 3
ndims = tl.shape.ndims
print(ndims)
print([size] + [1]*ndims)

2
[3, 1, 1]


Why do you need that `expand_dims` thingy? In order to stack your 2-D tensor into a 3-D one. To keep it 2-D and shove the copies/tiles inside, do this:

In [63]:
# Without expand_dims: tile along the existing first axis, so the result stays 2-D (6 rows).
print(tf.tile(tl, [size] + [1]*(ndims-1)).numpy())

[[ 1.0839658   0.36698222  0.13701785]
 [-0.8101032   0.7864109  -1.0563312 ]
 [ 1.0839658   0.36698222  0.13701785]
 [-0.8101032   0.7864109  -1.0563312 ]
 [ 1.0839658   0.36698222  0.13701785]
 [-0.8101032   0.7864109  -1.0563312 ]]


In [64]:
# With expand_dims: a new leading axis is added and then tiled -> shape (3, 2, 3).
print(tf.tile(tf.expand_dims(tl, axis=0), [size] + [1]*ndims).shape)
print(tf.tile(tf.expand_dims(tl, axis=0), [size] + [1]*ndims).numpy())

(3, 2, 3)
[[[ 1.0839658   0.36698222  0.13701785]
  [-0.8101032   0.7864109  -1.0563312 ]]

 [[ 1.0839658   0.36698222  0.13701785]
  [-0.8101032   0.7864109  -1.0563312 ]]

 [[ 1.0839658   0.36698222  0.13701785]
  [-0.8101032   0.7864109  -1.0563312 ]]]


---

The actual function:

In [79]:
def expand_tile(value, size):
    """Stack ``size`` copies of ``value`` along a new leading axis.

    ``value`` is first converted to a tensor; the result has one more axis
    than ``value``, and that new first axis has length ``size``.
    """
    tensor = tf.convert_to_tensor(value, name='value')
    rank = tensor.shape.ndims
    with_new_axis = tf.expand_dims(tensor, axis=0)
    # One multiple per axis of the expanded tensor: repeat the new axis
    # `size` times and leave every original axis unrepeated.
    multiples = [size] + [1] * rank
    return tf.tile(with_new_axis, multiples)

In [84]:
# expand_tile on a 2x3 tensor: the result is `size` stacked copies, i.e. shape (size, 2, 3).
expt = tf.constant([[1,2,3],[4,5,6]])
nprint(expt.numpy())
print('-'*30)
nprint(expand_tile(expt, 2).numpy())
print('-'*30)
nprint(expand_tile(expt, 4).numpy())

[[1 2 3]
 [4 5 6]]

------------------------------
[[[1 2 3]
  [4 5 6]]

 [[1 2 3]
  [4 5 6]]]

------------------------------
[[[1 2 3]
  [4 5 6]]

 [[1 2 3]
  [4 5 6]]

 [[1 2 3]
  [4 5 6]]

 [[1 2 3]
  [4 5 6]]]

