# GPT-2 
---

In [1]:
import tensorflow as tf

In [2]:
tf.enable_eager_execution()

In [101]:
def nprint(*args):
    """Print `args` exactly like `print`, then a blank line and a dashed separator line."""
    separator = '\n\n-----------------\n'
    print(*args, end=separator)

---
### Expand Tile, Positions

`size` is the batch size: the function produces as many tilings (copies) of the vector as there are entries in the batch. E.g. `[1,2,3]` with a batch size of three gives `[[1,2,3],[1,2,3],[1,2,3]]`.

In [5]:
def expand_tile(value, size):
    """
    Stack `size` copies of `value` along a new leading axis:
    [x,y] with size=3 becomes [[x,y],[x,y],[x,y]].
    `value` goes through tf.convert_to_tensor first, so lists, tuples, etc. are accepted.
    """
    tensor = tf.convert_to_tensor(value, name='value')

    # [x,y] -> [[x,y]]: add the outer axis that is going to be repeated
    with_outer_axis = tf.expand_dims(tensor, axis=0)

    # repeat `size` times along the outer axis and once along every original axis
    # (e.g. multiples [3,1,1] for a rank-2 input and size=3)
    multiples = [size] + [1] * tensor.shape.ndims
    return tf.tile(with_outer_axis, multiples)

In [57]:
# Tile [0,1] zero, one and two times
for n_copies in (0, 1, 2):
    print(expand_tile([0,1], n_copies).numpy())

[]
[[0 1]]
[[0 1]
 [0 1]]


In [99]:
def positions_for(tokens, past_length):
    """
    Build the position indices for `tokens`, offset by `past_length`.

    The first two entries of tf.shape(tokens) are read as batch size and number
    of steps. The intermediate values are printed (this is a demo helper), and
    `past_length + tf.range(nsteps)` is returned tiled `batch_size` times via
    expand_tile.

    `past_length` may be a plain Python number or an object exposing `.numpy()`
    (e.g. an eager scalar tensor).
    """
    batch_size = tf.shape(tokens)[0]
    nsteps = tf.shape(tokens)[1]

    steps = tf.range(nsteps)
    positions = past_length + steps  # past_length is added to every step index

    # Unwrap for display only when the object actually offers .numpy(); an exact
    # `type(...) == int` test would call .numpy() on every other non-tensor value.
    shown_past_length = past_length.numpy() if hasattr(past_length, 'numpy') else past_length

    print('batch_size:', batch_size.numpy(),
          '| nsteps:', nsteps.numpy(),
          '| tf.range(nsteps):', steps.numpy(),
          '| past length:', shown_past_length,
          '\n\npast length + range:\n', positions.numpy(),
          '\n\nresult vector:')

    return expand_tile(positions, batch_size)

In [116]:
# Toy "token" data: 20 times a [5,2,3] variable from tf.get_variable, cast to int32
tkns = tf.cast(x=20 * tf.get_variable(name='tkns', shape=[5,2,3]), dtype=tf.int32)
print(tkns.numpy(), tkns.shape, sep='\n')

[[[-8 -1  0]
  [ 7  1 -6]]

 [[ 2 -1 -2]
  [ 8  2  8]]

 [[ 9  6  3]
  [-7  0  4]]

 [[ 5  2  4]
  [-9  7 -3]]

 [[-1 -2  6]
  [-3 -8 -6]]]
(5, 2, 3)


In [122]:
# Unstack along axis 1 and print every slice; the list of None values returned
# by nprint is the value of the comprehension, which the cell displays afterwards.
[nprint(step_slice.numpy()) for step_slice in tf.unstack(tkns, axis=1)]

[[-8 -1  0]
 [ 2 -1 -2]
 [ 9  6  3]
 [ 5  2  4]
 [-1 -2  6]]

-----------------
[[ 7  1 -6]
 [ 8  2  8]
 [-7  0  4]
 [-9  7 -3]
 [-3 -8 -6]]

-----------------


[None, None]

Here using `tf.shape(tkns)` as it is in the model, to show the results:  
```
past_length = 0 if past is None else tf.shape(past)[-2]
```
The scalar `past_length` is broadcast over the range, i.e. it is added to every element of `tf.range(nsteps)`.

In [102]:
# Try a zero offset, then each entry of tf.shape(tkns) as `past_length`.
# NOTE(review): the recorded output shows batch_size 3 / nsteps 5, which does not
# match the (5, 2, 3) `tkns` printed earlier -- presumably it was produced with a
# different `tkns`; re-run the notebook top to bottom to refresh it.
for past_length in (0, tf.shape(tkns)[0], tf.shape(tkns)[1], tf.shape(tkns)[2]):
    nprint(positions_for(tkns, past_length).numpy())

batch_size: 3 | nsteps: 5 | tf.range(nsteps): [0 1 2 3 4] | past length: 0 

past length + range:
 [0 1 2 3 4] 

result vector:
[[0 1 2 3 4]
 [0 1 2 3 4]
 [0 1 2 3 4]]

-----------------
batch_size: 3 | nsteps: 5 | tf.range(nsteps): [0 1 2 3 4] | past length: 3 

past length + range:
 [3 4 5 6 7] 

result vector:
[[3 4 5 6 7]
 [3 4 5 6 7]
 [3 4 5 6 7]]

-----------------
batch_size: 3 | nsteps: 5 | tf.range(nsteps): [0 1 2 3 4] | past length: 5 

past length + range:
 [5 6 7 8 9] 

result vector:
[[5 6 7 8 9]
 [5 6 7 8 9]
 [5 6 7 8 9]]

-----------------
batch_size: 3 | nsteps: 5 | tf.range(nsteps): [0 1 2 3 4] | past length: 1 

past length + range:
 [1 2 3 4 5] 

result vector:
[[1 2 3 4 5]
 [1 2 3 4 5]
 [1 2 3 4 5]]

-----------------


---

### TF Fill

Nice and easy, fill a tensor of a certain shape with a value. 

Reference [here](https://www.tensorflow.org/api_docs/python/tf/fill):

`tf.fill` differs from `tf.constant` in a few ways:
- `tf.fill` only supports scalar contents, whereas `tf.constant` supports Tensor values.
- `tf.fill` creates an Op in the computation graph that constructs the actual Tensor value at runtime. This is in contrast to `tf.constant` which embeds the entire Tensor into the graph with a `Const` node.
- Because `tf.fill` evaluates at graph runtime, it supports dynamic shapes based on other runtime Tensors, unlike `tf.constant`.


In [123]:
# A 2x3 tensor in which every entry is 9
tf.fill(dims=[2,3], value=9)

<tf.Tensor: id=4120, shape=(2, 3), dtype=int32, numpy=
array([[9, 9, 9],
       [9, 9, 9]], dtype=int32)>

In [247]:
# The fill value may also be a string: one 'blah' per batch entry, shape [batch_size, 1]
batch_size = 3
print(tf.fill(dims=[batch_size, 1], value='blah'))

tf.Tensor(
[[b'blah']
 [b'blah']
 [b'blah']], shape=(3, 1), dtype=string)


---

### TF nn.top_k / math.top_k

Documentation [here](https://www.tensorflow.org/api_docs/python/tf/math/top_k).

> Finds values and indices of the `k` largest entries for the last dimension.

(k=1: only the max, k=2: the 2 largest ones, etc.)

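(For the opposite, i.e. the `k` smallest entries, see [this answer on Stack Overflow](https://stackoverflow.com/a/44553559): negate the input and negate the returned values back, e.g. `-tf.nn.top_k(-A).values`, which is the same as `tf.negative(tf.nn.top_k(tf.negative(A)).values)`.)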


In [152]:
# top_k yields two tensors (values first, then their indices in the last dimension);
# iterating over the result prints both. The comprehension itself evaluates to [None, None].
topk = tf.constant([[1,2,3],[4,5,6],[7,8,9]])
print(topk.numpy(), end='\n\n')
[print(part.numpy(), end='\n\n') for part in tf.nn.top_k(topk, k=2)]

[[1 2 3]
 [4 5 6]
 [7 8 9]]

[[3 2]
 [6 5]
 [9 8]]

[[2 1]
 [2 1]
 [2 1]]



[None, None]

Always goes to the last dimension (obviously) to fetch the values, and keeps dimensions intact.

In [163]:
# Integers 0..23 arranged as a [2,3,4] tensor. Built with tf.range + tf.reshape
# so the cell does not depend on `np`, which no visible cell of this notebook imports
# (the np.arange version raises NameError on a fresh kernel).
topk = tf.reshape(tf.range(24, dtype=tf.int64), [2,3,4])
print(topk.numpy(),end='\n\n')
# k=1: only the maximum of every innermost row is kept; values are printed first, then indices
[print(tp.numpy(),end='\n\n') for tp in tf.nn.top_k(topk, 1)]

[[[ 0  1  2  3]
  [ 4  5  6  7]
  [ 8  9 10 11]]

 [[12 13 14 15]
  [16 17 18 19]
  [20 21 22 23]]]

[[[ 3]
  [ 7]
  [11]]

 [[15]
  [19]
  [23]]]

[[[3]
  [3]
  [3]]

 [[3]
  [3]
  [3]]]



[None, None]

And now the negative case (minimum values), on a higher-rank tensor this time:

In [166]:
# Integers 0..47 arranged as a [2,2,3,4] tensor. Built with tf.range + tf.reshape
# so the cell does not depend on `np`, which no visible cell of this notebook imports
# (the np.arange version raises NameError on a fresh kernel).
topk = tf.reshape(tf.range(48, dtype=tf.int64), [2,2,3,4])
print(topk.numpy(),end='\n\n')
# Minimum via top_k: negate the input, take the top 1, negate the printed result back.
# (The indices are negated too when printed; they are all 0 here, so it does not show.)
[print(-tp.numpy(),end='\n\n') for tp in tf.nn.top_k(-topk, 1)]

[[[[ 0  1  2  3]
   [ 4  5  6  7]
   [ 8  9 10 11]]

  [[12 13 14 15]
   [16 17 18 19]
   [20 21 22 23]]]


 [[[24 25 26 27]
   [28 29 30 31]
   [32 33 34 35]]

  [[36 37 38 39]
   [40 41 42 43]
   [44 45 46 47]]]]

[[[[ 0]
   [ 4]
   [ 8]]

  [[12]
   [16]
   [20]]]


 [[[24]
   [28]
   [32]]

  [[36]
   [40]
   [44]]]]

[[[[0]
   [0]
   [0]]

  [[0]
   [0]
   [0]]]


 [[[0]
   [0]
   [0]]

  [[0]
   [0]
   [0]]]]



[None, None]

In [204]:
# Show the [5,2,3] `tkns` tensor again as a reference for the top_k cells that follow
print(tkns.numpy())

[[[-8 -1  0]
  [ 7  1 -6]]

 [[ 2 -1 -2]
  [ 8  2  8]]

 [[ 9  6  3]
  [-7  0  4]]

 [[ 5  2  4]
  [-9  7 -3]]

 [[-1 -2  6]
  [-3 -8 -6]]]


In [212]:
# NOTE(review): top_k runs on the negated tensor and the result is not negated
# back, so `min_values` holds the *negated* minimum of every innermost row
# (compare with the recorded output) -- confirm whether that is intended.
min_values, _ = tf.nn.top_k(-tkns, k=1)
# keep only the last row of every batch entry, re-adding an axis of size 1
print(min_values[:, -1, tf.newaxis].numpy())

[[[ 6]]

 [[-2]]

 [[ 7]]

 [[ 9]]

 [[ 8]]]


In [215]:
# k=1: the largest entry of every innermost row
values, _ = tf.nn.top_k(tkns, k=1)
print(values.numpy())
# keep only the last row of every batch entry, re-adding an axis of size 1
min_values = values[:, -1, None]
print(min_values.numpy())

[[[ 0]
  [ 7]]

 [[ 2]
  [ 8]]

 [[ 9]
  [ 4]]

 [[ 5]
  [ 7]]

 [[ 6]
  [-3]]]
[[[ 7]]

 [[ 8]]

 [[ 4]]

 [[ 7]]

 [[-3]]]


---

### TF Where

Reference [here](https://www.tensorflow.org/api_docs/python/tf/where).

Return the elements, either from x or y, depending on the condition.

### Ones/Zeros like

Reference [here](https://www.tensorflow.org/api_docs/python/tf/ones_like) and [here](https://www.tensorflow.org/api_docs/python/tf/zeros_like).

Return tensor filled with ones/zeros of the shape of tensor input.

### TF Cond

Reference [here](https://www.tensorflow.org/api_docs/python/tf/cond).

Implements conditionals in graphs (hell). Will be simplified in TF2.

Note: in order to pass arguments to the called functions, you need to build them as lambdas, as explained [on the Stack](https://stackoverflow.com/a/39573566). 

In [243]:
# 7-element vector used by the tf.where / tf.cond demo in this cell
whr = tf.constant([1,2,3,5,6,7,8])

def give_ones(whr):
    """Return `whr` with every entry that is not below 6 replaced by 1."""
    below_six = whr < 6
    ones = tf.ones_like(whr)
    return tf.where(below_six, whr, ones)
    
def give_zeros(whr):
    """Return `whr` with every entry that is not below 6 replaced by 0."""
    below_six = whr < 6
    zeros = tf.zeros_like(whr)
    return tf.where(below_six, whr, zeros)
        
# The predicates compare the vector's length explicitly. `whr.shape == <int>`
# compares the whole TensorShape object with an int and only acts as a length
# check through TensorShape's implicit conversion of the int.

# length is 7, not 6 -> predicate is false -> give_zeros
print(tf.cond(int(whr.shape[0]) == 6,
       lambda: give_ones(whr),
       lambda: give_zeros(whr)).numpy())

# length is 7 -> predicate is true -> give_ones
print(tf.cond(int(whr.shape[0]) == 7,
       lambda: give_ones(whr),
       lambda: give_zeros(whr)).numpy())

[1 2 3 5 0 0 0]
[1 2 3 5 1 1 1]


---

### TF While loop

Reference [here](https://www.tensorflow.org/api_docs/python/tf/while_loop).

In [378]:
def cond(*args):
    """Loop predicate that ignores its arguments and always allows another iteration."""
    keep_going = True
    return keep_going

def body(x, y, z):
    """Print the next three loop values followed by a blank line, and return them as a list."""
    # the result must have the same structure as the inputs,
    # because it is fed back in as (x, y, z) on the next iteration
    next_vars = [x-y-z, -x+y-z, x+y+z]
    print(*next_vars, end='\n\n')
    return next_vars
# Start from [1,2,3]; `cond` never ends the loop, so maximum_iterations=10 is what stops it
tf.while_loop(cond, body, [1,2,3], maximum_iterations=10, back_prop=False)

-4 -2 6

-8 -4 0

-4 4 -12

4 20 -12

-4 28 12

-44 20 36

-100 28 12

-140 116 -60

-196 316 -84

-428 596 36



[-428, 596, 36]

---

### TF Multinomial/Categorical

[Multinomial](https://www.tensorflow.org/api_docs/python/tf/random/multinomial): Draws samples from a multinomial distribution. (says it's deprecated but is the only one working in tf 1.12...).  
[Categorical](https://www.tensorflow.org/api_docs/python/tf/random/categorical): Draws samples from a categorical distribution.

Returns the index of each drawn sample, i.e. its class (in our case, which token/character was picked).

In [364]:
# Fake logits: normal samples shifted so the smallest is 0, nudged by 1e-10 so the
# log stays finite, then log-transformed (the smallest entry becomes log(1e-10)).
tflg = tf.random.normal(shape=[1,100], mean=0.0, stddev=2)
tflg = tf.log(tflg - tf.reduce_min(tflg) + 1e-10)
print(tflg.shape, tflg.numpy(), sep='\n')

(1, 100)
[[ 1.6614330e+00  1.8465611e+00  1.2419925e+00  1.8122669e+00
   1.8083012e+00  1.3323863e+00  2.1042039e+00  2.0606184e+00
   1.2898836e+00  1.2235551e+00  1.3590236e+00  2.0356238e+00
   1.9787459e+00  1.5589540e+00  1.8931404e+00  1.2853543e+00
   1.5432556e+00  1.2538865e+00  1.6112630e+00  1.6507741e+00
  -1.2851691e-02  1.9858840e+00  1.6952589e+00  2.1319540e+00
  -6.1222482e-01  1.9517276e+00  1.6481156e+00  1.3833491e+00
   1.3723737e+00  9.0254259e-01  1.5697762e+00  2.0009382e+00
  -1.6136631e-01  7.5235856e-01  1.3757418e+00  1.6696959e+00
   1.7772396e+00  1.2080791e+00  1.7911468e+00  1.9001979e+00
   1.2330620e+00  1.7922444e+00  1.7293684e+00  1.8890799e+00
   2.2396121e+00  1.7890160e+00  2.1262302e+00  1.9232343e+00
   1.5086039e+00  1.4677848e+00  8.3263904e-01  1.3818003e+00
   6.6127515e-01  2.0995927e+00  1.3393655e+00  1.6124418e+00
   1.4841000e+00  1.4615728e+00  1.9389315e+00  1.1315494e+00
   1.7599026e+00  1.3196810e+00  1.6478530e+00 -2.3025850e+01

In [367]:
# Draw one sample index from the logits (tf.multinomial is the same as tf.random.multinomial).
# The categorical equivalent, e.g. tf.random.categorical(tf.log([[10., 10.]]), 5),
# is not working in this TF version (it should in more recent ones).
print(tf.multinomial(tflg, 1).numpy(), end='\n\n')

[[3]]

