# GPT-2 
---


In [1]:
import tensorflow as tf

In [2]:
# Turn on eager execution so the cells below can evaluate tensors immediately
# and inspect them with `.numpy()`.
tf.enable_eager_execution()

## Softmax function

In [3]:
def softmax(x, axis=-1):
    """Softmax of `x` along `axis`, computed on max-shifted values.

    The maximum along `axis` is subtracted before exponentiating. This does
    not change the mathematical result (softmax is invariant to adding a
    constant along the reduced axis), but it makes every argument of `exp`
    less than or equal to zero.

    Args:
        x: input tensor.
        axis: axis along which the values are normalized (default: last).

    Returns:
        Tensor with the same shape as `x`; entries along `axis` sum to 1.
    """
    shifted = x - tf.reduce_max(x, axis=axis, keepdims=True)
    exponentials = tf.exp(shifted)
    normalizer = tf.reduce_sum(exponentials, axis=axis, keepdims=True)
    return exponentials / normalizer

Where does this business with `reduce_max` come from?

The regular softmax is computed using the tf built-in functions in [GPT-1](https://github.com/openai/finetune-transformer-lm/blob/master/train.py). 

[This](https://github.com/purple-worthy/shendu-xuexi-python/blob/f75a43cc98c0cb9c004ff4a6dfdfa72f41445143/_codes/my_tensorflow/src/activations/__init__.py#L49) and [that](https://github.com/yifannieumontreal/artifact/blob/81f78c9ad7b9c10cc13b53ccb2fa3fe30ed07405/lib_tf/tf_utils.py#L15) have this feature. One better reference: the [Keras implementation](https://github.com/keras-team/keras/blob/bd024a1fc1cd6d88e8bc5da148968ff5e079caeb/keras/activations.py#L14) has it in this form.  
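Or, rather, is it a form of normalization (pushing all values to zero or below, with the max value becoming exactly zero)? Most likely it is a numerical-stability trick: softmax is unchanged when the same constant is subtracted from every entry along the axis, and after subtracting the max every argument passed to `exp` is ≤ 0, so `exp` cannot overflow.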

In [4]:
# Step-by-step walk-through of the shifted softmax on a small 3x3 example.
x = tf.constant([[1., 2., 3.],
                 [4., 5., 6.],
                 [7., 8., 9.]])

# Intermediate quantities, computed once and reused in the report below.
x_row_max = tf.reduce_max(x, axis=-1, keepdims=True)
x2 = x - x_row_max
ex = tf.exp(x2)
x_manual = (ex / tf.reduce_sum(ex, axis=-1, keepdims=True)).numpy()
x_custom = softmax(x).numpy()

x_report = [
    ('x', x.numpy()),
    ('innermost max', x_row_max.numpy()),
    ('x2: x - max', x2.numpy()),
    ('ex: exp(x2)', ex.numpy()),
    ('ex/sum(ex)', x_manual),
    ('softmax', x_custom),
    # Element-wise exact comparison of the manual steps with softmax().
    ('all equal?', x_custom == x_manual),
]
for label, value in x_report:
    print(label, value, sep='\n', end='\n----------\n')

# Final section: TensorFlow's built-in softmax, printed without a separator.
print('and instead the regular softmax', tf.nn.softmax(x).numpy(), sep='\n')

x
[[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]
----------
innermost max
[[3.]
 [6.]
 [9.]]
----------
x2: x - max
[[-2. -1.  0.]
 [-2. -1.  0.]
 [-2. -1.  0.]]
----------
ex: exp(x2)
[[0.13533528 0.36787945 1.        ]
 [0.13533528 0.36787945 1.        ]
 [0.13533528 0.36787945 1.        ]]
----------
ex/sum(ex)
[[0.09003057 0.24472848 0.66524094]
 [0.09003057 0.24472848 0.66524094]
 [0.09003057 0.24472848 0.66524094]]
----------
softmax
[[0.09003057 0.24472848 0.66524094]
 [0.09003057 0.24472848 0.66524094]
 [0.09003057 0.24472848 0.66524094]]
----------
all equal?
[[ True  True  True]
 [ True  True  True]
 [ True  True  True]]
----------
and instead the regular softmax
[[0.09003057 0.24472848 0.66524094]
 [0.09003057 0.24472848 0.66524094]
 [0.09003057 0.24472848 0.66524094]]


---
Same with another variable:

In [5]:
# Same walk-through on a 3x2 example whose row maxima sit in different columns.
y = tf.constant([[1., 1.],
                 [3., 2.],
                 [5., 9.]])

# Intermediate quantities, computed once and reused in the report below.
y_row_max = tf.reduce_max(y, axis=-1, keepdims=True)
y2 = y - y_row_max
exy = tf.exp(y2)
y_manual = (exy / tf.reduce_sum(exy, axis=-1, keepdims=True)).numpy()
y_custom = softmax(y).numpy()

y_report = [
    ('y', y.numpy()),
    ('innermost max', y_row_max.numpy()),
    ('y2: y - max', y2.numpy()),
    ('exy: exp(y2)', exy.numpy()),
    ('exy/sum(exy)', y_manual),
    ('softmax', y_custom),
]
for label, value in y_report:
    print(label, value, sep='\n', end='\n----------\n')

# Element-wise exact comparison; last section, so no trailing separator.
print('all equal?', y_custom == y_manual, sep='\n')

y
[[1. 1.]
 [3. 2.]
 [5. 9.]]
----------
innermost max
[[1.]
 [3.]
 [9.]]
----------
y2: y - max
[[ 0.  0.]
 [ 0. -1.]
 [-4.  0.]]
----------
exy: exp(y2)
[[1.         1.        ]
 [1.         0.36787945]
 [0.01831564 1.        ]]
----------
exy/sum(exy)
[[0.5        0.5       ]
 [0.7310586  0.26894143]
 [0.01798621 0.98201376]]
----------
softmax
[[0.5        0.5       ]
 [0.7310586  0.26894143]
 [0.01798621 0.98201376]]
----------
all equal?
[[ True  True]
 [ True  True]
 [ True  True]]


---
Hunch: maybe it is a performance issue, with the current function being faster than the [built-in one](https://www.tensorflow.org/api_docs/python/tf/nn/softmax)? Given the tests below, this does not look like the case: the built-in `tf.nn.softmax` is consistently faster than the hand-written function.

In [6]:
%%timeit

# Benchmark the hand-written softmax on the 3x3 tensor `x`.
softmax(x)

80.7 µs ± 2.71 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [7]:
%%timeit
# Baseline: TensorFlow's built-in softmax on the same 3x3 tensor `x`.
tf.nn.softmax(x)

11.2 µs ± 349 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


Perhaps the result is different with a larger tensor?

In [8]:
# Rank-3 test tensor of shape (4, 3, 2) with widely spread values (stddev=20).
# The op-level seed makes the draw repeatable when the notebook is run from a
# fresh kernel; re-running this cell in a live kernel may still advance the
# generator and yield different values.
t1 = tf.random.normal([4, 3, 2],
                      mean=0.0,
                      stddev=20,
                      dtype=tf.float32,
                      seed=0)
print(t1)

tf.Tensor(
[[[ 14.639416    12.471731  ]
  [  6.287172   -10.907554  ]
  [-29.439354    -0.95316064]]

 [[-15.58526      6.7059565 ]
  [ 48.06688    -12.80599   ]
  [ 30.347822    25.95503   ]]

 [[  3.0517344    5.536769  ]
  [  6.2067804   19.347296  ]
  [-24.396057    19.345951  ]]

 [[ 31.135847   -21.138882  ]
  [  4.2857533  -42.758995  ]
  [-52.921906    16.973293  ]]], shape=(4, 3, 2), dtype=float32)


In [9]:
%%timeit
# Benchmark the hand-written softmax on the (4, 3, 2) tensor `t1`.
softmax(t1)

87.1 µs ± 7.59 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [10]:
%%timeit
# Baseline: TensorFlow's built-in softmax on the same (4, 3, 2) tensor `t1`.
tf.nn.softmax(t1)

12.9 µs ± 1.1 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [11]:
# Larger rank-5 test tensor of shape (36, 10, 4, 3, 2) for the timing runs.
# The op-level seed makes the draw repeatable when the notebook is run from a
# fresh kernel; re-running this cell in a live kernel may still advance the
# generator and yield different values.
t2 = tf.random.normal([36, 10, 4, 3, 2],
                      mean=0.0,
                      stddev=20,
                      dtype=tf.float32,
                      seed=1)

In [12]:
%%timeit
# Benchmark the hand-written softmax on the (36, 10, 4, 3, 2) tensor `t2`.
softmax(t2)

521 µs ± 50.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [13]:
%%timeit
# Baseline: TensorFlow's built-in softmax on the same tensor `t2`.
tf.nn.softmax(t2)

189 µs ± 10.7 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
