# GPT-2 
---

In [4]:
import tensorflow as tf

In [5]:
# Enable eager execution so tensors hold concrete values right away; the
# `.numpy()` calls inside conv1d_2 further down depend on it.
tf.enable_eager_execution()

In [11]:
def shape_list(x):
    """Return the shape of ``x`` as a list, preferring statically known sizes.

    For every dimension, the value from ``x.shape.as_list()`` is used when it
    is not ``None``; otherwise the matching element of ``tf.shape(x)`` is
    used in its place.
    """
    known_sizes = x.shape.as_list()
    runtime_sizes = tf.shape(x)

    dims = []
    for axis, size in enumerate(known_sizes):
        if size is not None:
            dims.append(size)
        else:
            # Size unknown statically: fall back to the runtime value.
            dims.append(runtime_sizes[axis])
    return dims

## 1D Convolution

Andrew Ng has an illustration of a 1D Convolution for CNNs in [this video](https://www.youtube.com/watch?v=vcp0XvDAX68), which can surely be a useful refresher.

Speed seems to be the main motivation for not using [tf.nn.conv1d](https://www.tensorflow.org/api_docs/python/tf/nn/conv1d) here; cf. a [comment in the GPT-1 code](https://github.com/openai/finetune-transformer-lm/blob/a69b5c43b0452462890bca8ff92fb75dee9290cf/train.py#L111).

In [2]:
def conv1d(x, scope, nf, *, w_init_stdev=0.02):
    """Apply a learned affine map to the innermost dimension of ``x``.

    ``x`` is flattened to two dimensions, multiplied by the weight matrix,
    offset by the bias, and reshaped back so that only the innermost
    dimension changes (from ``nx`` to ``nf``).

    Args:
        x: Input tensor; its last dimension is the input feature size.
        scope: Variable scope in which 'w' and 'b' are requested.
        nf: Size of the innermost dimension of the result.
        w_init_stdev: Standard deviation of the normal initializer for 'w'.

    Returns:
        A tensor whose leading dimensions match ``x`` and whose last
        dimension is ``nf``.
    """
    with tf.variable_scope(scope):
        *leading_dims, nx = shape_list(x)

        weight_init = tf.random_normal_initializer(stddev=w_init_stdev)
        w = tf.get_variable('w', [1, nx, nf], initializer=weight_init)
        bias_init = tf.constant_initializer(0)
        b = tf.get_variable('b', [nf], initializer=bias_init)

        # Collapse all leading dimensions so a single 2-D matmul does the work.
        flat_x = tf.reshape(x, [-1, nx])
        flat_w = tf.reshape(w, [-1, nf])
        projected = tf.matmul(flat_x, flat_w) + b

        # Restore the leading dimensions around the new innermost size.
        return tf.reshape(projected, leading_dims + [nf])

In [67]:
def conv1d_2(x, scope, nf, *, w_init_stdev=0.02):
    """Step-by-step, printing variant of ``conv1d`` for exploration.

    Performs the same flatten -> matmul -> add bias -> reshape sequence as
    ``conv1d`` while printing the shapes (and some values) of the
    intermediate tensors. The ``.numpy()`` calls assume tensors can be
    evaluated immediately (eager execution).

    Args:
        x: Input tensor; its last dimension is the input feature size.
        scope: Variable scope in which 'w' and 'b' are requested.
        nf: Size of the innermost dimension of the result.
        w_init_stdev: Standard deviation of the normal initializer for 'w'.

    Returns:
        A tensor whose leading dimensions match ``x`` and whose last
        dimension is ``nf``.
    """
    divider = 10*'-'

    with tf.variable_scope(scope):

        # Split the shape into the leading dimensions and the innermost one.
        *start, nx = shape_list(x)

        print('x shape:', x.shape)
        print('innermost dim (nx):', nx)
        print('outside dims (start):', start)
        print(divider)

        # Trainable weight and bias.
        # Why is w created as [1, nx, nf] rather than [nx, nf]? This seems to
        # be a remnant of the GPT-1 architecture, where the 1 was a variable
        # passed to the whole function.
        weight_init = tf.random_normal_initializer(stddev=w_init_stdev)
        w = tf.get_variable('w', [1, nx, nf], initializer=weight_init)

        bias_init = tf.constant_initializer(0)
        b = tf.get_variable('b', [nf], initializer=bias_init)

        # Flatten both operands to 2-D:
        #   x, e.g. [10,5,2,5] -> [100, 5] (10*5*2 rows, innermost dim kept)
        #   w, e.g. [1,5,3]    -> [5, 3]
        flat_x = tf.reshape(x, [-1, nx])
        flat_w = tf.reshape(w, [-1, nf])

        print('w shape:', w.shape)
        print('w:\n', w.numpy())
        print(divider)

        print('w_reshaped:', flat_w.shape)
        print('w_reshaped:\n', flat_w.numpy())
        print(divider)

        # 2-D product, then bias and restoration of the leading dimensions.
        product = tf.matmul(flat_x, flat_w)
        result = tf.reshape(product + b, start + [nf])

        print('x reshaped:', flat_x.shape)
        print('x (first 5 values):\n', flat_x.numpy()[:5])
        print(divider)

        print('w2 (matmul(x_re, w_re)) shape:', product.shape)
        print('c shape:', result.shape)

        return result

In [12]:
# Toy 4-D input for the demo. No initializer is passed, so the values come from
# whatever tf.get_variable uses by default.
x = tf.get_variable('x', [10,5,2,5])
# Bare expression on the last line: shows the variable as the cell output.
x

<tf.Variable 'x:0' shape=(10, 5, 2, 5) dtype=float32, numpy=
array([[[[-0.00455239,  0.06739672, -0.04978145,  0.06634033,
           0.08138317],
         [-0.02852962, -0.03153922, -0.03298388, -0.0563105 ,
           0.08055933]],

        [[-0.003591  ,  0.11244377, -0.06798231, -0.06322473,
           0.10237655],
         [-0.11534518, -0.09949131,  0.11689581,  0.02389619,
          -0.01095393]],

        [[-0.05434945, -0.05802821, -0.05542954, -0.0882737 ,
          -0.05307717],
         [ 0.03167467,  0.02818625,  0.0651602 , -0.10278717,
           0.11627939]],

        [[-0.07831947,  0.03970619,  0.05314769,  0.12612824,
          -0.10867291],
         [-0.05633794, -0.06827946,  0.06636262, -0.04513508,
           0.05086902]],

        [[-0.09135126, -0.07113906,  0.11304821, -0.04278421,
           0.04237419],
         [ 0.00776525, -0.04959308,  0.04468486,  0.00835571,
           0.00746669]]],


       [[[ 0.00972158,  0.12440111,  0.1176756 , -0.10291337,
     

In [68]:
# Map the innermost dimension of x (size 5) to 3 output features; conv1d_2
# prints its intermediate shapes and values, and the returned tensor is shown
# as the cell output.
conv1d_2(x, 'x_conv1d_2', 3)

x shape: (10, 5, 2, 5)
innermost dim (nx): 5
outside dims (start): [10, 5, 2]
----------
w shape: (1, 5, 3)
w:
 [[[-0.00268271 -0.00696958 -0.01017623]
  [ 0.01068275 -0.01160656  0.01048357]
  [-0.01308253 -0.01872865  0.0032939 ]
  [-0.02599971  0.00395536 -0.02325181]
  [ 0.02176557 -0.00607897  0.00708164]]]
----------
w_reshaped: (5, 3)
w_reshaped:
 [[-0.00268271 -0.00696958 -0.01017623]
 [ 0.01068275 -0.01160656  0.01048357]
 [-0.01308253 -0.01872865  0.0032939 ]
 [-0.02599971  0.00395536 -0.02325181]
 [ 0.02176557 -0.00607897  0.00708164]]
----------
x reshaped: (100, 5)
x (first 5 values):
 [[-0.00455239  0.06739672 -0.04978145  0.06634033  0.08138317]
 [-0.02852962 -0.03153922 -0.03298388 -0.0563105   0.08055933]
 [-0.003591    0.11244377 -0.06798231 -0.06322473  0.10237655]
 [-0.11534518 -0.09949131  0.11689581  0.02389619 -0.01095393]
 [-0.05434945 -0.05802821 -0.05542954 -0.0882737  -0.05307717]]
----------
w2 (matmul(x_re, w_re)) shape: (100, 3)
c shape: (10, 5, 2, 3)


<tf.Tensor: id=1199, shape=(10, 5, 2, 3), dtype=float32, numpy=
array([[[[ 1.42998330e-03, -5.05017233e-05, -3.77297343e-04],
         [ 3.38859949e-03,  4.70199564e-04,  1.73084741e-03]],

        [[ 5.97233046e-03, -8.79260188e-04,  3.18651064e-03],
         [-3.14240810e-03, -6.95355120e-05, -1.17402691e-04]],

        [[ 1.39089755e-03,  2.06392165e-03,  1.43880013e-03],
         [ 4.56699589e-03, -2.88168574e-03,  3.40123125e-03]],

        [[-5.70565183e-03,  2.49118049e-04, -2.31396873e-03],
         [ 8.34227365e-04, -5.45497693e-04,  1.48579560e-03]],

        [[ 4.08274936e-05, -1.08169974e-03,  1.85107917e-03],
         [-1.18994224e-03, -3.27742717e-04, -5.93154982e-04]]],


       [[[ 3.80771467e-03, -4.50483616e-03,  4.43106890e-03],
         [-7.41416775e-03, -6.62053179e-04, -4.40531410e-03]],

        [[-1.61367352e-03,  3.27301328e-03, -2.58999318e-03],
         [ 1.57999992e-03,  8.01340619e-04,  4.29695734e-04]],

        [[ 3.20529332e-03,  3.03030759e-03,  1.98293

---

### Mini-reminder

Always good to keep in mind: when multiplying matrices $A \times B$, the 'inner dimensions' (the columns of A and the rows of B) must agree, and the 'outer dimensions' (the rows of A and the columns of B) give the shape of the resulting matrix. For example, a (100, 5) matrix times a (5, 3) matrix yields a (100, 3) matrix, exactly as in the `conv1d_2` output above. Cf. two pics purloined [from here](https://www.mathbootcamps.com/multiplying-matrices/) and [from there](http://www.javatechblog.com/java/java-program-for-matrix-multiplication/) respectively:  
![Matmul](matrix-product-is-defined.jpg "Matrix multiplication recap") ![Matmul2](matrix-multiplication.jpg "Matrix multiplication recap 2")

---

### Tensor contraction

Another question that springs to mind: what if you wanted to multiply two tensors directly, without reshaping them first? There is [a discussion on Math Stack Exchange](https://math.stackexchange.com/questions/63074/is-there-a-3-dimensional-matrix-by-matrix-product) about this operation, which is technically called [tensor contraction](https://en.wikipedia.org/wiki/Tensor_contraction).  
It is reassuring to see that there also exists [a TF implementation](https://www.tensorflow.org/api_docs/python/tf/tensordot).