# GPT-2 
---

In [4]:
import tensorflow as tf

In [5]:
# conv1d_2 below prints tensor values via `.numpy()`; this presumably relies on
# eager mode being switched on here, before any other TensorFlow call.
tf.enable_eager_execution()

In [11]:
def shape_list(x):
    """Return the shape of ``x`` as a list, preferring statically known sizes.

    Every entry of ``x.shape.as_list()`` that is not ``None`` is returned
    unchanged; each ``None`` entry is replaced by the element of
    ``tf.shape(x)`` at the same position.
    """
    known_dims = x.shape.as_list()
    runtime_dims = tf.shape(x)

    dims = []
    for axis, size in enumerate(known_dims):
        # Only fall back to the dynamic shape where the static one is unknown.
        dims.append(runtime_dims[axis] if size is None else size)
    return dims

## 1D Convolution

Andrew Ng illustrates 1D convolutions for CNNs in [this video](https://www.youtube.com/watch?v=vcp0XvDAX68), which is a useful refresher.

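Speed seems to be the main motivation here for not using [tf.nn.conv1d](https://www.tensorflow.org/api_docs/python/tf/nn/conv1d); cf. a [comment in the GPT-1 code](https://github.com/openai/finetune-transformer-lm/blob/a69b5c43b0452462890bca8ff92fb75dee9290cf/train.py#L111). With a kernel of size 1, the convolution reduces to a matrix multiplication over the innermost dimension, which is what `conv1d` below implements.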

In [2]:
def conv1d(x, scope, nf, *, w_init_stdev=0.02):
    """Project the innermost dimension of ``x`` to size ``nf`` with a single matmul.

    Args:
        x: input tensor; its innermost dimension (``nx``) is the one projected.
        scope: variable scope under which the variables 'w' and 'b' are created.
        nf: size of the innermost dimension of the result.
        w_init_stdev: standard deviation of the normal initializer used for 'w'.

    Returns:
        A tensor with the same outer dimensions as ``x`` and innermost
        dimension ``nf``.
    """
    with tf.variable_scope(scope):
        *outer_dims, nx = shape_list(x)

        weight_init = tf.random_normal_initializer(stddev=w_init_stdev)
        w = tf.get_variable('w', [1, nx, nf], initializer=weight_init)
        bias_init = tf.constant_initializer(0)
        b = tf.get_variable('b', [nf], initializer=bias_init)

        # Collapse all outer dimensions (and the leading 1 of w) so the
        # projection is one 2-D matrix product.
        x_2d = tf.reshape(x, [-1, nx])
        w_2d = tf.reshape(w, [-1, nf])
        projected = tf.matmul(x_2d, w_2d) + b

        # Restore the outer dimensions around the new innermost size.
        return tf.reshape(projected, outer_dims + [nf])

In [98]:
def conv1d_2(x, scope, nf, *, w_init_stdev=0.02, verbose=True):
    """Annotated variant of `conv1d` that creates the weight directly as [nx, nf].

    `conv1d` creates 'w' with shape [1, nx, nf] and reshapes it to [nx, nf]
    before the matmul; here 'w' is created as [nx, nf] so that reshape is not
    needed.

    Args:
        x: input tensor; its innermost dimension (``nx``) is the one projected.
        scope: variable scope under which the variables 'w' and 'b' are created.
        nf: size of the innermost dimension of the result.
        w_init_stdev: standard deviation of the normal initializer used for 'w'.
        verbose: when True (the default), print the shapes and values of the
            intermediates. Printing values calls ``.numpy()`` on 'w' and on the
            flattened input; pass False to skip all printing and those calls.

    Returns:
        A tensor with the same outer dimensions as ``x`` and innermost
        dimension ``nf``.
    """
    with tf.variable_scope(scope):

        # Split off the innermost dimension; `start` holds the outer ones.
        *start, nx = shape_list(x)

        if verbose:
            print('x shape:', x.shape)
            print('innermost dim (nx):', nx)
            print('outside dims (start):', start)
            print(10*'-')

        # Weight & bias for training.
        # The leading 1 in conv1d's [1, nx, nf] weight shape seems to be a
        # remnant of the GPT-1 architecture, where it was a variable passed to
        # the whole function, so it is dropped here.
        w = tf.get_variable('w',
                            [nx, nf],
                            initializer=tf.random_normal_initializer(stddev=w_init_stdev))

        b = tf.get_variable('b',
                            [nf],
                            initializer=tf.constant_initializer(0))

        # Flatten all outer dimensions into one
        # (e.g. [10,5,2,5] becomes [100, 5], i.e. [10x5x2] and innermost dim).
        x_reshaped = tf.reshape(x, [-1, nx])

        if verbose:
            print('w shape:', w.shape)
            print('w:\n', w.numpy())
            print(10*'-')

        # w2 is the product x_reshaped @ w (not a second weight); adding the
        # bias and restoring the outer dimensions gives the result.
        w2 = tf.matmul(x_reshaped, w)
        c = tf.reshape(w2+b, start+[nf])

        if verbose:
            print('x reshaped:', x_reshaped.shape)
            print('x (first 5 values):\n', x_reshaped.numpy()[:5])
            print(10*'-')

            print('w2 (matmul(x_re, w_re)) shape:', w2.shape)
            print('c shape:', c.shape)

        return c

In [88]:
# Example input: a 4-D variable whose innermost dimension (5) is the one the
# conv1d functions project.
x = tf.get_variable('x', shape=[10, 5, 2, 5])
x

<tf.Variable 'x:0' shape=(10, 5, 2, 5) dtype=float32, numpy=
array([[[[ 0.07658765,  0.04895405,  0.124433  , -0.05023674,
           0.04844861],
         [ 0.12777634,  0.12834941, -0.01176266,  0.00315703,
           0.04139599]],

        [[ 0.06210099, -0.11199301,  0.04728961,  0.0943329 ,
          -0.06406869],
         [-0.07931602,  0.00194503,  0.10381359,  0.12318434,
          -0.059056  ]],

        [[ 0.10104392,  0.05950567,  0.05337623,  0.1072495 ,
          -0.06844246],
         [-0.06089064,  0.00720017, -0.09416069,  0.04207127,
           0.06000267]],

        [[-0.0563634 ,  0.06557035,  0.04362001, -0.07150251,
          -0.08896986],
         [-0.06255972,  0.07716727, -0.00532429, -0.01867719,
           0.07079968]],

        [[ 0.10159498, -0.0923056 , -0.02886729,  0.08614555,
           0.06152003],
         [-0.1159213 ,  0.00027074, -0.12597306,  0.04455833,
          -0.07581854]]],


       [[[-0.01885048,  0.01167329, -0.10316488, -0.03801158,
     

In [102]:
# Project the innermost dimension of x to size 3 with conv1d.
xconv1 = conv1d(x, scope='x_conv1d', nf=3)
print(xconv1.shape)

(10, 5, 2, 3)


In [99]:
# Same projection with conv1d_2, which also prints its intermediates.
xconv2 = conv1d_2(x, scope='x_conv1d_2', nf=3)

x shape: (10, 5, 2, 5)
innermost dim (nx): 5
outside dims (start): [10, 5, 2]
----------
w shape: (5, 3)
w:
 [[-0.07940796  0.00197585  0.00018248]
 [-0.00122991  0.0344775  -0.01567264]
 [ 0.00232023  0.0014641   0.00430911]
 [ 0.01905786  0.04127663 -0.00423453]
 [-0.00066937  0.00336457  0.02230392]]
----------
x reshaped: (100, 5)
x (first 5 values):
 [[ 0.07658765  0.04895405  0.124433   -0.05023674  0.04844861]
 [ 0.12777634  0.12834941 -0.01176266  0.00315703  0.04139599]
 [ 0.06210099 -0.11199301  0.04728961  0.0943329  -0.06406869]
 [-0.07931602  0.00194503  0.10381359  0.12318434 -0.059056  ]
 [ 0.10104392  0.05950567  0.05337623  0.1072495  -0.06844246]]
----------
w2 (matmul(x_re, w_re)) shape: (100, 3)
c shape: (10, 5, 2, 3)
