# Easily export jupyter cells to python module
https://github.com/fastai/course-v3/blob/master/nbs/dl2/notebook2script.py

In [None]:
! python /tf/src/scripts/notebook2script.py embedding.ipynb

In [1]:
#export
from new_model import *
from encoder import get_encoder
import math
import tqdm
import os
import tensorflow as tf
import numpy as np

tf.__version__

'2.0.0-beta1'

In [2]:
cd /tf/src/data/gpt-2

/tf/src/data/gpt-2


In [None]:
! pip3 install -r requirements.txt

In [None]:
! python3 download_model.py 117M

In [6]:
#export
def top_k_logits(logits, k):
    if k == 0:
        # no truncation
        return logits

    def _top_k():
        values, _ = tf.nn.top_k(logits, k=k)
        min_values = values[:, -1, tf.newaxis]
        return tf.compat.v1.where(
            logits < min_values,
            tf.ones_like(logits, dtype=logits.dtype) * -1e10,
            logits,
        )
    return tf.cond(
       pred=tf.equal(k, 0),
       true_fn=lambda: logits,
       false_fn=lambda: _top_k(),
    )


def sample_sequence(*, hparams, length, start_token=None, batch_size=None, context=None, past=None, temperature=1, top_k=0):
    if start_token is None:
        assert context is not None, 'Specify exactly one of start_token and context!'
    else:
        assert context is None, 'Specify exactly one of start_token and context!'
        context = tf.fill([batch_size, 1], start_token)

    def step(hparams, tokens, past=None):
        lm_output = model(hparams=hparams, X=tokens, past=past, reuse=tf.compat.v1.AUTO_REUSE)

        logits = lm_output['logits'][:, :, :hparams.n_vocab]
        presents = lm_output['present']
        presents.set_shape(past_shape(hparams=hparams, batch_size=batch_size))
        return {
            'logits': logits,
            'presents': presents,
            'hidden_state': lm_output['hidden_state'],
            'clf_h': lm_output['clf_h'],
            'clf_logits': lm_output['clf_logits']
        }

    def body(past, prev, output, embedding):
        next_outputs = step(hparams, prev, past=past)
        logits = next_outputs['logits'][:, -1, :]  / tf.cast(temperature, dtype=tf.float32)
        logits = top_k_logits(logits, k=top_k)
        samples = tf.random.categorical(logits=logits, num_samples=1, dtype=tf.int32)
        return [
            next_outputs['presents'] if past is None else tf.concat([past, next_outputs['presents']], axis=-2),
            samples,
            tf.concat([output, samples], axis=1),
            next_outputs['hidden_state'],
            next_outputs['clf_h'],
            next_outputs['clf_logits']
        ]

    past, prev, output, h, clf_h, clf_logits = body(past, context, context, context)

    def cond(*args):
        return True

    return output, h, clf_h, clf_logits

In [7]:
#export
class Embedder:
    def __init__(self, chkpt_path, chunk_size):
        tf.compat.v1.disable_eager_execution()
        self.g = tf.Graph()
        with self.g.as_default():
            self.context = tf.compat.v1.placeholder(tf.int32, [1, None])

        self.sess = tf.compat.v1.Session(graph=self.g)
    
        self.MAX_CHUNK = chunk_size
        self.enc = get_encoder("117M", "models")
        hparams = default_hparams()
        with self.g.as_default():
            self.output, self.hidden_state, self.clf_h, self.clf_logits = sample_sequence(
                hparams=hparams, length=None,
                context=self.context,
                past=None,
                batch_size=1,
                temperature=1, top_k=1
            )
        
        if chkpt_path is not None:
            self.restore(chkpt_path)
            
    def restore(self, chkpt_path):
        with self.g.as_default():
            saver = tf.compat.v1.train.Saver()
            chkpt = tf.train.latest_checkpoint(chkpt_path)
            saver.restore(self.sess, chkpt)
        
    def __call__(self, method):
        with self.g.as_default():
            enc_meth = self.enc.encode(method)
            context_tokens = enc_meth[:self.MAX_CHUNK]

            _, h, clf_h, clf_logits = self.sess.run([self.output, self.hidden_state, self.clf_h, self.clf_logits], feed_dict={
                self.context: [context_tokens]
            }, options = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom = True))
                        
#             for tok in enc_meth[self.MAX_CHUNK:]:
#                 context_tokens.append(tok)
#                 context_tokens.pop(0)
                
#                 _, h_ = self.sess.run([self.output, self.hidden_state], feed_dict={
#                     self.context: [context_tokens]
#                 }, options = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom = True))
#                 h = np.append(h, h_[None, :, -1], axis = 1)

            print(clf_h.shape, clf_logits.shape)

            return h #np.squeeze(h)

In [None]:
a = np.array([[[1, 2, 3], [1, 2, 3]]])
b = np.array([[[1, 2, 4], [4, 5, 6]]])
# b = b[None, :, -1]
# b[None, :, -1].shape
# b.shape
a = np.append(a, b[None, :, -1], axis = 1)
a

In [None]:
#export
def generate_embeddings_from_files(chkpt_path, ds_path, samples = None, MAX_CHUNK = 1024):
    embd = Embedder(chkpt_path, MAX_CHUNK)
    features = []
    for i, fname in enumerate(tqdm.tqdm(os.listdir(ds_path))):
        if samples is not None:
            if i >= samples: break
        
        with open(os.path.join(ds_path, fname)) as f:
            method = f.read()
            features.append(embd(method)) #np.sum(embd(method), axis = 0)
            
    return features

In [None]:
#export
def generate_embeddings_from_list(chkpt_path, methods, out_path, samples = None, MAX_CHUNK = 1024):
    embd = Embedder(chkpt_path, MAX_CHUNK)
    shape = (len(methods), MAX_CHUNK, 768)
    if samples is not None:
        shape = (samples, MAX_CHUNK, 768)
    
    features = np.memmap(out_path, dtype='float32', mode='w+', shape = shape)
    for i, method in enumerate(tqdm.tqdm(methods)):
        if samples is not None:
            if i >= samples: break
                
        features[i] = np.squeeze(
            tf.keras.preprocessing.sequence.pad_sequences(embd(method), MAX_CHUNK, dtype='float32', padding='post')
        )
            #np.sum(embd(method), axis = 0)
            
    return features

In [None]:
#export
# Generates embeddings from multiple text files where each method
# is on a different line in each file
def generate_embeddings_from_text_files(chkpt_path, files, out_path, samples = None, MAX_CHUNK = 1024):
    embd = Embedder(chkpt_path, MAX_CHUNK)
#     shape = (len(methods), MAX_CHUNK, 768)
    if samples is not None:
        shape = (samples, MAX_CHUNK, 768)
    
    features = np.memmap(out_path, dtype='float32', mode='w+', shape = shape)
    for _, file in enumerate(files):
        with open(file) as f:
            for i, method in enumerate(tqdm.tqdm(f.readlines())):
                if samples is not None:
                    if i >= samples: break
                        
                features[i] = np.squeeze(
                    tf.keras.preprocessing.sequence.pad_sequences(embd(method), MAX_CHUNK, dtype='float32', padding='post')
                ) #np.sum(embd(method), axis = 0)
            
    return features

In [None]:
features = generate_embeddings("/tf/src/data/gpt-2/checkpoint/run1",
                               "/tf/src/data/methods/DATA00M_[god-r]/test",
                               samples = 10
                              )

len(features)

In [10]:
embd = Embedder(None, 1024)
path = "/tf/src/data/methods/DATA00M_[god-r]/test"

features = {}
for i, fname in enumerate(tqdm.tqdm(os.listdir(path))):
#     if i => 10000: break
    with open(os.path.join(path, fname)) as f:
        method = f.read()
        features[method] = embd(method)
        print(f"Feature {i} dims:", features[method].shape)

W0814 21:41:50.652522 139688576931648 nn_ops.py:4220] Large dropout rate: 0.9 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.
  0%|          | 0/121596 [00:00<?, ?it/s]


InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument: Input to reshape is a tensor with 137 values, but the requested shape requires a multiple of 2048
	 [[node model/Reshape (defined at /tf/src/fineTuning/new_model.py:184) ]]
	 [[model/Reshape_4/_45]]
  (1) Invalid argument: Input to reshape is a tensor with 137 values, but the requested shape requires a multiple of 2048
	 [[node model/Reshape (defined at /tf/src/fineTuning/new_model.py:184) ]]
0 successful operations.
0 derived errors ignored.

Errors may have originated from an input operation.
Input Source operations connected to node model/Reshape:
 Placeholder (defined at <ipython-input-7-a4cf76f8252f>:7)

Input Source operations connected to node model/Reshape:
 Placeholder (defined at <ipython-input-7-a4cf76f8252f>:7)

Original stack trace for 'model/Reshape':
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.6/dist-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelapp.py", line 505, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.6/dist-packages/tornado/platform/asyncio.py", line 148, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.6/asyncio/base_events.py", line 438, in run_forever
    self._run_once()
  File "/usr/lib/python3.6/asyncio/base_events.py", line 1451, in _run_once
    handle._run()
  File "/usr/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/usr/local/lib/python3.6/dist-packages/tornado/ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "/usr/local/lib/python3.6/dist-packages/tornado/ioloop.py", line 743, in _run_callback
    ret = callback()
  File "/usr/local/lib/python3.6/dist-packages/tornado/gen.py", line 781, in inner
    self.run()
  File "/usr/local/lib/python3.6/dist-packages/tornado/gen.py", line 742, in run
    yielded = self.gen.send(value)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 365, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/usr/local/lib/python3.6/dist-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 272, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/usr/local/lib/python3.6/dist-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py", line 542, in execute_request
    user_expressions, allow_stdin,
  File "/usr/local/lib/python3.6/dist-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python3.6/dist-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2848, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2874, in _run_cell
    return runner(coro)
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 3049, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 3214, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-d1e91cd04ef5>", line 1, in <module>
    embd = Embedder(None, 1024)
  File "<ipython-input-7-a4cf76f8252f>", line 20, in __init__
    temperature=1, top_k=1
  File "<ipython-input-6-fb43dc412a3f>", line 57, in sample_sequence
    past, prev, output, h, clf_h, clf_logits = body(past, context, context, context)
  File "<ipython-input-6-fb43dc412a3f>", line 44, in body
    next_outputs = step(hparams, prev, past=past)
  File "<ipython-input-6-fb43dc412a3f>", line 30, in step
    lm_output = model(hparams=hparams, X=tokens, past=past, reuse=tf.compat.v1.AUTO_REUSE)
  File "/tf/src/fineTuning/new_model.py", line 184, in model
    X = tf.reshape(X, [-1, hparams.n_ctx, 2])
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 7715, in reshape
    "Reshape", tensor=tensor, shape=shape, name=name)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py", line 788, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 3296, in create_op
    op_def=op_def)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py", line 1692, in __init__
    self._traceback = tf_stack.extract_stack()


In [None]:
# API look
def cross_entropy(ds, chkpt_path, MAX_CHUNK = 1024, bs = 1):
    # Generate permutations (TODO)
    
    
    with tf.compat.v1.Session(graph=tf.Graph()) as sess:
        context = tf.compat.v1.placeholder(tf.int32, [bs, None])
        # Generate predictions of model
        output  = model(hparams=default_hparams(), X=context)
        # Calculate cross entropy using tf library
        loss1    = tf.reduce_mean(
            input_tensor = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels = context[:, 1:], logits = output['logits'][:, :-1]
            )
        )
        loss2    = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels = context[:, 1:], logits = output['logits'][:, :-1]
        )
        
        # Load model
        saver = tf.compat.v1.train.Saver()
        ckpt = tf.train.latest_checkpoint(chkpt_path)
        saver.restore(sess, ckpt)
    
        # For each method:
        entropy = []
        for method in tqdm.tqdm(ds):
            enc_meth    = enc.encode(method)
            
            context_tokens = enc_meth[:MAX_CHUNK]
            val = sess.run(loss1, feed_dict={context: [context_tokens]})
            if not math.isnan(val):
                entropy.append(val)
#             for i in range(len(enc_meth) % MAX_CHUNK):
#                 context_tokens = enc_meth[MAX_CHUNK * i:MAX_CHUNK * (i + 1)]
#                 print(len(tok))
            for tok in enc_meth[MAX_CHUNK:]:
                context_tokens.append(tok)
                context_tokens.pop(0)
    
#                 # Need to recalculate this because this is not correct way (I think)
                val = sess.run(loss2, feed_dict={context: [context_tokens]})
                if not math.isnan(val[:, -1]):
                    entropy.append(val[:, -1])
        
        return entropy