In [None]:
import fire
import json
import os
import numpy as np
import tensorflow as tf
import tqdm
from tensorflow.core.protobuf import rewriter_config_pb2
import glob
import pickle

import model
from encoder import get_encoder

tf.__version__

In [None]:
cd ../data/gpt-2/

In [None]:
!pip3 install -r requirements.txt

# Sample from Model

In [None]:
def top_k_logits(logits, k):
    """Mask out every logit that is smaller than the k-th largest in its row.

    Args:
        logits: Logits tensor. It is indexed as ``values[:, -1, tf.newaxis]``,
            so it is treated as 2-D with candidates along the last axis.
        k: Number of top entries to keep per row. ``0`` disables truncation.

    Returns:
        ``logits`` itself when ``k == 0``; otherwise a tensor in which entries
        strictly below the row's k-th largest value are replaced by ``-1e10``
        (entries equal to that value are kept).
    """
    # Python-level shortcut: nothing to truncate.
    if k == 0:
        return logits

    def _mask_below_kth():
        top_values, _ = tf.nn.top_k(logits, k=k)
        # Smallest of the k retained values per row, kept as a column so it
        # broadcasts against `logits` in the comparison below.
        kth_value = top_values[:, -1, tf.newaxis]
        masked_out = tf.ones_like(logits, dtype=logits.dtype) * -1e10
        return tf.compat.v1.where(logits < kth_value, masked_out, logits)

    # Graph-level check as well, for the case where `k` is a tensor.
    return tf.cond(
        pred=tf.equal(k, 0),
        true_fn=lambda: logits,
        false_fn=_mask_below_kth,
    )


def sample_sequence(*, hparams, length, start_token=None, batch_size=None, context=None, past=None, temperature=1, top_k=0):
    """Run one forward pass of the model over the context and sample one token.

    Exactly one of ``start_token`` and ``context`` must be given.

    Args:
        hparams: Hyper-parameters passed to ``model.model`` and
            ``model.past_shape``; ``hparams.n_vocab`` bounds the logits.
        length: Accepted for interface compatibility; not used here.
        start_token: Token id used to build a ``[batch_size, 1]`` context
            when no ``context`` is supplied.
        batch_size: Batch size used for ``tf.fill`` and for the static shape
            set on the ``present`` tensor.
        context: Token tensor fed to the model.
        past: Optional past tensor passed to ``model.model``.
        temperature: Divisor applied to the last-position logits.
        top_k: Passed to ``top_k_logits``; ``0`` means no truncation.

    Returns:
        A tuple ``(output, past, hidden_state)``: the context with the
        sampled token appended along axis 1, the model's ``present`` tensor
        (concatenated onto ``past`` along axis -2 when ``past`` was given),
        and the model's ``hidden_state`` output.

    Raises:
        AssertionError: If both or neither of ``start_token`` and
            ``context`` are given.
    """
    if start_token is None:
        assert context is not None, 'Specify exactly one of start_token and context!'
    else:
        assert context is None, 'Specify exactly one of start_token and context!'
        context = tf.fill([batch_size, 1], start_token)

    def step(tokens, past=None):
        """Run the model on `tokens` and collect the tensors used below."""
        lm_output = model.model(hparams=hparams, X=tokens, past=past, reuse=tf.compat.v1.AUTO_REUSE)

        logits = lm_output['logits'][:, :, :hparams.n_vocab]
        presents = lm_output['present']
        presents.set_shape(model.past_shape(hparams=hparams, batch_size=batch_size))
        return {
            'logits': logits,
            'presents': presents,
            'hidden_state': lm_output['hidden_state']
        }

    def body(past, prev, output):
        """Sample one token from the logits at the last position of `prev`."""
        next_outputs = step(prev, past=past)
        logits = next_outputs['logits'][:, -1, :] / tf.cast(temperature, dtype=tf.float32)
        logits = top_k_logits(logits, k=top_k)
        samples = tf.random.categorical(logits=logits, num_samples=1, dtype=tf.int32)
        new_past = (
            next_outputs['presents'] if past is None
            else tf.concat([past, next_outputs['presents']], axis=-2)
        )
        return new_past, tf.concat([output, samples], axis=1), next_outputs['hidden_state']

    # Single step only: there is no sampling loop in this variant.
    past, output, hidden_state = body(past, context, context)

    return output, past, hidden_state

# Embedding Methods

In [None]:
import math
class Embedder:
    """Compute hidden-state embeddings for text with the graph built by ``sample_sequence``.

    The instance owns a private ``tf.Graph`` and a ``tf.compat.v1.Session``
    bound to it. Calling the instance encodes the text chunk by chunk and
    returns the hidden state fetched for the last chunk.
    """

    def __init__(self, chkpt_path, chunk_size, model_name="117M", models_dir="models"):
        """Build the graph and optionally restore weights.

        Args:
            chkpt_path: Directory searched with ``tf.train.latest_checkpoint``;
                ``None`` skips restoring.
            chunk_size: Maximum number of characters of input text encoded
                per session run. Must be a positive integer.
            model_name: First argument passed to ``get_encoder``.
            models_dir: Second argument passed to ``get_encoder``.

        Raises:
            ValueError: If ``chunk_size`` is not positive, or if no
                checkpoint is found under ``chkpt_path``.
        """
        if chunk_size is None or chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer, got %r" % (chunk_size,))

        # Global side effect: the session/placeholder API below needs graph mode.
        tf.compat.v1.disable_eager_execution()
        self.g = tf.Graph()
        with self.g.as_default():
            # Batch of exactly one sequence of arbitrary length.
            self.context = tf.compat.v1.placeholder(tf.int32, [1, None])

        self.sess = tf.compat.v1.Session(graph=self.g)

        self.MAX_CHUNK = chunk_size
        self.enc = get_encoder(model_name, models_dir)
        hparams = model.default_hparams()
        with self.g.as_default():
            self.output, self.past, self.hidden_state = sample_sequence(
                hparams=hparams, length=None,
                context=self.context,
                past=None,
                batch_size=1,
                temperature=1, top_k=1
            )

        if chkpt_path is not None:
            self.restore(chkpt_path)

    def restore(self, chkpt_path):
        """Restore variables from the latest checkpoint found in ``chkpt_path``.

        Raises:
            ValueError: If ``chkpt_path`` contains no checkpoint.
        """
        with self.g.as_default():
            saver = tf.compat.v1.train.Saver()
            chkpt = tf.train.latest_checkpoint(chkpt_path)
            if chkpt is None:
                raise ValueError("No checkpoint found in %r" % (chkpt_path,))
            saver.restore(self.sess, chkpt)

    def close(self):
        """Close the underlying session."""
        self.sess.close()

    def __call__(self, method):
        """Return the hidden state fetched for the last chunk of ``method``.

        The text is cut into pieces of at most ``MAX_CHUNK`` characters; each
        piece is encoded and run through the session in order.

        Args:
            method: Non-empty text to embed.

        Returns:
            The ``hidden_state`` array of the last chunk. A leading batch
            axis of size one is dropped; all other axes are kept, so a
            one-token chunk still yields a 2-D array.
            (assumes hidden_state is (1, tokens, dim) -- TODO confirm against model.model)

        Raises:
            ValueError: If ``method`` is empty.
        """
        if not method:
            raise ValueError("Cannot embed an empty method")

        run_options = tf.compat.v1.RunOptions(report_tensor_allocations_upon_oom=True)
        fetches = [self.output, self.past, self.hidden_state]
        past_value = None
        hidden = None

        with self.g.as_default():
            for start in range(0, len(method), self.MAX_CHUNK):
                chunk = method[start:start + self.MAX_CHUNK]
                feed = {self.context: [self.enc.encode(chunk)]}
                if past_value is not None:
                    # NOTE(review): the graph is built with past=None, so
                    # self.past is an output of the graph rather than an input
                    # placeholder. Feeding it presumably does not condition
                    # hidden_state on earlier chunks -- confirm.
                    feed[self.past] = past_value
                _, past_value, hidden = self.sess.run(fetches, feed_dict=feed, options=run_options)

        hidden = np.asarray(hidden)
        # Drop only the batch axis. A blanket np.squeeze would also remove the
        # token axis for a one-token chunk and change the rank of the result.
        if hidden.ndim == 3 and hidden.shape[0] == 1:
            hidden = hidden[0]
        return hidden

# Generate Features for all Methods

In [None]:
# Upper bound on how many method files are embedded in one run.
MAX_METHODS = 10000

embd = Embedder("/tf/src/data/gpt-2/checkpoint/run3", 1024)
path = "/tf/src/data/methods/DATA00M_[god-r]"

# os.listdir returns entries in arbitrary order; sort so the capped subset is
# the same on every run.
fnames = sorted(os.listdir(path))[:MAX_METHODS]

# Maps method source text -> feature vector. Identical method texts share one
# entry (the later file overwrites the earlier one).
features = {}
for fname in tqdm.tqdm(fnames):
    with open(os.path.join(path, fname)) as f:
        method = f.read()
    if not method:
        # Nothing to embed for an empty file.
        continue
    # Collapse the embedder output along axis 0 into a single vector.
    features[method] = np.sum(embd(method), axis=0)

In [None]:
# Persist the method -> feature-vector mapping with the newest pickle protocol.
with open('/tf/src/data/feature_space.pickle', 'wb') as f:
    pickler = pickle.Pickler(f, protocol=pickle.HIGHEST_PROTOCOL)
    pickler.dump(features)

In [None]:
import urllib.error
import urllib.request

# A Slack webhook URL is a credential: anyone holding it can post to the
# channel. Read it from the environment instead of committing it.
# NOTE(review): the URL previously hard-coded here should be revoked.
slack_webhook_url = os.environ.get("SLACK_WEBHOOK_URL")

if not slack_webhook_url:
    print("SLACK_WEBHOOK_URL is not set; skipping Slack notification")
else:
    slack_payload = json.dumps(
        {"text": "from: semeru tower 1\nstatus: model finished training"}
    ).encode("utf-8")
    slack_request = urllib.request.Request(
        slack_webhook_url,
        data=slack_payload,
        headers={"Content-type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(slack_request, timeout=10) as slack_response:
            print(slack_response.read().decode("utf-8", errors="replace"))
    except urllib.error.URLError as exc:
        # Best-effort notification: a failed post must not fail the notebook.
        print("Slack notification failed:", exc)

In [None]:
# Show each method's text together with the shape of its feature vector.
for key, feature_vector in features.items():
    print(key, feature_vector.shape)