In [1]:
%%time
import pandas as pd
# texts = clean_movie_data(data)
df = pd.read_csv("data/All-seasons.csv")

CPU times: user 320 ms, sys: 368 ms, total: 688 ms
Wall time: 317 ms


In [2]:
df.head()

Unnamed: 0,Season,Episode,Character,Line
0,10,1,Stan,"You guys, you guys! Chef is going away. \n"
1,10,1,Kyle,Going away? For how long?\n
2,10,1,Stan,Forever.\n
3,10,1,Chef,I'm sorry boys.\n
4,10,1,Stan,"Chef said he's been bored, so he joining a gro..."


In [3]:
texts = df["Line"].values

In [4]:
texts[:10]

array(['You guys, you guys! Chef is going away. \n',
       'Going away? For how long?\n', 'Forever.\n', "I'm sorry boys.\n",
       "Chef said he's been bored, so he joining a group called the Super Adventure Club. \n",
       'Wow!\n',
       'Chef?? What kind of questions do you think adventuring around the world is gonna answer?!\n',
       "What's the meaning of life? Why are we here?\n",
       "I hope you're making the right choice.\n",
       "I'm gonna miss him.  I'm gonna miss Chef and I...and I don't know how to tell him! \n"], dtype=object)

In [5]:
%%time
from hedgeable_ai.functions.preprocessing.word2index import Word2IndexProcessor

processor = Word2IndexProcessor(texts[:], is_processed=False)

Using TensorFlow backend.


CPU times: user 43min 51s, sys: 15.2 s, total: 44min 6s
Wall time: 44min 7s


In [6]:
import tensorflow as tf

In [62]:
from __future__ import print_function
import tensorflow as tf

def layer_normalization(inputs, epsilon=1e-8, scope="layer_normalization", reuse=None):
    '''Applies layer normalization.
    
    Args:
      inputs: A tensor with 2 or more dimensions, where the first dimension has
        `batch_size`.
      epsilon: A floating number. A very small number for preventing ZeroDivision Error.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.
      
    Returns:
      A tensor with the same shape and data dtype as `inputs`.
    '''
    with tf.variable_scope(scope, reuse=reuse):
        shape = inputs.get_shape().as_list()
        hidden_dim = shape[-1]
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        gamma = tf.get_variable("gamma", hidden_dim, initializer=tf.ones_initializer(tf.float32))
        beta = tf.get_variable("beta", hidden_dim, initializer=tf.zeros_initializer(tf.float32))
        normalized_inputs = (inputs - mean) / tf.sqrt(variance + epsilon)
        outputs = gamma * normalized_inputs + beta
    return outputs

def embedding(inputs,vocab_size, num_units, 
              zero_pad=True, scale=True,
              scope="embedding", reuse=None):
    '''Embeds a given tensor.
    Args:
      inputs: A `Tensor` with type `int32` or `int64` containing the ids
         to be looked up in `lookup table`.
      vocab_size: An int. Vocabulary size.
      num_units: An int. Number of embedding hidden units.
      zero_pad: A boolean. If True, all the values of the fist row (id 0)
        should be constant zeros.
      scale: A boolean. If True. the outputs is multiplied by sqrt num_units.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.
    Returns:
      A `Tensor` with one more rank than inputs's. The last dimensionality
        should be `num_units`.
        
    For example,
    
    ```
    import tensorflow as tf
    
    inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
    outputs = embedding(inputs, 6, 2, zero_pad=True)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print sess.run(outputs)
    >>
    [[[ 0.          0.        ]
      [ 0.09754146  0.67385566]
      [ 0.37864095 -0.35689294]]
     [[-1.01329422 -1.09939694]
      [ 0.7521342   0.38203377]
      [-0.04973143 -0.06210355]]]
    ```
    
    ```
    import tensorflow as tf
    
    inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
    outputs = embedding(inputs, 6, 2, zero_pad=False)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print sess.run(outputs)
    >>
    [[[-0.19172323 -0.39159766]
      [-0.43212751 -0.66207761]
      [ 1.03452027 -0.26704335]]
     [[-0.11634696 -0.35983452]
      [ 0.50208133  0.53509563]
      [ 1.22204471 -0.96587461]]]    
    ```    
    '''
    with tf.variable_scope(scope, reuse=reuse):
        lookup_table = tf.get_variable('lookup_table',
                                       dtype=tf.float32,
                                       shape=[vocab_size, num_units],
                                       initializer=tf.contrib.layers.xavier_initializer())
        if zero_pad:
            lookup_table = tf.concat((tf.zeros(shape=[1, num_units]),
                                      lookup_table[1:, :]), 0)
        outputs = tf.nn.embedding_lookup(lookup_table, inputs)
        
        if scale:
            outputs = outputs * tf.sqrt(num_units) 
            
    return outputs
    
def multihead_attention(queries, 
                        keys,
                        values,
                        num_units=None, 
                        num_heads=8, 
                        drop_rate=0,
                        training=True,
                        causality=False,
                        scope="multihead_attention", 
                        reuse=None):
    '''Applies multihead attention.
    
    Args:
      queries: A 3d tensor with shape of [N, T_q, C_q].
      keys: A 3d tensor with shape of [N, T_k, C_k].
      values: A 3d tensor with shape of [N, T_k, C_k].
      num_units: A scalar. Attention size.
      drop_rate: A floating point number.
      training: Boolean. Controller of mechanism for dropout.
      causality: Boolean. If true, units that reference the future are masked. 
      num_heads: An int. Number of heads.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.
        
    Returns
      A 3d tensor with shape of (N, T_q, C)  
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Set the fall back option for num_units
        if num_units is None:
            num_units = queries.get_shape().as_list()[-1]
        
        # Linear projections
        Q = tf.layers.dense(queries, num_units, activation=None) # (N, T_q, C)
        K = tf.layers.dense(keys, num_units, activation=None) # (N, T_k, C)
        V = tf.layers.dense(values, num_units, activation=None) # (N, T_k, C)
        
        # Split and concat
        # The size will be [N * num_heads, T_k, C_k/num_heads]
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0) # (h*N, T_q, C/h) 
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0) # (h*N, T_k, C/h) 

        # Multiplication
        alignments = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1])) # (h*N, T_q, T_k)
        
        # Scale
        d = tf.constant(K_.get_shape().as_list()[-1], tf.float32)
        alignments = tf.nn.softmax(alignments / tf.sqrt(d))
  
        # Causality = Future blinding
        if causality:
            shape = tf.shape(alignments)
            masks = tf.ones(shape[1:]) # (T_q, T_k)
            masks = tf.contrib.linalg.LinearOperatorTriL(masks).to_dense() # (T_q, T_k)
            masks = tf.tile(tf.expand_dims(masks, 0), [shape[0], 1, 1]) # (h*N, T_q, T_k)
            alignments = alignments * masks
            sum_alignments = tf.reduce_sum(alignments, -1, keep_dims=True)
            sum_alignments = tf.tile(sum_alignments, [1, 1, shape[-1]])
            alingments = alignments / sum_alignments
          
        # Dropouts
        alignments = tf.layers.dropout(alignments, rate=drop_rate, training=training)
        # Weighted sum
        outputs = tf.matmul(alignments, V_) # ( h*N, T_q, C/h)
        # Restore shape
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2) # (N, T_q, C)
        # Residual connection
        outputs += queries 
        # Normalize
        outputs = layer_normalization(outputs) # (N, T_q, C)
    return outputs

def feedforward(inputs, 
                num_units=[2048, 512],
                scope="multihead_attention", 
                reuse=None):
    '''Point-wise feed forward net.
    
    Args:
      inputs: A 3d tensor with shape of [N, T, C].
      num_units: A list of two integers.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.
        
    Returns:
      A 3d tensor with the same shape and dtype as inputs
    '''
    with tf.variable_scope(scope, reuse=reuse):
        # Inner layer
        params = {"inputs": inputs, "filters": num_units[0], "kernel_size": 1,
                  "activation": tf.nn.relu, "use_bias": True}
        outputs = tf.layers.conv1d(**params)
        
        # Readout layer
        params = {"inputs": outputs, "filters": num_units[1], "kernel_size": 1,
                  "activation": None, "use_bias": True}
        outputs = tf.layers.conv1d(**params)
        
        # Residual connection
        outputs += inputs
        
        # Normalize
        outputs = layer_normalization(outputs)
    
    return outputs

def label_smoothing(inputs, epsilon=0.1):
    '''Applies label smoothing. See https://arxiv.org/abs/1512.00567.
    
    Args:
      inputs: A 3d tensor with shape of [N, T, V], where V is the number of vocabulary.
      epsilon: Smoothing rate.
    
    For example,
    
    ```
    import tensorflow as tf
    inputs = tf.convert_to_tensor([[[0, 0, 1], 
       [0, 1, 0],
       [1, 0, 0]],
      [[1, 0, 0],
       [1, 0, 0],
       [0, 1, 0]]], tf.float32)
       
    outputs = label_smoothing(inputs)
    
    with tf.Session() as sess:
        print(sess.run([outputs]))
    
    >>
    [array([[[ 0.03333334,  0.03333334,  0.93333334],
        [ 0.03333334,  0.93333334,  0.03333334],
        [ 0.93333334,  0.03333334,  0.03333334]],
       [[ 0.93333334,  0.03333334,  0.03333334],
        [ 0.93333334,  0.03333334,  0.03333334],
        [ 0.03333334,  0.93333334,  0.03333334]]], dtype=float32)]   
    ```    
    '''
    K = inputs.get_shape().as_list()[-1] # number of channels
    return ((1-epsilon) * inputs) + (epsilon / K)

In [63]:
import tensorflow as tf
from tensorflow.contrib.layers import fully_connected, conv2d
from tensorflow.contrib.layers import conv2d_transpose, flatten

from hedgeable_ai.models.nn import BaseModel
        
class FeedForward(BaseModel):
    def __init__(self, model_params, scope_name, *args, **kwargs):
        super().__init__(model_params, scope_name, *args, **kwargs)
    
    def __call__(self, x, training=True):
        with tf.variable_scope(self.scope_name, reuse=self.reuse):
            for i, params in enumerate(self.model_params):
                with tf.variable_scope('layer_' + str(i)):
                    if "is_flatten" in params and params["is_flatten"]:
                        x = flatten(x)
                    if "drop_rate" in params:
                        x = tf.layers.dropout(x, rate=params["drop_rate"], training=training)
                    # demtermine which layer to use
                    if params["name"] == "dense":
                        x = fully_connected(x, params["num_hidden"], activation_fn=None)
                    elif params["name"] == "conv2d":
                        x =  conv2d(x, params["num_filter"], params["kernel_size"],
                                          params["stride"], params["padding"], activation_fn=None)
                    elif params["name"] == "deconv2d":
                        x =  conv2d_transpose(x, params["num_filter"], params["kernel_size"],
                                          params["stride"], params["padding"], activation_fn=None)
                    elif params["name"] == "reshape":
                        x = tf.reshape(x, (-1,) + params["reshape_size"])
                    elif params["name"] == "pooling":
                        del params["name"]
                        x = tf.nn.pool(x, **params)
                    elif params["name"] == None:
                        pass
                    else:
                        raise NotImplementedError("No implementation for 'name'={}".format(params["name"]))         
                    if "is_batch" in params and params["is_batch"]:
                        x = tf.layers.batch_normalization(x, training=training, momentum=0.9)
                    if "activation" in params:
                        x = params["activation"](x)
            if self.reuse is False:
                self.global_scope_name = tf.get_variable_scope().name
                self.variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.global_scope_name)
        self.reuse = True
        return x

In [104]:
import tensorflow as tf
import tensorflow.contrib.rnn as rnn
import tensorflow.contrib.seq2seq as seq2seq
from hedgeable_ai.models.nn import BaseModel, get_shape, get_length
import tensorflow.contrib.rnn as rnn

import tensorflow as tf

from hedgeable_ai.models.nn.params import nn_is_logit
from hedgeable_ai.models.nn import BaseNN, get_shape

from hedgeable_ai.models.nn.rnn import get_cell

count = 0
class DialogueAgent(BaseNN):
    def __init__(self, processor, emb_size=300, maxlen=20,
                 conf=None, additional_length=3, clip_val=5.0, 
                 num_hidden=512, num_heads=8, num_blocks=6, drop_rate=0.2,
                 position_scale=1000, *args, **kwargs):
        self.emb_size = emb_size
        self.clip_val = clip_val
        self.num_blocks = num_blocks
        # leave index 0 for padding and 1 for  <eos>
        self.vocab_size = processor.vocab_size + 2
        self.maxlen = maxlen
        self.additional_length = additional_length
        self.position_scale = position_scale
        self.drop_rate = drop_rate
        self.num_heads = num_heads
        self.num_hidden = num_hidden
        super().__init__(processor=processor, conf=conf, *args, **kwargs)
        
    def _build_graph(self):
        """Build tensorflow graph
        
        Note:
            You build graphs for output and input, which will be used 
            for training and prediction.
        """
        # Build Basic Netwoiork
        self.enc_input = tf.placeholder(tf.int32, shape=(None, None), name="encoder_input")
        self.dec_input = tf.placeholder(tf.int32, shape=(None, None), name="decoder_input")
        self.dec_target = tf.placeholder(tf.int32, shape=(None, None), name="decoder_target")
        self.training = tf.placeholder(tf.bool, (), name="trainig")
        enc_length = get_length(self.enc_input)
        dec_length = get_length(self.dec_target)
        tensor_batch_size = tf.shape(self.enc_input)[0]
        # Encoder
        with tf.variable_scope("encoder"):
            ## Embedding
            enc_embeddings = tf.get_variable("embedding", [self.vocab_size, self.num_hidden],
                                     initializer=tf.contrib.layers.xavier_initializer())
            enc_input_embedded = tf.nn.embedding_lookup(enc_embeddings, self.enc_input)
            ## Positional Encoding
            position_idx = tf.range(tf.reduce_max(enc_length))
            position_idx = tf.tile(tf.expand_dims(position_idx, 0), [tensor_batch_size, 1])
            enc_position_embeddings = tf.get_variable("embedding_position", [self.vocab_size, self.num_hidden],
                                     initializer=tf.contrib.layers.xavier_initializer())
            enc_position_embedded = tf.nn.embedding_lookup(enc_position_embeddings, position_idx)
            enc_x = enc_input_embedded + enc_position_embedded
            ## Dropout
            enc_x = tf.layers.dropout(enc_x, rate=self.drop_rate, training=self.training)
            # Encoder blocks    
            for i in range(self.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### Multihead Attention
                    enc_x = multihead_attention(queries=enc_x, 
                                                keys=enc_x, 
                                                values=enc_x,
                                                num_units=self.num_hidden,
                                                num_heads=self.num_heads, 
                                                drop_rate=self.drop_rate,
                                                training=self.training,
                                                causality=False,
                                                scope="self_attention")
                    ### Feed Forward
                    enc_x = feedforward(enc_x, num_units=[4*self.num_hidden, self.num_hidden])
            
        # Decoder
        with tf.variable_scope("decoder"):
            ## Embedding
            dec_embeddings = tf.get_variable("embedding", [self.vocab_size, self.num_hidden],
                                     initializer=tf.contrib.layers.xavier_initializer())
            dec_input_embedded = tf.nn.embedding_lookup(dec_embeddings, self.dec_input)
            ## Positional Encoding
            position_idx = tf.range(tf.reduce_max(dec_length))
            position_idx = tf.tile(tf.expand_dims(position_idx, 0), [tensor_batch_size, 1])
            dec_position_embeddings = tf.get_variable("embedding_position", [self.vocab_size, self.num_hidden],
                                     initializer=tf.contrib.layers.xavier_initializer())
            dec_position_embedded = tf.nn.embedding_lookup(dec_position_embeddings, position_idx)
            dec_x = dec_input_embedded + dec_position_embedded
            ## Dropout
            dec_x = tf.layers.dropout(dec_x, rate=self.drop_rate, training=self.training)
                
            ## Blocks
            for i in range(self.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### Multihead Attention
                    dec_x = multihead_attention(queries=dec_x, 
                                                keys=dec_x, 
                                                values=dec_x,
                                                num_units=self.num_hidden,
                                                num_heads=self.num_heads, 
                                                drop_rate=self.drop_rate,
                                                training=self.training,
                                                causality=True,
                                                scope="self_attention")
                        
                    dec_x = multihead_attention(queries=dec_x, 
                                                keys=enc_x, 
                                                values=enc_x,
                                                num_units=self.num_hidden,
                                                num_heads=self.num_heads, 
                                                drop_rate=self.drop_rate,
                                                training=self.training,
                                                causality=False,
                                                scope="vanilla_attention")
                    ### Feed Forward
                    dec_x = feedforward(dec_x, num_units=[4*self.num_hidden, self.num_hidden])
                
        # Final linear projection
        self.logits = tf.layers.dense(dec_x, self.vocab_size)
        self.predictions = tf.cast(tf.arg_max(self.logits, dimension=-1), tf.int32)
        target_smoothed = label_smoothing(tf.one_hot(self.dec_target, depth=self.vocab_size))
        _loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=target_smoothed)
        self.loss = tf.reduce_mean(tf.reduce_sum(_loss, [1]))
        
        self.learning_rate_op = self._get_learning_rate()
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            self.optimizer = self._get_optimizer(self.optimizer_name, self.learning_rate_op, self.optimizer_conf)
            grads_vars = self.optimizer.compute_gradients(self.loss)
            clipped_grads_vars = [
                (tf.clip_by_norm(gv[0], clip_norm=self.clip_val), gv[1]) 
                for gv in grads_vars]
            self.train_step = self.optimizer.apply_gradients(clipped_grads_vars)
        
    def _optimize(self, batch_X, batch_y, *args, **kwargs):
        global count
        batch_X = batch_X[0]
        batch_X, Xlen = self.processor.batch_padding(batch_X, self.maxlen)
        length = np.max(Xlen) + self.additional_length
        batch_y = self._batch_padding(batch_y, length)
        input_y = batch_y[:, :-1]
        target_y = batch_y[:, 1:]
        feed_dict = {self.enc_input: batch_X,
                     self.dec_input: input_y,
                     self.dec_target: target_y,
                     self.training: True}
        
        _, loss = self.sess.run([self.train_step, self.loss], feed_dict=feed_dict)
        # print("loss", loss)
        if count % 1000 == 0:
            sentences = ["how are you?", "what is your name?", "you should kill youself."]
            predictions = self.generate_sentences(sentences)
            print(predictions)
        count += 1
        return loss
    
    def generate_sentences(self, sentences):
        X = [self.processor.encode(sentence) for sentence in sentences]
        # X = sentences
        X = [x_[::-1] for x_ in X]
        X, Xlen = self.processor.batch_padding(X, self.maxlen)
        batch_size = X.shape[0]
        y = np.ones((batch_size,1), dtype=int)
        for i in range(self.maxlen):
            feed_dict = {self.enc_input: X,
                         self.dec_input: y,
                         self.dec_target:y,
                         self.training: False}
            predictions = self.sess.run(self.predictions, feed_dict=feed_dict)
            new_y = predictions[:, -1]
            y = np.concatenate((y, new_y[:, np.newaxis]), axis=1)
            finished = np.array(new_y > 1, dtype=int)
            if np.sum(finished) == 0:
                break
        return [self.processor.decode(i) for i in y]
    
    def _batch_padding(self, batch, length):
        EOS = 1
        PAD = 0
        padded_batch = []
        for x in batch:
            x = list(x)
            if len(x) > length:
                x = x[:length]
            elif len(x) < length:
                x.append(EOS)
            while len(x) < length:
                x.append(PAD)
            padded_batch.append(x)
        return np.array(padded_batch)

In [None]:
import tensorflow as tf
import numpy as np


conf = {
        "learning_rate": 1e-1,
        "learning_rate_minimum": 1e-3,
        "learning_rate_decay": 0.99,
        "learning_rate_decay_step": 1,
        "batch_size": 128,
        "model_dir": "./logs",
        "load_file_path": None,
        "save_file_path": None,
        "log_freq": 1,
        "optimizer":"gd",
        "model":[{"name":"lstm", "num_units":512},
                {"name":"lstm", "num_units":512},],
        # "model":[{"name":"lstm", "num_units":500},
        #         {"name":"lstm", "num_units":500}],
        "attention_size": 512
}

tf.reset_default_graph()
agent = DialogueAgent(processor, maxlen=20, conf=conf, additional_length=3)

train_X = processor.data[:-1]
train_y = processor.data[1:]
agent.fit(train_X, train_y, num_epochs=10000, batch_bar=False, log_freq=1, batch_log_freq=100)



  0%|          | 0/10000 [00:00<?, ?it/s]

Model saved in file: params/model.ckpt


[A[A

[[], [], []]




  0%|          | 1/10000 [03:29<582:46:32, 209.82s/it][A[A

[['.', 'the'], ['.', 'the'], ['.', 'the']]




  0%|          | 2/10000 [07:02<585:23:16, 210.78s/it][A[A

  0%|          | 3/10000 [10:36<587:51:56, 211.70s/it][A[A

[['i', 'i'], ['i', 'i'], ['i', 'i']]




  0%|          | 4/10000 [14:11<590:28:04, 212.65s/it][A[A

  0%|          | 5/10000 [17:46<592:06:03, 213.26s/it][A[A

[['is'], ['is'], ['is']]




  0%|          | 6/10000 [21:21<593:35:52, 213.82s/it][A[A

  0%|          | 7/10000 [24:56<594:32:40, 214.19s/it][A[A

[[], [], []]




  0%|          | 8/10000 [28:31<595:24:24, 214.52s/it][A[A

  0%|          | 9/10000 [32:06<595:57:40, 214.74s/it][A[A

[[], [], []]




  0%|          | 10/10000 [35:42<596:30:33, 214.96s/it][A[A

  0%|          | 11/10000 [39:17<596:48:49, 215.09s/it][A[A

[[], [], []]




  0%|          | 12/10000 [42:53<597:09:31, 215.24s/it][A[A

  0%|          | 13/10000 [46:28<597:19:42, 215.32s/it][A[A

[['i'], ['i'], ['i']]




  0%|          | 14/10000 [50:04<597:22:12, 215.35s/it][A[A

[['i'], ['i'], ['i']]




  0%|          | 15/10000 [53:40<597:43:54, 215.51s/it][A[A

  0%|          | 16/10000 [57:16<597:58:26, 215.62s/it][A[A

[['i'], ['i'], ['i']]




  0%|          | 17/10000 [1:00:52<598:19:28, 215.76s/it][A[A

  0%|          | 18/10000 [1:04:28<598:19:20, 215.78s/it][A[A

[['i'], ['i'], ['i']]




  0%|          | 19/10000 [1:08:04<598:25:49, 215.85s/it][A[A

  0%|          | 20/10000 [1:11:40<598:30:17, 215.89s/it][A[A

[['i'], ['i'], ['i']]




  0%|          | 21/10000 [1:15:16<598:33:21, 215.93s/it][A[A

  0%|          | 22/10000 [1:18:52<598:41:51, 216.01s/it][A[A

[['i'], ['i'], ['i']]




  0%|          | 23/10000 [1:22:28<598:54:43, 216.11s/it][A[A

  0%|          | 24/10000 [1:26:04<599:02:43, 216.18s/it][A[A

[['i'], ['i'], ['i']]




  0%|          | 25/10000 [1:29:41<599:01:22, 216.19s/it][A[A

  0%|          | 26/10000 [1:33:17<598:51:53, 216.15s/it][A[A

[['i'], ['i'], ['i']]




  0%|          | 27/10000 [1:36:53<599:01:58, 216.24s/it][A[A

[['i'], ['i'], ['i']]




  0%|          | 28/10000 [1:40:29<599:05:44, 216.28s/it][A[A

  0%|          | 29/10000 [1:44:06<599:12:30, 216.34s/it][A[A

[['i'], ['i'], ['i']]




  0%|          | 30/10000 [1:47:43<599:22:15, 216.42s/it][A[A

  0%|          | 31/10000 [1:51:19<599:20:59, 216.44s/it][A[A

[['i'], ['i'], ['i']]




  0%|          | 32/10000 [1:54:56<599:22:58, 216.47s/it][A[A

  0%|          | 33/10000 [1:58:32<599:22:40, 216.49s/it][A[A

[['i'], ['i'], ['i']]




  0%|          | 34/10000 [2:02:09<599:38:13, 216.61s/it][A[A

  0%|          | 35/10000 [2:05:45<599:21:54, 216.53s/it][A[A

[['i'], ['i'], ['i']]




  0%|          | 36/10000 [2:09:22<599:35:51, 216.64s/it][A[A

  0%|          | 37/10000 [2:12:59<599:39:17, 216.68s/it][A[A

[['i'], ['i'], ['i']]




  0%|          | 38/10000 [2:16:36<599:46:03, 216.74s/it][A[A

  0%|          | 39/10000 [2:20:13<599:39:04, 216.72s/it][A[A

[[], [], []]




  0%|          | 40/10000 [2:23:50<599:51:26, 216.82s/it][A[A

[[], [], []]




  0%|          | 41/10000 [2:27:26<599:38:57, 216.76s/it][A[A

  0%|          | 42/10000 [2:31:03<599:41:14, 216.80s/it][A[A

[[], [], []]




  0%|          | 43/10000 [2:34:40<599:54:53, 216.90s/it][A[A

  0%|          | 44/10000 [2:38:17<599:58:32, 216.95s/it][A[A

[[], [], []]




  0%|          | 45/10000 [2:41:55<600:11:20, 217.04s/it][A[A

  0%|          | 46/10000 [2:45:31<599:55:04, 216.97s/it][A[A

[[], [], []]




  0%|          | 47/10000 [2:49:08<599:56:29, 217.00s/it][A[A

  0%|          | 48/10000 [2:52:45<599:53:18, 217.00s/it][A[A

[[], [], []]




  0%|          | 49/10000 [2:56:23<599:52:20, 217.02s/it][A[A

  0%|          | 50/10000 [3:00:00<599:55:07, 217.06s/it][A[A

[[], [], []]




  1%|          | 51/10000 [3:03:37<599:57:09, 217.09s/it][A[A

  1%|          | 52/10000 [3:07:14<600:03:30, 217.15s/it][A[A

[[], [], []]




  1%|          | 53/10000 [3:10:52<600:10:58, 217.22s/it][A[A

  1%|          | 54/10000 [3:14:29<600:12:35, 217.25s/it][A[A

[[], [], []]




  1%|          | 55/10000 [3:18:06<600:23:51, 217.34s/it][A[A

[[], [], []]




  1%|          | 56/10000 [3:21:44<600:26:50, 217.38s/it][A[A

  1%|          | 57/10000 [3:25:21<600:20:15, 217.36s/it][A[A

[[], [], []]




  1%|          | 58/10000 [3:28:59<600:16:48, 217.36s/it][A[A

  1%|          | 59/10000 [3:32:36<600:11:03, 217.35s/it][A[A

[[], [], []]




  1%|          | 60/10000 [3:36:13<600:15:48, 217.40s/it][A[A

  1%|          | 61/10000 [3:39:51<600:04:17, 217.35s/it][A[A

[[], [], []]




  1%|          | 62/10000 [3:43:28<600:19:06, 217.46s/it][A[A

  1%|          | 63/10000 [3:47:06<600:15:04, 217.46s/it][A[A

[[], [], []]




  1%|          | 64/10000 [3:50:43<599:56:42, 217.37s/it][A[A

  1%|          | 65/10000 [3:54:21<600:05:42, 217.45s/it][A[A

[[], [], []]




  1%|          | 66/10000 [3:57:58<600:08:01, 217.48s/it][A[A

  1%|          | 67/10000 [4:01:36<599:58:30, 217.45s/it][A[A

[[], [], []]




  1%|          | 68/10000 [4:05:13<600:09:38, 217.54s/it][A[A

[[], [], []]




  1%|          | 69/10000 [4:08:51<600:03:07, 217.52s/it][A[A

  1%|          | 70/10000 [4:12:28<600:06:43, 217.56s/it][A[A

[[], [], []]




  1%|          | 71/10000 [4:16:06<600:11:57, 217.62s/it][A[A

  1%|          | 72/10000 [4:19:44<600:21:20, 217.70s/it][A[A

[[], [], []]




  1%|          | 73/10000 [4:23:22<600:22:48, 217.73s/it][A[A

  1%|          | 74/10000 [4:27:00<600:24:04, 217.76s/it][A[A

[[], [], []]




  1%|          | 75/10000 [4:30:38<600:28:34, 217.81s/it][A[A

  1%|          | 76/10000 [4:34:15<600:21:08, 217.78s/it][A[A

[[], [], []]




  1%|          | 77/10000 [4:37:53<600:25:56, 217.83s/it][A[A

  1%|          | 78/10000 [4:41:31<600:21:38, 217.83s/it][A[A

[[], [], []]




  1%|          | 79/10000 [4:45:09<600:20:51, 217.85s/it][A[A

  1%|          | 80/10000 [4:48:47<600:14:12, 217.83s/it][A[A

[[], [], []]




  1%|          | 81/10000 [4:52:24<600:04:17, 217.79s/it][A[A

[[], [], []]




  1%|          | 82/10000 [4:56:02<600:09:29, 217.84s/it][A[A

  1%|          | 83/10000 [4:59:40<600:16:18, 217.91s/it][A[A

[[], [], []]




  1%|          | 84/10000 [5:03:19<600:22:27, 217.97s/it][A[A

  1%|          | 85/10000 [5:06:57<600:26:20, 218.01s/it][A[A

[[], [], []]




  1%|          | 86/10000 [5:10:35<600:36:57, 218.10s/it][A[A

  1%|          | 87/10000 [5:14:13<600:29:36, 218.07s/it][A[A

[[], [], []]




  1%|          | 88/10000 [5:17:52<601:04:00, 218.31s/it][A[A

  1%|          | 89/10000 [5:21:30<601:00:52, 218.31s/it][A[A

[[], [], []]




  1%|          | 90/10000 [5:25:09<601:09:03, 218.38s/it][A[A

  1%|          | 91/10000 [5:28:47<600:37:50, 218.21s/it][A[A

[[], [], []]




  1%|          | 92/10000 [5:32:25<600:27:52, 218.17s/it][A[A

  1%|          | 93/10000 [5:36:03<600:25:30, 218.18s/it][A[A

[[], [], []]




  1%|          | 94/10000 [5:39:41<600:23:43, 218.19s/it][A[A

In [None]:
prediction = agent.generate_sentences(texts[:10])

In [100]:
print(prediction)

[['anyhow', 'warehouse', 'warehouse', 'warehouse', 'warehouse', 'warehouse', 'warehouse', 'warehouse', 'warehouse', 'warehouse'], ['aloha', 'aloha', 'flushing', 'flushing', 'flushing', 'flushing', 'flushing', 'aloha', 'flushing', 'aloha'], ['legitimate', 'best-known', 'best-known', 'best-known', 'best-known', 'best-known', 'best-known', 'best-known', 'best-known', 'best-known'], ['ended', 'chi-chis', 'chi-chis', 'chi-chis', 'chi-chis', 'chi-chis', 'chi-chis', 'chi-chis', 'chi-chis', 'chi-chis'], ['warehouse', 'warehouse', 'warehouse', 'warehouse', 'warehouse', 'warehouse', 'warehouse', 'warehouse', 'warehouse', 'warehouse'], ['legitimate', 'legitimate', 'legitimate', 'legitimate', 'legitimate', 'legitimate', 'legitimate', 'legitimate', 'legitimate', 'legitimate'], ['anyhow', 'anyhow', 'anyhow', 'anyhow', 'anyhow', 'anyhow', 'anyhow', 'anyhow', 'anyhow', 'anyhow'], ['warehouse', 'warehouse', 'warehouse', 'warehouse', 'warehouse', 'warehouse', 'warehouse', 'warehouse', 'warehouse', 'ware

In [75]:
processor.vocab_size

17960

In [89]:
texts[0]

'You guys, you guys! Chef is going away. \n'