In [1]:
import pandas as pd

df = pd.read_csv("data/train.csv")
df = pd.read_csv("data/train.csv")
df = df.dropna()
X1 = df["question1"].values
X2 = df["question2"].values
y = df["is_duplicate"].values
X= [X1, X2]

In [2]:
import numpy as np

def batch(inputs, max_sequence_length=None):
    """
    Args:
        inputs:
            list of sentences (integer lists)
        max_sequence_length:
            integer specifying how large should `max_time` dimension be.
            If None, maximum sequence length would be used
    
    Outputs:
        inputs_time_major:
            input sentences transformed into time-major matrix 
            (shape [max_time, batch_size]) padded with 0s
        sequence_lengths:
            batch-sized list of integers specifying amount of active 
            time steps in each input sequence
    """
    
    sequence_lengths = [len(seq) for seq in inputs]
    batch_size = len(inputs)
    
    if max_sequence_length is None:
        max_sequence_length = max(sequence_lengths)
    
    inputs_batch_major = np.zeros(shape=[batch_size, max_sequence_length], dtype=np.int32) # == PAD
    
    for i, seq in enumerate(inputs):
        for j, element in enumerate(seq):
            inputs_batch_major[i, j] = element

    # [batch_size, max_time] -> [max_time, batch_size]
    # inputs_time_major = inputs_batch_major.swapaxes(0, 1)

    return inputs_batch_major, sequence_lengths


def random_sequences(length_from, length_to,
                     vocab_lower, vocab_upper,
                     batch_size):
    """ Generates batches of random integer sequences,
        sequence length in [length_from, length_to],
        vocabulary in [vocab_lower, vocab_upper]
    """
    if length_from > length_to:
            raise ValueError('length_from > length_to')

    def random_length():
        if length_from == length_to:
            return length_from
        return np.random.randint(length_from, length_to + 1)
    
    while True:
        yield [
            np.random.randint(low=vocab_lower,
                              high=vocab_upper,
                              size=random_length()).tolist()
            for _ in range(batch_size)
        ]

In [28]:
import tensorflow as tf
import tensorflow.contrib.rnn as rnn
import tensorflow.contrib.seq2seq as seq2seq
from hedgeable_ai.models.nn import BaseModel, get_shape, get_length

import tensorflow as tf

from hedgeable_ai.models.nn.params import nn_is_logit
from hedgeable_ai.models.nn import BaseNN, get_shape

from hedgeable_ai.models.nn.rnn import get_cell


class DialogueAgent(BaseNN):
    def __init__(self, processor, conf=None, *args, **kwargs):
        self.emb_size = 300
        # add padding index 0 and index 1 for <eos>
        self.vocab_size = processor.vocab_size + 2
        super().__init__(processor=processor, conf=conf, *args, **kwargs)
        
    def _build_graph(self):
        """Build tensorflow graph
        
        Note:
            You build graphs for output and input, which will be used 
            for training and prediction.
        """
        self.encoder_input = tf.placeholder(tf.int32, shape=(None, None), name="encoder_input")
        self.encoder_input_length = tf.placeholder(shape=(None,), dtype=tf.int32, name='encoder_input_length')
        self.decoder_target = tf.placeholder(tf.int32, shape=(None, None), name="decoder_target")
        
        # Encoder
        encoder_cell = get_cell(self.conf["model_encoder"])
        embeddings = tf.Variable(tf.random_uniform([self.vocab_size, self.emb_size], -1.0, 1.0, dtype=tf.float32))
        encoder_input_embedded = tf.nn.embedding_lookup(embeddings, self.encoder_input)
        encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
            encoder_cell, encoder_input_embedded, dtype=tf.float32, 
            time_major=False, scope="encoder")
        # We do not need encoder outputs
        del encoder_outputs
        
        # Decoder
        decoder_cell = get_cell(self.conf["model_decoder"])
        batch_size, encoder_max_time = tf.unstack(tf.shape(self.encoder_input))
        decoder_output_size = self._get_output_size(self.conf["model_decoder"])
        # We use a decoder length 2 words longer than encoder
        decoder_length = self.encoder_input_length + 3
        W = tf.Variable(tf.random_uniform([decoder_output_size, self.vocab_size], -1, 1), dtype=tf.float32)
        b = tf.Variable(tf.zeros([self.vocab_size]), dtype=tf.float32)
        # Prepare for padding and EOS
        eos_time_slice = tf.ones([batch_size], dtype=tf.int32, name='EOS')
        pad_time_slice = tf.zeros([batch_size],  dtype=tf.int32, name='PAD')
        eos_step_embedded = tf.nn.embedding_lookup(embeddings, eos_time_slice)
        pad_step_embedded = tf.nn.embedding_lookup(embeddings, pad_time_slice)
        
        def loop_fn_initial():
            initial_elements_finished = (0 >= decoder_length)
            initial_input = eos_step_embedded
            initial_cell_state = encoder_final_state
            initial_cell_output  = None
            initial_loop_state = None
            return (initial_elements_finished,
                initial_input,
                initial_cell_state,
                initial_cell_output,
                initial_loop_state)
        
        def loop_fn_transition(time, previous_output,  previous_state, previous_loop_state):
            
            def get_next_input():
                output_logits = tf.add(tf.matmul(previous_output, W), b)
                prediction  = tf.argmax(output_logits, axis=1)
                next_input = tf.nn.embedding_lookup(embeddings,  prediction)
                return next_input
            
            elements_finished = (time >= decoder_length)
            finished = tf.reduce_all(elements_finished)
            input_ = tf.cond(finished, lambda: pad_step_embedded, get_next_input)
            state  = previous_state
            output = previous_output
            loop_state = None
            return (elements_finished, 
                input_,
                state,
                output,
                loop_state)
        
        def loop_fn(time, previous_output, previous_state,  previous_loop_state):
            if previous_state is None:
                assert previous_output is None and previous_state is None
                return loop_fn_initial()
            else:
                return loop_fn_transition(time, previous_output, previous_state,  previous_loop_state)
        
        decoder_outputs_ta, decoder_final_state, _ = tf.nn.raw_rnn(decoder_cell, loop_fn)
        decoder_outputs = decoder_outputs_ta.stack()
        decoder_max_steps, decoder_batch_size, decoder_dim = tf.unstack(tf.shape(decoder_outputs))
        decoder_outputs_flat = tf.reshape(decoder_outputs, (-1, decoder_output_size))
        decoder_logits_flat = tf.add(tf.matmul(decoder_outputs_flat, W), b)
        decoder_logits = tf.reshape(decoder_logits_flat, (decoder_max_steps, decoder_batch_size, self.vocab_size))
        decoder_logits = tf.transpose(decoder_logits, [1, 0, 2])
        self.decoder_prediction = tf.argmax(decoder_logits, 2)
        
        # Optimization
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.one_hot(self.decoder_target, depth=self.vocab_size, dtype=tf.float32),
            logits=decoder_logits)
        self.loss = tf.reduce_mean(cross_entropy)
        self.learning_rate_op = self._get_learning_rate()
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            self.optimizer = self._get_optimizer(self.optimizer_name, self.learning_rate_op, self.optimizer_conf)
            self.train_step = self.optimizer.minimize(self.loss)
        
    def _optimize(self, batch_X, batch_y, *args, **kwargs):
        batch_X, Xlen = batch(batch_X[0])
        length = np.max(Xlen) + 3
        batch_y = self._batch_padding(batch_y, length)
        feed_dict = {self.encoder_input: batch_X,
                     self.decoder_target: batch_y,
                     self.encoder_input_length: Xlen,
                     self.training: True}
        _, loss = self.sess.run([self.train_step, self.loss], feed_dict=feed_dict)
        return loss
    
    def _get_output_size(self, conf):
        if isinstance(conf, list) or isinstance(conf, tuple):
            x = conf[-1]
        else:
            x = conf
        return x["num_units"]
    
    def generate_sentences(self, sentences):
        words = [nltk.word_tokenize(sentence) for sentence in sentences]
        X = [self.processor.encode(word) for word in words]
        X, Xlen = batch(X)
        feed_dict = {self.encoder_input: X,
                     self.encoder_input_length: Xlen,
                     self.training: False}
        word_idx = self.sess.run(self.decoder_prediction, feed_dict=feed_dict)
        return [self.processor.decode(i) for i in word_idx]
    
    def _batch_padding(self, batch, length):
        EOS = 1
        PAD = 0
        padded_batch = []
        for x in batch:
            x = list(x)
            if len(x) < length:
                x.append(EOS)
            while len(x) < length:
                x.append(PAD)
            padded_batch.append(x)
        return np.array(padded_batch)

In [33]:
from sklearn import preprocessing
import nltk

class BasicProcessor(object):
    """Process data for estimators."""
        
    def batch_process(self, X, y=None):
        """Make sure to have numpy data for input and target"""
        if y is None:
            return np.array(X)
        else:
            return np.array(X), np.array(y)

    def batch_process_y(self, y):
        return np.array(y)
    
class Word2IndexProcessor(BasicProcessor):
    def __init__(self,  texts):
        _texts = []
        lengths = []
        for text in texts:
            words = nltk.word_tokenize(text)
            _texts.extend(words)
            lengths.append(len(words))
        lengths = list(np.cumsum(lengths))
        lengths.insert(0, 0)
        self.encoder = preprocessing.LabelEncoder()
        indices = self.encoder.fit_transform(_texts)
        # split to sentences
        self.data = np.array([indices[lengths[i]:lengths[i+1]] for i in range(len(lengths) - 1)])
        
    def encode(self, text):
        text_ = [x for x in text if x in self.encoder.classes_]
        # text_ = text.split()
        return self.encoder.transform(text_) + 2
    
    def decode(self, index):
        return [self.encoder.inverse_transform(i-2) for i in index if i >=2]    
    
    def batch_process(self, X, y=None):
        X, xlen = batch(X)
        if y is None:
            return np.array(X)
        else:
            y, ylen = batch(y)
            return np.array(X), np.array(y)
        
    def batch_process_y(self, y):
        y, ylen = batch(y)
        return np.array(y)
            
    
    def batch_process_test(self, X, y=None):
        if y is None:
            return np.array([self.encode(x_i) for x_i in X])
        else:
            return np.array([self.encode(x_i) for x_i in X]), np.array(y)
        
    @property
    def vocab_size(self):
        return len(self.encoder.classes_)

In [34]:
%%time

processor = Word2IndexProcessor(X[0][:100])

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 13.7 ms


In [35]:
import tensorflow as tf

conf = {
        "learning_rate": 1e-4,
        "learning_rate_minimum": 1e-5,
        "learning_rate_decay": 0.9,
        "learning_rate_decay_step": 10,
        "batch_size": 128,
        "model_dir": "./logs",
        "load_file_path": None,
        "save_file_path": None,
        "log_freq": 1,
        "model_encoder":{"name":"lnlstm", "num_units":100},
        "model_decoder":{"name":"lnlstm", "num_units":100},
        
}

tf.reset_default_graph()
agent = DialogueAgent(processor, conf=conf)
train_X = processor.data[:50]
train_y = processor.data[50:100]
agent.fit(train_X, train_y, num_epochs=5)

  0%|          | 0/5 [00:00<?, ?it/s]

Model saved in file: params/model.ckpt


100%|██████████| 5/5 [00:01<00:00,  2.56it/s]


Model saved in file: params/model.ckpt


In [36]:
agent.generate_sentences(X[0][:10])

[['play',
  'political',
  'play',
  'abstract',
  'political',
  'political',
  's',
  'questions',
  'Child',
  'review',
  'Germany',
  'than',
  'guide',
  'guide',
  'status',
  'ring',
  'effects',
  'States'],
 ['email',
  'hack',
  'reply',
  'immunity',
  'lightning',
  'F1',
  'hack',
  'States',
  'not',
  'not',
  'aircraft',
  'aircraft',
  'States',
  'kickass'],
 ['immunity',
  'F1',
  'hack',
  'lightning',
  'tips',
  'eaten',
  'nose',
  'share',
  'share',
  'horcrux',
  'jealous',
  'Maul',
  '60k',
  'my',
  'dance',
  '60k',
  'back',
  'log'],
 ['email',
  'hack',
  'remember',
  'immunity',
  'the',
  'not',
  'not',
  'Rohingya',
  'not',
  'people',
  'master',
  'master',
  'master',
  'master',
  'trading',
  'green'],
 ['play',
  'the',
  'f-14',
  'immunity',
  'salary',
  'better',
  'manipulation',
  'earthquake',
  'after',
  'How',
  'Warrior',
  'Warrior',
  'buy',
  'confirmation',
  'experience',
  'manager',
  'oxide',
  'Ray',
  'abstract'],
 ['a'

In [20]:
train_X[:10]

array([ array([133, 331, 513, 497, 194, 497, 300, 524, 329, 319, 474, 372, 319,
       322,  24]),
       array([133, 331, 513, 500, 400,  82,   5,  81,   6,  50,  24]),
       array([ 67, 197,  68, 321, 513, 490, 400, 387, 326, 219, 553, 536, 143,
       129,  24]),
       array([138, 154,  68, 378, 537, 354,  24,  67, 197,  68, 483, 333,  24]),
       array([136, 406, 239, 319, 544, 440, 506,   7, 469,   7, 380, 156, 200,
       233, 411,  24]),
       array([ 28,  22,  68, 154, 143,  40, 117,  39, 383, 156, 199, 462,  10,
       551, 241, 512, 471, 144, 375,  24]),
       array([112,  68, 193, 521,  24]),
       array([ 67, 197,  68, 174, 143, 296, 289,  24]),
       array([134, 240, 565, 534, 569, 323, 400, 568,  24]),
       array([ 91,   5, 216,   6,  22,  37,  68, 303, 387,  42,  92,  47,  24])], dtype=object)

In [32]:
import nltk

sentence = nltk.word_tokenize(X[0][0])

In [33]:
sentence

['What',
 'is',
 'the',
 'step',
 'by',
 'step',
 'guide',
 'to',
 'invest',
 'in',
 'share',
 'market',
 'in',
 'india',
 '?']

In [62]:
%%time
index = processor.batch_process(processor.data[:10])

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 263 µs


In [64]:
index.shape

(10, 21)

In [53]:
xt, xlen = batch(index)

In [55]:
xlen

[16, 12, 16, 14, 17, 21, 6, 9, 10, 14]

In [2]:
import tensorflow.contrib.seq2seq as seq2seq

In [3]:
seq2seq.basic_rnn_seq2seq

AttributeError: module 'tensorflow.contrib.seq2seq' has no attribute 'basic_rnn_seq2seq'

In [11]:
a = list(range(10))
a.insert(0, 0)

In [12]:
a

[0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]