In [1]:
import pandas as pd

df = pd.read_csv("data/train.csv")
df = pd.read_csv("data/train.csv")
df = df.dropna()
X1 = df["question1"].values
X2 = df["question2"].values
y = df["is_duplicate"].values
X= [X1, X2]

In [2]:
import numpy as np

def batch(inputs, max_sequence_length=None):
    """
    Args:
        inputs:
            list of sentences (integer lists)
        max_sequence_length:
            integer specifying how large should `max_time` dimension be.
            If None, maximum sequence length would be used
    
    Outputs:
        inputs_time_major:
            input sentences transformed into time-major matrix 
            (shape [max_time, batch_size]) padded with 0s
        sequence_lengths:
            batch-sized list of integers specifying amount of active 
            time steps in each input sequence
    """
    
    sequence_lengths = [len(seq) for seq in inputs]
    batch_size = len(inputs)
    
    if max_sequence_length is None:
        max_sequence_length = max(sequence_lengths)
    
    inputs_batch_major = np.zeros(shape=[batch_size, max_sequence_length], dtype=np.int32) # == PAD
    
    for i, seq in enumerate(inputs):
        for j, element in enumerate(seq):
            inputs_batch_major[i, j] = element

    # [batch_size, max_time] -> [max_time, batch_size]
    # inputs_time_major = inputs_batch_major.swapaxes(0, 1)

    return inputs_batch_major, sequence_lengths


def random_sequences(length_from, length_to,
                     vocab_lower, vocab_upper,
                     batch_size):
    """ Generates batches of random integer sequences,
        sequence length in [length_from, length_to],
        vocabulary in [vocab_lower, vocab_upper]
    """
    if length_from > length_to:
            raise ValueError('length_from > length_to')

    def random_length():
        if length_from == length_to:
            return length_from
        return np.random.randint(length_from, length_to + 1)
    
    while True:
        yield [
            np.random.randint(low=vocab_lower,
                              high=vocab_upper,
                              size=random_length()).tolist()
            for _ in range(batch_size)
        ]

In [8]:
import tensorflow as tf
import tensorflow.contrib.rnn as rnn
import tensorflow.contrib.seq2seq as seq2seq
from hedgeable_ai.models.nn import BaseModel, get_shape, get_length

import tensorflow as tf

from hedgeable_ai.models.nn.params import nn_is_logit
from hedgeable_ai.models.nn import MultiNN, get_shape



class DialogueAgent(MultiNN):
    def __init__(self, processor, *args, **kwargs):
        self.emb_size = 300
        # add padding index 0
        self.vocab_size = processor.vocab_size + 1
        super().__init__(processor=processor, *args, **kwargs)
   
        
    def _build_graph(self):
        """Build tensorflow graph
        
        Note:
            You build graphs for output and input, which will be used 
            for training and prediction.
        """
        self.encoder_input = tf.placeholder(tf.int32, shape=(None, None), name="encoder_input")
        self.encoder_input_length = tf.placeholder(shape=(None,), dtype=tf.int32, name='encoder_input_length')
        self.decoder_input = tf.placeholder(tf.int32, shape=(None, None), name="decoder_input")
        self.target = tf.placeholder(tf.int32, shape=(None, None), name="target")
        word_embeddings = tf.get_variable("word_embeddings", [self.vocab_size, self.emb_size])
        embedded_encoder = tf.gather(word_embeddings, self.encoder_input)
        embedded_decoder = tf.gather(word_embeddings, self.decoder_input)
        #  Build Seq2Seq Model
        cell = [rnn.LSTMCell(512) for _ in range(3)]
        cell = rnn.MultiRNNCell(cell)
        
        # Embedding
        embeddings = tf.Variable(tf.random_uniform([self.vocab_size, self.emb_size], -1.0, 1.0, dtype=tf.float32))
        encoder_input_embedded = tf.nn.embedding_lookup(embeddings, self.encoder_input)
        decoder_input_embedded = tf.nn.embedding_lookup(embeddings, self.decoder_input)
        encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
            cell, encoder_input_embedded, dtype=tf.float32, 
            time_major=False, scope="encoder")
        # We do not need encoder outputs
        del encoder_outputs
        
        # Decoder
        batch_size, encoder_max_time = tf.unstack(tf.shape(self.encoder_input))
        decoder_length = self.encoder_input_length + 3
        W = tf.Variable(tf.random_uniform([512, self.vocab_size], -1, 1), dtype=tf.float32)
        b = tf.Variable(tf.zeros([self.vocab_size]), dtype=tf.ffloat32)
        eos_time_slice = tf.ones([batch_size], dtype=tf.int32, name='EOS')
        pad_time_slice = tf.zeros([batch_size],  dtype=tf.int32, name='PAD')
        
        eos_step_embedded = tf.nn.embedding_lookup(embeddings, eos_time_slice)
        pad_step_embedded = tf.nn.embedding_lookup(embeddings, pad_time_slice)
        
        def loop_fn_initial():
            initial_elements_finished = (0 >= decoder_length)
            initial_input = eos_step_embedded
            initial_cell_state = encoder_final_state
            initial_cell_output  = None
            initial_loop_state = None
            return (initial_elements_finished,
                initial_input,
                initial_cell_state,
                initial_cell_output,
                initial_loop_state)
        
        def loop_fn_transition(time, previous_output,  previous_state, previous_loop_state):
            
            def get_next_input():
                output_logits - tf.add(tf.matmul(previous_output, W), b)
                prediction  = tf.argmax(output_logits,  axis=1)
                next_input = tf.nn.embedding_lookup(embeddings,  prediction)
                return next_input
            
            elements_finished = (time >= decoder_length)
            
            finished = tf.reduce_all(elements_finished)
            input_ = tf.cond(finished, lambda pad_step_embedded, get_next_input)
            state  = previous_state
            output =  previous_output
            loop_state = None
            return (elements_finished, 
                input_,
                state,
                output,
                loop_state)
        finished, next_input, next_cell_state, emit_output, next_loop_state
        
        def loop_fn(time, previous_output, previous_state,  previous_loop_state):
            if previous_state is None:
                assert previous_output is None and previous_state is None
                return loop_fn_initial()
            else:
                return loop_fn_transition(time, previous_output, previous_state,  previous_loop_state)
            
        
        decoder_outputs_ta, decoder_final_state = tf.nn.raw_rnn(cell, loop_fn)
        decoder_outputs = decoder_outputs_ta.stack()
        decoder_logits = tf.contrib.layers.fully_connected(
            decoder_outputs, self.vocab_size, activation_fn=None)
        decoder_max_steps, decoder_batch_size, decoder_dim = tf.unstack(tf.shape(decoder_outputs))
        decoder_outputs_flat = tf.reshape(decoder_outputs, (-1, decoder_dim))
        decoder_logits_flat = tf.add(tf.matmul(decoder_outputs_flat, W), b)
        decoder_logits = tf.reshape(decoder_logits_flat, (decoder_max_steps, decoder_batch_size, vocab_size))
        self.decoder_prediction = tf.argmax(decoder_logits, 2)
        
        # Optimization
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.one_hot(self.target, depth=self.vocab_size, dtype=tf.float32),
            logits=decoder_logits)
        self.loss = tf.reduce_mean(cross_entropy)
        self.learning_rate_op = self._get_learning_rate()
        self.train_step =\
            tf.train.AdamOptimizer(self.learning_rate_op).minimize(self.loss)
        # Build tensorboad graph
        with tf.name_scope("summary"):
            self._build_summaries()
        
        # initialize graph
        self.sess.run(tf.global_variables_initializer())
        
    def _optimize(self, batch_X, batch_y, *args, **kwargs):
        feed_dict={
                self.encoder_input: batch_X[0],
                self.decoder_input: batch_X[1],
                self.training: True,
                self.target: batch_y}
        _, loss = self.sess.run([self.train_step, self.loss], feed_dict=feed_dict)
        return loss

In [9]:
from sklearn import preprocessing
import nltk

class BasicProcessor(object):
    """Process data for estimators."""
        
    def batch_process(self, X, y=None):
        """Make sure to have numpy data for input and target"""
        if y is None:
            return np.array(X)
        else:
            return np.array(X), np.array(y)

    def batch_process_y(self, y):
        return np.array(y)
    
class Word2IndexProcessor(BasicProcessor):
    def __init__(self,  texts):
        _texts = []
        for text in texts:
            words = nltk.word_tokenize(text)
            words.append("<eos>")
            _texts.extend(words)
        self.encoder = preprocessing.LabelEncoder()
        indices = self.encoder.fit_transform(_texts)
        eos_idx = self.encoder.transform(["<eos>"])[0]
        # split to sentences
        end_idx = [0,]
        for i, idx in enumerate(indices):
            if idx == eos_idx:
                end_idx.append(i+1)
        self.data = np.array([indices[end_idx[i]:end_idx[i+1]] for i in range(len(end_idx) - 1)])
        
    def _encode(self, text):
        text_ = [x for x in text if x in self.encoder.classes_]
        # text_ = text.split()
        return self.encoder.transform(text_) + 1
    
    def _decode(self, index):
        return [self.encoder.inverse_transform(i-1) for i in index]    
    
    def batch_process(self, X, y=None):
        X, xlen = batch(X)
        if y is None:
            return np.array(X)
        else:
            y, ylen = batch(y)
            return np.array(X), np.array(y)
        
    def batch_process_y(self, y):
        y, ylen = batch(y)
        return np.array(y)
            
    
    def batch_process_test(self, X, y=None):
        if y is None:
            return np.array([self._encode(x_i) for x_i in X])
        else:
            return np.array([self._encode(x_i) for x_i in X]), np.array(y)
        
    @property
    def vocab_size(self):
        return len(self.encoder.classes_)

In [13]:
%%time

processor = Word2IndexProcessor(X[0][:10000])

CPU times: user 1.02 s, sys: 8 ms, total: 1.02 s
Wall time: 1.01 s


In [15]:
import tensorflow as tf

conf = {
        "learning_rate": 1e-4,
        "learning_rate_minimum": 1e-5,
        "learning_rate_decay": 0.9,
        "learning_rate_decay_step": 10,
        "batch_size": 128,
        "model_dir": "./logs",
        "load_file_path": None,
        "save_file_path": None,
        "log_freq": 1,
        "model":[{"name": "conv2d", "kernel_size":(5, 1), "num_filter":32, "stride":(2, 1),
             "padding": 'valid', "is_batch":True, 'activation': tf.nn.relu},
            {"name": "conv2d", "kernel_size":(3, 1), "num_filter":64, "stride":(2, 1),
             "padding": 'valid', "is_batch":True, 'activation': tf.nn.relu},
                 {"name": "conv2d", "kernel_size":(3, 1), "num_filter":64, "stride":(2, 1),
             "padding": 'valid', "is_batch":True, 'activation': tf.nn.relu},
            {"name": "dense", "is_flatten":True, "is_batch":True, "num_hidden": 128, 'activation': tf.nn.relu},
        ],
        
}

tf.reset_default_graph()
agent = DialogueAgent(processor, conf=conf)
train_X = [processor.data[:50], processor.data[50:100]]
train_y = processor.data[50:100]
agent.fit(train_X, train_y)

  0%|          | 0/100 [00:00<?, ?it/s]

Model saved in file: params/model.ckpt


100%|██████████| 100/100 [00:15<00:00,  6.65it/s]


Model saved in file: params/model.ckpt


In [8]:
hasattr(processor, "score")

False

In [69]:
np.array(train_X[0])[[1, 3, 5]]

array([array([134, 332, 514, 501, 401,  83,   5,  82,   6,  51,  25,  24]),
       array([139, 155,  69, 379, 538, 355,  25,  68, 198,  69, 484, 334,  25,  24]),
       array([ 29,  22,  69, 155, 144,  41, 118,  40, 384, 157, 200, 463,  10,
       552, 242, 513, 472, 145, 376,  25,  24])], dtype=object)

In [32]:
import nltk

sentence = nltk.word_tokenize(X[0][0])

In [33]:
sentence

['What',
 'is',
 'the',
 'step',
 'by',
 'step',
 'guide',
 'to',
 'invest',
 'in',
 'share',
 'market',
 'in',
 'india',
 '?']

In [62]:
%%time
index = processor.batch_process(processor.data[:10])

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 263 µs


In [64]:
index.shape

(10, 21)

In [53]:
xt, xlen = batch(index)

In [55]:
xlen

[16, 12, 16, 14, 17, 21, 6, 9, 10, 14]

In [2]:
import tensorflow.contrib.seq2seq as seq2seq

In [3]:
seq2seq.basic_rnn_seq2seq

AttributeError: module 'tensorflow.contrib.seq2seq' has no attribute 'basic_rnn_seq2seq'