In [1]:
import pandas as pd

df = pd.read_csv("data/train.csv")
df = pd.read_csv("data/train.csv")
df = df.dropna()
X1 = df["question1"].values
X2 = df["question2"].values
y = df["is_duplicate"].values
X= [X1, X2]

In [3]:
import numpy as np

def batch(inputs, max_sequence_length=None):
    """
    Args:
        inputs:
            list of sentences (integer lists)
        max_sequence_length:
            integer specifying how large should `max_time` dimension be.
            If None, maximum sequence length would be used
    
    Outputs:
        inputs_time_major:
            input sentences transformed into time-major matrix 
            (shape [max_time, batch_size]) padded with 0s
        sequence_lengths:
            batch-sized list of integers specifying amount of active 
            time steps in each input sequence
    """
    
    sequence_lengths = [len(seq) for seq in inputs]
    batch_size = len(inputs)
    
    if max_sequence_length is None:
        max_sequence_length = max(sequence_lengths)
    
    inputs_batch_major = np.zeros(shape=[batch_size, max_sequence_length], dtype=np.int32) # == PAD
    
    for i, seq in enumerate(inputs):
        for j, element in enumerate(seq):
            inputs_batch_major[i, j] = element

    # [batch_size, max_time] -> [max_time, batch_size]
    # inputs_time_major = inputs_batch_major.swapaxes(0, 1)

    return inputs_batch_major, sequence_lengths


def random_sequences(length_from, length_to,
                     vocab_lower, vocab_upper,
                     batch_size):
    """ Generates batches of random integer sequences,
        sequence length in [length_from, length_to],
        vocabulary in [vocab_lower, vocab_upper]
    """
    if length_from > length_to:
            raise ValueError('length_from > length_to')

    def random_length():
        if length_from == length_to:
            return length_from
        return np.random.randint(length_from, length_to + 1)
    
    while True:
        yield [
            np.random.randint(low=vocab_lower,
                              high=vocab_upper,
                              size=random_length()).tolist()
            for _ in range(batch_size)
        ]

In [37]:
import tensorflow as tf
import tensorflow.contrib.rnn as rnn
import tensorflow.contrib.seq2seq as seq2seq
from hedgeable_ai.models.nn import BaseModel, get_shape, get_length

import tensorflow as tf

from hedgeable_ai.models.nn.params import nn_is_logit
from hedgeable_ai.models.nn import MultiNN, get_shape



class DialogueAgent(MultiNN):
    def __init__(self, processor, *args, **kwargs):
        self.emb_size = 300
        self.vocab_size = processor.vocab_size
        super().__init__(processor=processor, *args, **kwargs)
   
        
    def _build_graph(self):
        """Build tensorflow graph
        
        Note:
            You build graphs for output and input, which will be used 
            for training and prediction.
        """
        self.encoder_input = tf.placeholder(tf.int32, shape=(None, None), name="encoder_input")
        self.decoder_input = tf.placeholder(tf.int32, shape=(None, None), name="decoder_input")
        self.target = tf.placeholder(tf.int32, shape=(None, None), name="target")
        word_embeddings = tf.get_variable("word_embeddings", [self.vocab_size, self.emb_size])
        embedded_encoder = tf.gather(word_embeddings, self.encoder_input)
        embedded_decoder = tf.gather(word_embeddings, self.decoder_input)
        #  Build Seq2Seq Model
        cell = [rnn.LSTMCell(512) for _ in range(3)]
        cell = rnn.MultiRNNCell(cell)
        
        # Embedding
        embeddings = tf.Variable(tf.random_uniform([self.vocab_size, self.emb_size], -1.0, 1.0, dtype=tf.float32))
        encoder_input_embedded = tf.nn.embedding_lookup(embeddings, self.encoder_input)
        decoder_input_embedded = tf.nn.embedding_lookup(embeddings, self.decoder_input)
        encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
            cell, encoder_input_embedded, dtype=tf.float32, 
            time_major=False, scope="encoder")
        # We do not need encoder outputs
        del encoder_outputs
        
        # Decoder
        decoder_outputs, decoder_final_state = tf.nn.dynamic_rnn(
            cell, decoder_input_embedded, initial_state=encoder_final_state,
            dtype=tf.float32, time_major=False, scope="decoder")
        decoder_logits = tf.contrib.layers.fully_connected(
            decoder_outputs, self.vocab_size, activation_fn=None)
        self.decoder_prediction = tf.argmax(decoder_logits, 2)
        
        # Optimization
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.one_hot(self.target, depth=self.vocab_size, dtype=tf.float32),
            logits=decoder_logits)
        self.loss = tf.reduce_mean(cross_entropy)
        self.learning_rate_op = self._get_learning_rate()
        self.train_step =\
            tf.train.AdamOptimizer(self.learning_rate_op).minimize(self.loss)
        # Build tensorboad graph
        with tf.name_scope("summary"):
            self._build_summaries()
        
        # initialize graph
        self.sess.run(tf.global_variables_initializer())
        
    def _optimize(self, batch_X, batch_y, *args, **kwargs):
        feed_dict={
                self.encoder_input: batch_X[0],
                self.decoder_input: batch_X[1],
                self.training: True,
                self.target: batch_y}
        _, loss = self.sess.run([self.train_step, self.loss], feed_dict=feed_dict)
        return loss

In [38]:
from sklearn import preprocessing

class BasicProcessor(object):
    """Process data for estimators."""
        
    def batch_process(self, X, y=None):
        """Make sure to have numpy data for input and target"""
        if y is None:
            return np.array(X)
        else:
            return np.array(X), np.array(y)

    def batch_process_y(self, y):
        return np.array(y)
    
class Word2IndexProcessor(BasicProcessor):
    def __init__(self,  texts):
        _texts = []
        for text in texts:
            _texts.extend(text.split())
        self.encoder = preprocessing.LabelEncoder()
        self.encoder.fit(_texts)
        
    def _encode(self, text):
        # take out 0 for unknown or blank word
        return self.encoder.transform(text.split()) + 1
    
    def _decode(self, index):
        return [self.encoder.inverse_transform(i-1) for i in index]    
    
    def batch_process(self, X, y=None):
        if y is None:
            return np.array([self._encode(x_i) for x_i in X])
        else:
            return np.array([self._encode(x_i) for x_i in X]), np.array(y)
        
    @property
    def vocab_size(self):
        return len(self.encoder.classes_)

In [32]:
%%time

processor = Word2IndexProcessor(X[0])

CPU times: user 14.4 s, sys: 15.3 s, total: 29.6 s
Wall time: 37 s


In [None]:
tf.reset_default_graph()
agent = DialogueAgent(processor)

In [26]:
processor.num_vocab

163360

In [19]:
le.classes_

array(['What', 'by', 'guide', 'in', 'india?', 'invest', 'is', 'market',
       'share', 'step', 'the', 'to'],
      dtype='<U6')

In [13]:
tf.reset_default_graph()

agent = DialogueAgent(20)

AttributeError: 'BasicDecoderOutput' object has no attribute 'get_shape'

In [None]:
tf.ones_like

In [2]:
import tensorflow.contrib.seq2seq as seq2seq

In [3]:
seq2seq.basic_rnn_seq2seq

AttributeError: module 'tensorflow.contrib.seq2seq' has no attribute 'basic_rnn_seq2seq'