In [1]:
from hedgeable_ai.functions.preprocessing import SentenceProcessor

In [2]:
processor = SentenceProcessor(None, "d2v", vector_file_path="data/enwiki_dbow/doc2vec.bin")

In [74]:
import pandas as pd

df = pd.read_csv("data/train.csv")
df = df.dropna()
X1 = df["question1"].values
X2 = df["question2"].values
y = df["is_duplicate"].values
X= [X1, X2]

In [53]:
import tensorflow as tf
import numpy as np
from tqdm import tqdm
from logging import getLogger


from hedgeable_ai.models.nn import NNTrainMixin
from hedgeable_ai.models.nn.utils import generator

overwrite = True
save_file_path = None

logger = getLogger(__name__)

class MultiNNTrainMixin(NNTrainMixin):
    def train(self, train_X, train_y=None, valid_X=None, valid_y=None, num_epochs=100, 
              valid_freq=1, log_freq=1, is_processed=False, 
              overwrite=overwrite, save_file_path=save_file_path, *args, **kwargs):
        """Train neural network model
        
        Args:
            train_X, train_y: list(array-like), this is preprocessed by 
                self.processor.batch_process
            valid_X, valid_y: array-like(optional), if they are feeded, they
                will be used as validation set
            is_processed: bool, if True, skip preprocessing
            args, kwargs: parameters for score function
        """
        # check if there is already the same name file
        self.save_params(self.save_file_path, overwrite)
        self.is_trained = True
        num_train = len(train_X[0])
        # Each batch is a generator that returns list of data
        train_batch_X, train_batch_y = self._get_batch(train_X, train_y, is_processed)
        valid_batch_X, valid_batch_y = self._get_batch(valid_X, valid_y, is_processed)
        logger.debug("start training!")
        try:
            for i in tqdm(range(num_epochs)):
                self.sess.run(self.update_step_op);
                epoch_loss = []
                for batch_i in range(num_train//self.batch_size + 1):
                    batch_X = next(train_batch_X)
                    if train_batch_y is not None:
                        batch_y = next(train_batch_y)[0]
                    else:
                        batch_y = None
                    batch_loss = self._optimize(batch_X, batch_y, num_data=num_train)
                    epoch_loss.append(batch_loss)
                step = self.global_step.eval(session=self.sess)
                if step % log_freq == 0:
                    lr_val = self.learning_rate_op.eval(session=self.sess)
                    tag_dict = {'loss': np.mean(epoch_loss), "learning_rate":lr_val}
                    self._inject_summary(tag_dict)
                self._epoch_func(X=train_X, y=train_y, is_processed=is_processed)
                accuracies = []
                if valid_y is not None:
                    num_valid = len(valid_y)
                    # check accuracy every print_freq epoch
                    if step % valid_freq == 0:
                        accuracies = []
                        logits_list = []
                        for batch_i in range(num_valid//self.batch_size + 1):
                            batch_X = next(valid_batch_X)
                            batch_y = next(valid_batch_y)[0]
                            _score = self.score(batch_X, batch_y, is_training=False,
                                                is_processed=True, *args, **kwargs)
                            accuracies.append(_score)
                        accuracy = np.mean(accuracies)
                        print("accuracy: ", accuracy)
        except KeyboardInterrupt:
            logger.debug("Save model parameters before finishing training...")
        finally:
            self.save_params(save_file_path, overwrite=True)
        logger.debug("finished training")
        
    def _optimize(self, batch_X, batch_y, *args, **kwargs):
        feed_dict={
            self.target: batch_y,
            self.training: True
        }
        for input_, X in zip(self.input, batch_X):
            feed_dict[input_] = X
        _, loss = self.sess.run([self.train_step, self.loss], feed_dict=feed_dict)
        return loss
        
    def _get_batch(self, X, y=None, is_mulinput=False, is_processed=False):
        if X is not None:
            # preprocess data
            if not is_processed:
                X = [self.processor.batch_process(X_i) for X_i in X]
                if y is not None:
                    y = self.processor.batch_process_y(y)
            batch_X = generator(X, self.batch_size)
            if y is not None:
                batch_y = generator([y], self.batch_size)
            else:
                batch_y = None
        else:
            batch_X = None
            batch_y = None
        return batch_X, batch_y

In [54]:
from hedgeable_ai.models.nn import BaseNN

is_training=True
is_processed=False

class MultiNN(MultiNNTrainMixin, BaseNN):
    def __init__(self, input_dim, output_dim, input_model, output_model, conf=None,
                 sess=None, default_conf=None, *args, **kwargs):
        super().__init__(input_dim, output_dim, output_model, conf,
                 sess, default_conf, *args, **kwargs)
        
    def _calc_output(self, X, is_training=is_training, is_processed=is_processed):
        """Return model output
        
        Returns:
            list(array-like)
        """
        if not self.is_trained:
            raise NotTrainedError("Train model by 'train' beforehand")
        if not is_processed:
            X = self.processor.batch_process(X)
        num_data = len(X)
        output_list = []
        for batch_i in range(num_data // self.batch_size + 1):
            feed_dict = {self.training: is_training}
            for _input, _X in zip(self.input, X_):
                batch_X = _X[self.batch_size * batch_i : self.batch_size * (batch_i+1)]
                feed_dict[_input] = batch_X
            output_list.extend(self.sess.run(self.output, feed_dict=feed_dict))
        return np.array(output_list)

In [58]:
import tensorflow as tf

from hedgeable_ai.models.nn.params import nn_is_logit
from hedgeable_ai.models.nn import BaseNN, get_shape
from hedgeable_ai.functions.classification import ClassifierMixin


class MultiNNClassifier(MultiNN, ClassifierMixin):
    """Classifier based on neural network"""
    
    def __init__(self, input_dim, output_dim, input_model, output_model, conf,
                 is_logit=nn_is_logit, *args, **kwargs):
        """Initialize classifier with nerual network model
        
        Args:
            model: class, neural network class
            is_logit: bool(optional), if False, output of an estimator
                as prediction directly
            args, kwargs: parameters for parents class
        """
        self.is_logit = is_logit
        self.input_model = []
        for i, model in enumerate(input_model):
            self.input_model.append(model(None, conf["input_model"][i], "input_model_%d" % i))
        self.output_model = output_model(output_dim, conf["model"], "output_model")
        super().__init__(input_dim, output_dim, input_model=input_model,
                         output_model=output_model, conf=conf, *args, **kwargs)
        
        
    def _build_graph(self):
        """Build tensorflow graph
        
        Note:
            You build graphs for output and input, which will be used 
            for training and prediction.
        """
        self.epoch = tf.Variable(0, name="epoch", trainable=False)
        self.input = []
        for _input_dim in self.input_dim:
            _input_dim = get_shape(_input_dim, is_sequence=False)
            self.input.append(tf.placeholder(tf.float32, shape=_input_dim))
        self.target = tf.placeholder(tf.int32, shape=(None,), name="target")
        self.training = tf.placeholder(tf.bool, (), name="training")
        outputs = []
        for _input, model in zip(self.input, self.input_model):
            outputs.append(model(_input, self.training))
        output_input = tf.concat(outputs, axis=-1)
        self.output = self.output_model(output_input)
        
        # build optimizer
        if self.is_logit:
            if self.output_dim==1:
                _output = tf.squeeze(self.output)
                self.loss =\
                    tf.nn.sigmoid_cross_entropy_with_logits(
                        labels=tf.cast(self.target, tf.float32),
                        logits=_output)
            else:
                _target = tf.one_hot(self.target, self.output_dim)
                self.loss =\
                    tf.nn.softmax_cross_entropy_with_logits(
                        labels=tf.cast(_target, tf.float32),
                        logits=self.output)
        else:
            raise NotImplementedError("We have not implemeted non logit output model")
        self.learning_rate_op = self._get_learning_rate()
        self.train_step =\
            tf.train.AdamOptimizer(self.learning_rate_op).minimize(self.loss)
        # Build tensorboad graph
        with tf.name_scope("summary"):
            self._build_summaries()
        
        # initialize graph
        self.sess.run(tf.global_variables_initializer())

In [59]:
from hedgeable_ai.models.nn import get_shape
from hedgeable_ai.models.nn import mlp_conf
from hedgeable_ai.models.nn.ff.core import FeedForward


class MLPModel(FeedForward):
    def __init__(self, output_dim=None, model_params=None, scope_name=None, *args, **kwargs):
        if model_params is None:
            model_params = mlp_conf["model"]
        if scope_name is None:
            scope_name = "mlp"
        if output_dim  is not None:
            model_params.append({"name": "dense", "num_hidden": output_dim})
        super().__init__(model_params, scope_name, *args, **kwargs)

In [None]:
conf = {"model":[{"name": "dense", "num_hidden": 100, "is_batch": True, "activation": tf.nn.relu},
                 {"name": "dense", "num_hidden": 10, "is_batch": False, "activation": tf.nn.relu}],
        "input_model":[[{"name": "dense", "num_hidden": 100, "is_batch": True, "activation": tf.nn.relu},
                             {"name": "dense", "num_hidden": 100, "is_batch": True, "activation": tf.nn.relu},
                             {"name": "dense", "num_hidden": 100, "is_batch": True, "activation": tf.nn.relu}],
                            [{"name": "dense", "num_hidden": 100, "is_batch": True, "activation": tf.nn.relu},
                             {"name": "dense", "num_hidden": 100, "is_batch": True, "activation": tf.nn.relu},
                             {"name": "dense", "num_hidden": 100, "is_batch": True, "activation": tf.nn.relu}]]}
tf.reset_default_graph()
N = 1000000
input_model = [MLPModel, MLPModel]
output_model = MLPModel
model = MultiNNClassifier([300, 300], 1, input_model, output_model, conf, processor=processor)
model.train([X[0][:N], X[1][:N]], y[:N])

  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.getargspec(target))
  if d.decorator_argspec is not None), _inspect.

Model saved in file: params/model.ckpt



  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [01:08<1:52:31, 68.20s/it]

In [72]:
for i in range(len(X[1]) // 10):
    X_i = X[1][i*10:(i+1)*10]
    try:
        df = processor.batch_process(X_i)
    except:
        print(X_i)

[nan 'What are some of the best short films available on the web?'
 'Why does zero factorial (0!) equal one (1)?'
 'Is it possible to see when a Quora question was asked and who asked it?'
 'How close is a World War III?' 'What if there is no moon?'
 "What's the process to start study of IAS?"
 'What is it like to randomly meet Jennifer Aniston?'
 'If I have a tattoo can I donate blood?'
 'How did you make money as a 13-year-old?']
['How do I control my emotions and anger?' nan
 'What is meant by qualitative and quantitative research?'
 'Can you feed mealworms to a leopard gecko?'
 'How can I get stiff and lean body?'
 'How do I control my feelings of liking someone?'
 'Why was the eastern roman empire wealthier than the west?'
 'How can I focus in class?'
 "I'm an 18 year old male and skinny. What's a good workout routine to gain muscle?"
 "What should a man do in his thirties who wants to pursue CA but can't go for 3 year articleship because of his full time job?"]


In [70]:
df.shape

(404290, 300)

In [66]:
X[1].shape

(404290,)

In [67]:
y.shape

(404290,)