# Conll POS LSTM 
### Module & Data-reader Import

In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import time

import tensorflow.python.platform

import numpy as np
import tensorflow as tf

from tensorflow.models.rnn import rnn_cell
from tensorflow.models.rnn import seq2seq
from tensorflow.models.rnn import rnn
import conll_pos_reader as reader
import pdb

## How this works

Building a TensorFlow model has three basic parts: (1) specify the model, (2) load in the data, and (3) specify how the model should run.

### Specify the model

Tensorflow works by specifying a model ('graph'), and then running this within a 'session'. The model has three parts to it:

- Inference. This basically defines the computations within the neural net. Tensorflow comes with a series of built-in functions that contain the forward, backward and update passes. You use these to construct your net.

- Loss. You need to define a way to calculate the loss associated with a pass through the net.

- Optimiser. This is the way that you update the model parameters. Most popular is stochastic gradient descent.

We are going to want to use the forward pass of the model to make predictions on a validation set. That is, we want to keep the (1) inference and (2) loss parts of the model, but not call the optimiser on the validation (and test) sets.

For this reason we're going to create a model class that takes an `is_training` parameter; we pass `is_training=False` when building the model for the validation and test sets, so that the optimiser is left out.


In [None]:
class Conll_POS_Model(object):
    """Multi-layer LSTM part-of-speech tagger expressed as a TensorFlow graph.

    Constructing an instance adds three pieces to the current default graph:

    1. Inference - an embedding lookup followed by a stacked LSTM that is
       unrolled over ``num_steps`` time steps.
    2. Loss - softmax cross-entropy between the per-token logits and the
       target tag ids, averaged over every token in the batch.
    3. Optimiser - gradient descent with global-norm gradient clipping. This
       part is only built when ``is_training`` is true, so validation/test
       models expose ``cost`` but have no ``lr`` or ``train_op``.

    The hyper-parameters are fixed inside ``__init__`` and published as plain
    instance attributes (``batch_size``, ``num_steps``, ...).
    """

    def __init__(self, is_training, pos_vocab):
        """Build the model graph.

        Args:
            is_training: when true, dropout is applied (if ``keep_prob < 1``)
                and the optimiser ops are created; when false only the
                inference and loss parts are built.
            pos_vocab: number of possible POS tags. It sets the width of the
                softmax output layer.
        """
        # Configuration.
        # - init_scale    - the initial scale of the weights
        # - learning_rate - the initial value of the learning rate
        # - max_grad_norm - the maximum permissible norm of the gradient
        # - num_layers    - the number of LSTM layers
        # - num_steps     - the number of unrolled steps of LSTM
        # - hidden_size   - the number of LSTM units
        # - max_epoch     - the number of epochs trained with the initial
        #                   learning rate
        # - max_max_epoch - the total number of epochs for training
        # - keep_prob     - the probability of keeping weights in the
        #                   dropout layer
        # - lr_decay      - the decay of the learning rate for each epoch
        #                   after "max_epoch"
        # - batch_size    - the batch size
        # - vocab_size    - the number of rows in the word-embedding matrix
        #
        # Values that the graph construction below needs are also bound to
        # locals; the rest are only stored for whoever drives the training.
        # NOTE(review): init_scale is not used in this class - presumably the
        # caller installs an initializer through the enclosing variable scope;
        # confirm against the training script.
        self.init_scale = 0.1
        self.learning_rate = 1.0
        self.max_grad_norm = max_grad_norm = 5
        self.num_layers = num_layers = 2
        self.num_steps = num_steps = 10
        self.hidden_size = hidden_size = 200  # update to size
        self.max_epoch = 4
        self.max_max_epoch = 13
        self.keep_prob = keep_prob = 1.0
        self.lr_decay = 0.5
        self.batch_size = batch_size = 20
        self.vocab_size = vocab_size = 20000

        # ======================================
        # 1. Inference
        # ======================================

        # Placeholders fix the shape of what is fed in on every run: one
        # integer word id per (sentence, time step) ...
        self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])

        # ... and one integer tag id per (sentence, time step). Tags are fed
        # as class indices (not one-hot vectors), hence the integer dtype.
        self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

        # A single LSTM layer, using TensorFlow's built-in cell.
        lstm_cell = rnn_cell.BasicLSTMCell(hidden_size, forget_bias=0.0)

        # Dropout on the cell outputs is only wanted while training.
        use_dropout = is_training and keep_prob < 1
        if use_dropout:
            lstm_cell = rnn_cell.DropoutWrapper(
                lstm_cell, output_keep_prob=keep_prob)

        # Stack the layer num_layers times.
        cell = rnn_cell.MultiRNNCell([lstm_cell] * num_layers)

        # Zero start state. This only adds ops to the graph; nothing is
        # evaluated until the graph is run inside a session.
        self._initial_state = cell.zero_state(batch_size, tf.float32)

        # The embedding matrix is pinned to the CPU.
        with tf.device("/cpu:0"):
            # Each word id selects one row of `embedding`, so `inputs` has
            # shape [batch_size, num_steps, hidden_size].
            embedding = tf.get_variable("embedding", [vocab_size, hidden_size])
            inputs = tf.nn.embedding_lookup(embedding, self._input_data)

        if use_dropout:
            inputs = tf.nn.dropout(inputs, keep_prob)

        # rnn.rnn wants a Python list with one [batch_size, hidden_size]
        # tensor per time step, so cut along the time axis and drop it.
        inputs = [tf.squeeze(input_, [1])
                  for input_ in tf.split(1, num_steps, inputs)]
        outputs, state = rnn.rnn(cell, inputs,
                                 initial_state=self._initial_state)

        # Every token gets a tag, so keep the output of every time step.
        # Concatenating along axis 1 and reshaping yields one row per token,
        # ordered sentence-major - the same order as the flattened targets.
        output = tf.reshape(tf.concat(1, outputs), [-1, hidden_size])

        # ====================================
        # 2. Specifying the Loss Function
        # ====================================

        # The output layer has one unit per POS tag.
        softmax_w = tf.get_variable("softmax_w", [hidden_size, pos_vocab])
        softmax_b = tf.get_variable("softmax_b", [pos_vocab])
        logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)

        # Per-token cross-entropy against the integer tag ids, with every
        # token weighted equally. The call follows the PTB tutorial's use of
        # the tensorflow.models.rnn seq2seq API.
        token_losses = seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(self._targets, [-1])],
            [tf.ones([batch_size * num_steps])],
            pos_vocab)

        self._cost = cost = tf.reduce_mean(token_losses)
        self._final_state = state

        # =====================================
        # 3. Specifying the Optimiser
        # =====================================

        # Validation/test models stop here: no parameter updates.
        if not is_training:
            return

        # The learning rate lives in a non-trainable variable so that it can
        # be changed between epochs through assign_lr().
        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        # Clip by global norm to keep the update size bounded.
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self.lr)
        # list() so the pairs can be iterated more than once on Python 3.
        self._train_op = optimizer.apply_gradients(list(zip(grads, tvars)))

    def assign_lr(self, session, lr_value):
        """Set the learning-rate variable to ``lr_value`` within ``session``."""
        session.run(tf.assign(self.lr, lr_value))

    @property
    def input_data(self):
        """Placeholder for the word ids, shape [batch_size, num_steps]."""
        return self._input_data

    @property
    def targets(self):
        """Placeholder for the tag ids, shape [batch_size, num_steps]."""
        return self._targets

    @property
    def initial_state(self):
        """Zero state the unrolled LSTM starts from."""
        return self._initial_state

    @property
    def cost(self):
        """Mean per-token cross-entropy loss."""
        return self._cost

    @property
    def final_state(self):
        """LSTM state after the last unrolled step."""
        return self._final_state

    @property
    def lr(self):
        """Learning-rate variable (only present on training models)."""
        return self._lr

    @property
    def train_op(self):
        """Op applying one clipped gradient-descent update (training only)."""
        return self._train_op