# Run this first

In [310]:
import time  # This is required to include time module.
import numpy as np
import scipy.io as sio

!wget http://www.ucode.es/data.mat
!ls

--2018-02-04 14:13:59--  http://www.ucode.es/data.mat
Resolving www.ucode.es (www.ucode.es)... 151.101.53.147, 2a04:4e42:d::403
Connecting to www.ucode.es (www.ucode.es)|151.101.53.147|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7468024 (7.1M) [application/octet-stream]
Saving to: ‘data.mat.7’


2018-02-04 14:13:59 (21.3 MB/s) - ‘data.mat.7’ saved [7468024/7468024]

datalab   data.mat.1  data.mat.3  data.mat.5  data.mat.7
data.mat  data.mat.2  data.mat.4  data.mat.6


# This method loads the training, validation and test set.
    # It also divides the training set into mini-batches.
    # Inputs:
    #   N: Mini-batch size.
    # Outputs:
    #   train_input: An array of size D X N X M, where
    #                 D: number of input dimensions (in this case, 3).
    #                 N: size of each mini-batch (in this case, 100).
    #                 M: number of minibatches.
    #   train_target: An array of size 1 X N X M.
    #   valid_input: An array of size D X number of points in the validation set.
    #   test: An array of size D X number of points in the test set.
    #   vocab: Vocabulary containing index to word mapping.

In [312]:
def load_data(batch_size=100):
  

  data = sio.loadmat('data.mat')
  #data
  test_data = data['data'][0][0][0]
  train_data = data['data'][0][0][1]
  valid_data = data['data'][0][0][2]
  vocab = data['data'][0][0][3]
  print(test_data.shape)
  print(train_data.shape)
  print(valid_data.shape)
  print(vocab.shape)
  
  numdims = train_data.shape[0]
  D = numdims - 1
  M = train_data.shape[1] / batch_size
  
  
  train_input = np.reshape(train_data[0:D, 0:batch_size * M], (D, batch_size, M))
  train_target = np.reshape(train_data[D, 0:batch_size * M], (1, batch_size, M))
  valid_input = valid_data[0:D, :]
  valid_target = valid_data[D, :]
  test_input = test_data[0:D, :]
  test_target = test_data[D, :]
  vocab = vocab

  print(train_input.shape)
  print(train_target.shape)
  print(valid_input.shape)
  print(valid_target.shape)
  print(test_input.shape)
  print(test_target.shape)
  print(vocab.shape)

  return train_input, train_target, valid_input, valid_target, test_input, test_target, vocab


(4, 46568)
(4, 372550)
(4, 46568)
(1, 250)


# Inputs:
    #   epochs: Number of epochs to run.
# Output:
    #   model: A struct containing the learned weights and biases and vocabulary.

# SET HYPERPARAMETERS HERE.

In [0]:
epochs = 10

batchsize = 100 # Mini-batch size.
learning_rate = 0.1 # Learning rate; default = 0.1.
momentum = 0.5 # Momentum; default = 0.9.
numhid1 = 50 # Dimensionality of embedding space; default = 50.
numhid2 = 200 # Number of units in hidden layer; default = 200.
init_wt = 0.01 # Standard deviation of the normal distribution
# which is sampled to get the initial weights; default = 0.01



# VARIABLES FOR TRACKING TRAINING PROGRESS.

In [0]:
show_training_CE_after = 100 # Frequency with which to show training cross-entropy error
show_validation_CE_after = 1000 # Frequency with which to show validation cross-entropy error

# LOAD DATA.

In [317]:
train_input, train_target, valid_input, valid_target, test_input, test_target, vocab = load_data(batchsize)
numwords, batchsize, numbatches = train_input.shape
vocab_size = vocab.shape[1]
print(vocab_size)

250


# INITIALIZE WEIGHTS AND BIASES.

In [0]:

word_embedding_weights =  np.random.normal(loc=0, scale=init_wt, size=(vocab_size, numhid1))
embed_to_hid_weights = np.random.normal(loc=0, scale=init_wt, size=(numwords * numhid1, numhid2))
hid_to_output_weights = np.random.normal(loc=0, scale=init_wt, size=(numhid2, vocab_size))
hid_bias = np.zeros((numhid2, 1))
output_bias = np.zeros((vocab_size, 1))


In [0]:
word_embedding_weights_delta = np.zeros((vocab_size, numhid1))
word_embedding_weights_gradient = np.zeros((vocab_size, numhid1))
embed_to_hid_weights_delta = np.zeros((numwords * numhid1, numhid2))
hid_to_output_weights_delta = np.zeros((numhid2, vocab_size))
hid_bias_delta = np.zeros((numhid2, 1))
output_bias_delta = np.zeros((vocab_size, 1))
expansion_matrix = np.eye(vocab_size)
count = 0
tiny = np.exp(-30)

# This method forward propagates through a neural network.



---


*     Inputs:
 *       input_batch: The input data as a matrix of size numwords X batchsize where,
         numwords is the number of words, batchsize is the number of data points.
         So, if input_batch(i, j) = k then the ith word in data point j is word
         index k of the vocabulary.
    
 *      word_embedding_weights: Word embedding as a matrix of size
         vocab_size X numhid1, where vocab_size is the size of the vocabulary
         numhid1 is the dimensionality of the embedding space.
    
 *     embed_to_hid_weights: Weights between the word embedding layer and hidden
         layer as a matrix of soze numhid1*numwords X numhid2, numhid2 is the
         number of hidden units.
    
 *       hid_to_output_weights: Weights between the hidden layer and output softmax
                   unit as a matrix of size numhid2 X vocab_size
    
 *       hid_bias: Bias of the hidden layer as a matrix of size numhid2 X 1.
    
 *       output_bias: Bias of the output layer as a matrix of size vocab_size X 1.
    

---


*     Outputs:
 *       embedding_layer_state: State of units in the embedding layer as a matrix of
         size numhid1*numwords X batchsize
    
 *       hidden_layer_state: State of units in the hidden layer as a matrix of size
         numhid2 X batchsize
    
 *      output_layer_state: State of units in the output layer as a matrix of size
         vocab_size X batchsize
    

In [0]:
def fprop(input_batch=None, word_embedding_weights=None,
          embed_to_hid_weights=None, hid_to_output_weights=None,
          hid_bias=None, output_bias=None):
    

    numwords, batchsize = input_batch.shape
    vocab_size, numhid1 = word_embedding_weights.shape
    numhid2 = embed_to_hid_weights.shape[1]

    #% COMPUTE STATE OF WORD EMBEDDING LAYER.
    # Look up the inputs word indices in the word_embedding_weights matrix.
    foo = word_embedding_weights[np.ndarray.flatten(input_batch-1), :].T 
    
    embedding_layer_state = np.reshape(foo, (numhid1 * numwords, -1))

    #% COMPUTE STATE OF HIDDEN LAYER.
    # Compute inputs to hidden units.
    
    inputs_to_hidden_units = np.matmul(embed_to_hid_weights.T,embedding_layer_state) + np.tile(hid_bias, (1, batchsize))

    # Apply logistic activation function.
    hidden_layer_state = np.divide(1,  (1 + np.exp(-inputs_to_hidden_units)))

    #% COMPUTE STATE OF OUTPUT LAYER.
    # Compute inputs to softmax.
    inputs_to_softmax = np.matmul(hid_to_output_weights.T, hidden_layer_state) + np.tile(output_bias, (1, batchsize))
    # Subtract maximum. 
    # Remember that adding or subtracting the same constant from each input to a
    # softmax unit does not affect the outputs. Here we are subtracting maximum to
    # make all inputs <= 0. This prevents overflows when computing their
    # exponents.
    inputs_to_softmax = inputs_to_softmax - np.tile(np.max(inputs_to_softmax), (vocab_size, 1))

    # Compute exp.
    output_layer_state = np.exp(inputs_to_softmax)

    # Normalize to get probability distribution.
    output_layer_state =  np.divide(output_layer_state, np.tile(np.sum(output_layer_state, 0), (vocab_size, 1)))
    return embedding_layer_state, hidden_layer_state, output_layer_state

# Training Loop :)

In [334]:
tic_ = time.time()

for epoch in range(1, epochs+1):
  
      print('Epoch %d\n'% epoch)
      this_chunk_CE = 0
      trainset_CE = 0
      
       # LOOP OVER MINI-BATCHES.
      for m in range(1, numbatches):
          input_batch = train_input[:, :, m]
          target_batch = train_target[:, :, m]
          
          # FORWARD PROPAGATE.
          # Compute the state of each layer in the network given the input batch
          # and all weights and biases
          embedding_layer_state, hidden_layer_state, output_layer_state = \
          fprop(input_batch, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias)
          
           # COMPUTE DERIVATIVE.
          #% Expand the target to a sparse 1-of-K vector.
          expanded_target_batch = expansion_matrix[:, target_batch-1]
          #% Compute derivative of cross-entropy loss function.
          error_deriv = output_layer_state - np.squeeze(expanded_target_batch)
          
           # MEASURE LOSS FUNCTION.
          CE = -np.sum(np.sum(np.multiply(expanded_target_batch, np.log(output_layer_state + tiny)))) / batchsize
          count = count + 1
          this_chunk_CE = this_chunk_CE + (CE - this_chunk_CE) / count
          trainset_CE = trainset_CE + (CE - trainset_CE) / m
          print 'Batch %d Train CE %1.3f'% (m, this_chunk_CE)
          if m % show_training_CE_after == 0:
              print('\n')
              count = 0
              this_chunk_CE = 0

          ################
          # BACK PROPAGATE.
          #% OUTPUT LAYER.
          hid_to_output_weights_gradient = np.matmul(hidden_layer_state,  error_deriv.T)
          output_bias_gradient = np.sum(error_deriv, 1)
          back_propagated_deriv_1_ = np.multiply(np.matmul(hid_to_output_weights, error_deriv), hidden_layer_state)
          back_propagated_deriv_1 = np.multiply(back_propagated_deriv_1_, (1 - hidden_layer_state))
          

          #% HIDDEN LAYER.

          embed_to_hid_weights_gradient = np.matmul(embedding_layer_state, back_propagated_deriv_1.T)

          hid_bias_gradient = np.sum(back_propagated_deriv_1, 1)

          back_propagated_deriv_2 = np.matmul(embed_to_hid_weights, back_propagated_deriv_1)

          word_embedding_weights_gradient[:] = 0
          
          
          #% EMBEDDING LAYER.
          for w in range(0, numwords):
            bar = expansion_matrix[:, input_batch[w, :]-1]
            bareto = back_propagated_deriv_2[(w * numhid1) : ((w+1) * numhid1), :].T
            
            foo = np.matmul(bar, bareto)
            word_embedding_weights_gradient = word_embedding_weights_gradient + foo
          
          # UPDATE WEIGHTS AND BIASES.
          word_embedding_weights_delta = np.multiply(momentum, word_embedding_weights_delta) + np.divide(word_embedding_weights_gradient, batchsize)
          word_embedding_weights = word_embedding_weights - (learning_rate * word_embedding_weights_delta)

          embed_to_hid_weights_delta = np.multiply(momentum, embed_to_hid_weights_delta) + (embed_to_hid_weights_gradient / batchsize)
          embed_to_hid_weights = embed_to_hid_weights - (learning_rate * embed_to_hid_weights_delta)

          hid_to_output_weights_delta = np.multiply(momentum, hid_to_output_weights_delta) + (hid_to_output_weights_gradient / batchsize)
          hid_to_output_weights = hid_to_output_weights - (learning_rate * hid_to_output_weights_delta)

          hid_bias_delta = (momentum * hid_bias_delta) + (np.reshape(hid_bias_gradient, (-1, 1)) / batchsize)
          hid_bias = hid_bias - (learning_rate * hid_bias_delta)

          output_bias_delta = np.multiply(momentum, output_bias_delta) + (np.reshape(output_bias_gradient, (-1, 1)) / batchsize)
          output_bias = output_bias - (learning_rate * output_bias_delta)
          
          # VALIDATE.
          if m % show_validation_CE_after == 0:
              print('Running validation ...')
              
              embedding_layer_state, hidden_layer_state, output_layer_state = fprop(valid_input, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias)
              datasetsize = valid_input.shape[1]
              expanded_valid_target = expansion_matrix[:, valid_target-1]
              CE = -np.sum(np.sum(np.multiply(expanded_valid_target, np.log(output_layer_state + tiny)))) / datasetsize
              print 'Validation CE %1.3f \n' % (CE)
              
          
          
print('Finished Training.\n')

  
print'Final Training CE %.3f\n' % trainset_CE

toc_ = time.time()
print('Training took %.2f seconds\n'% (toc_ - tic_))



Epoch 1



NameError: ignored

Save stuff to model

In [335]:
    model.word_embedding_weights = word_embedding_weights
    model.embed_to_hid_weights = embed_to_hid_weights
    model.hid_to_output_weights = hid_to_output_weights
    model.hid_bias = hid_bias
    model.output_bias = output_bias
    model.vocab = vocab

NameError: ignored

# Testing Time

In [327]:

    # EVALUATE ON VALIDATION SET.
    print('Running validation ...')

    [embedding_layer_state, hidden_layer_state, output_layer_state] = fprop(valid_input, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias)
    datasetsize = valid_input.shape[1]
    expanded_valid_target = expansion_matrix[:, valid_target - 1]
    CE = -np.sum(np.sum(np.multiply(expanded_valid_target, np.log(output_layer_state + tiny)))) / datasetsize
    print('Final Validation CE %.3f\n' % CE)

    # EVALUATE ON TEST SET.
    print('Running test ...')
 
    [embedding_layer_state, hidden_layer_state, output_layer_state] = fprop(test_input, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias)
    datasetsize = test_input.shape[1]
    expanded_test_target = expansion_matrix[:, test_target-1]
    CE = -np.sum(np.sum(np.multiply(expanded_test_target, np.log(output_layer_state + tiny)))) / datasetsize
    print('Final Test CE %.3f' % CE)

  

Running validation ...
(46568,)
Final Validation CE 2.905

Running test ...
Final Test CE 2.902


NameError: ignored