In [23]:
""" Implementing decoder"""

"""Imports"""
import numpy as np
from nltk import sent_tokenize, word_tokenize

"""Global definitions"""
# Special token strings. Presumably the sentence-start marker, the
# sentence-end marker and the out-of-vocabulary placeholder; only the
# commented-out preprocessing code references them -- confirm before relying on this.
_start, _end, _unk = 'S_START', 'S_END', 'UNK'

In [24]:
""" util definitions"""

def hyperbolic(net):
    """Apply the hyperbolic tangent element-wise to ``net`` and return the result."""
    activated = np.tanh(net)
    return activated

def relu(net):
    """Rectified linear unit: element-wise maximum of 0 and ``net``."""
    floor = 0
    rectified = np.maximum(floor, net)
    return rectified

def softmax(net):
    """
    Normalise a score array into probabilities with the softmax function.

    The normalisation runs over ALL elements of ``net`` (no axis argument),
    so the returned array has the same shape as ``net`` and sums to 1.

    The maximum score is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing to
    ``inf`` (and the quotient from becoming ``nan``) for large scores.

    net: array-like of scores
    returns: ndarray of probabilities (an empty array for empty input)
    """
    net = np.asarray(net)
    if net.size == 0:
        # np.max is undefined on an empty array; nothing to normalise anyway.
        return np.exp(net)
    _exp = np.exp(net - np.max(net))
    return _exp/np.sum(_exp)

def predict(scores):
    """Return the index (into the flattened input) of the largest score."""
    best_index = np.argmax(scores)
    return best_index

In [25]:
# """ Word preprocessing """
# def dataset(_fi1='/Users/preethikapachaiyappa/Documents/MachineLearning/Data/English/9-11-in-perspective.txt', _fi1='/Users/preethikapachaiyappa/Documents/MachineLearning/Data/French/9-11-in-perspective.txt'):
#     file_in_english = open(_fi1)
#     file_in_french = open(_fi2)
#     #file_out = open(_fo,'wb')

#     words = [] #stores unique words encountered in the document as WordItem objects
#     _dict = {} #temporary dictionary to maintain count of each word
    
#     _dict['UNK'] = 0
#     sentence_embeddings = []

#     count = 0
#     for l in file_in_english:
#         print count, l
#         count++
#         l = _start+' '+l+' '+_end
#         split = word_tokenize(l)
#         for w in split:
#             if w in _vocab :
#                 word_index = _vocab[w]
#             else :
#                 word_index = _vocab['UNK']
#             np.append(sentence_embeddings,W_Embedding[word_index],axis=0)
#         backpropogate(sentence_embeddings)
#     file_in.close()
#     #file_out.close()
    
#     return _vocab, words

In [59]:
class RNNlayer:

    """
    Single-layer tanh RNN with a softmax output, used as the decoder.

    At every time step t:
        h[t] = tanh(w_in . x[t] + w_hh . h[t-1])     (the state before t=0 is all zeros)
        o[t] = softmax(w_out . h[t])

    TODO: conditioning h[t] on the previous output o[t-1] (the ``w_outH``
    idea) is not implemented yet.
    """

    def __init__(self, inputSize, outputSize, bptt_truncate = 5, hiddenDim = 10, verbose = True):
        """
        inputSize = dimensions of the input embedding
        outputSize = vocabulary size
        bptt_truncate = number of earlier time steps each output error is propagated back through
        hiddenDim = size of the hidden unit in RNN
        verbose = when True (default) print delta_o and the weights while training
        """
        self.inputSize = inputSize
        self.outputSize = outputSize
        self.hiddenDim = hiddenDim
        self.bptt_truncate = bptt_truncate
        self.verbose = verbose

        # Uniform initialisation in [-1/sqrt(fan_in), 1/sqrt(fan_in)].
        in_bound = np.sqrt(1./inputSize)
        hid_bound = np.sqrt(1./hiddenDim)
        self.w_in = np.random.uniform(-in_bound, in_bound, (hiddenDim, inputSize))
        self.w_hh = np.random.uniform(-hid_bound, hid_bound, (hiddenDim, hiddenDim))
        self.w_out = np.random.uniform(-hid_bound, hid_bound, (outputSize, hiddenDim))

    def forwardProp(self, inSentence, expSent):
        """
        Run the network over one sentence.

        inSentence: sequence of input vectors, one per time step (each of length inputSize)
        expSent: target word indices; only its length is used here, to fix the number of time steps
        returns: (o, h) where o has shape (T, outputSize) and h has shape (T, hiddenDim)
        """
        # Total number of time steps equal to number of words in the sentence
        T = len(expSent)

        # Saving all hidden states and outputs during forward propagation
        _h = np.zeros((T, self.hiddenDim))
        _o = np.zeros((T, self.outputSize))

        # Explicit zero initial state instead of relying on _h[-1] wrapping around
        # to a not-yet-written row.
        h_prev = np.zeros(self.hiddenDim)
        for t in range(T):
            _h[t] = hyperbolic(self.w_in.dot(inSentence[t]) + self.w_hh.dot(h_prev))
            _o[t] = softmax(self.w_out.dot(_h[t]))
            h_prev = _h[t]

        return _o, _h

    def calculateLoss(self, inSentence, expSentence):
        """
        Cross-entropy loss of one sentence: minus the sum over time steps of
        the log-probability assigned to the expected word.
        """
        o, h = self.forwardProp(inSentence, expSentence)
        targets = np.asarray(expSentence, dtype=int)
        # Probability given to the expected word at every time step.
        correctPred = o[np.arange(len(targets)), targets]
        return -1 * np.sum(np.log(correctPred))

    def calculateTotalLoss(self, inSentence, expSentences):
        """
        Sum of calculateLoss over paired input sentences and expected sentences.

        inSentence: sequence of input sentences (each as accepted by forwardProp)
        expSentences: sequence of the matching target index lists
        """
        L = 0.0
        for sentence, expected in zip(inSentence, expSentences):
            L += self.calculateLoss(sentence, expected)

        return L

    def backPropTT(self, inSentence, expSentence):
        """
        Gradients of the sentence loss by (truncated) backpropagation through time.

        returns: (dLdin, dLdhh, dLdout), shaped like w_in, w_hh and w_out.
        The result is the exact gradient of calculateLoss when bptt_truncate
        is at least len(expSentence) - 1; otherwise older contributions are cut off.
        """
        # Total number of time steps equal to number of words in the sentence
        T = len(expSentence)
        targets = np.asarray(expSentence, dtype=int)

        # Performing forward propagation
        o, h = self.forwardProp(inSentence, expSentence)

        # Defining gradient variables
        dLdin = np.zeros(self.w_in.shape)
        dLdhh = np.zeros(self.w_hh.shape)
        dLdout = np.zeros(self.w_out.shape)

        # Softmax + cross-entropy: dL/d(logits) = o - one_hot(target).
        # Work on a copy so the forward outputs are not modified in place.
        delta_o = o.copy()
        delta_o[np.arange(T), targets] -= 1
        if self.verbose:
            print('delta_o %s' % (delta_o,))

        # Hidden state that precedes time step 0.
        h_init = np.zeros(self.hiddenDim)

        # Calculating gradients backwards through time
        for t in range(T - 1, -1, -1):
            # Output gradient is only dependent on time step t
            dLdout += np.outer(delta_o[t], h[t])

            # Initial delta calculation propagating gradients from output
            delta_t = self.w_out.T.dot(delta_o[t]) * (1 - (h[t] ** 2))

            # Backpropagation through time (for at most self.bptt_truncate earlier steps)
            oldest = max(0, t - self.bptt_truncate)
            for bptt_step in range(t, oldest - 1, -1):
                # The state feeding step 0 is the zero initial state, NOT h[-1]
                # (which would be the final hidden state of the sentence).
                h_before = h[bptt_step - 1] if bptt_step > 0 else h_init
                dLdhh += np.outer(delta_t, h_before)
                # The input consumed at bptt_step is inSentence[bptt_step].
                dLdin += np.outer(delta_t, inSentence[bptt_step])
                # Update delta for next step dL/dz at t-1
                delta_t = self.w_hh.T.dot(delta_t) * (1 - h_before ** 2)

        return dLdin, dLdhh, dLdout

    def sgd_step(self, inSentence, expSentence, learningRate):

        """ Performs a single stochastic gradient step"""

        # Calculating gradients
        dLdin, dLdhh, dLdout = self.backPropTT(inSentence, expSentence)

        # Updating parameters
        self.w_in -= learningRate * dLdin
        self.w_hh -= learningRate * dLdhh
        self.w_out -= learningRate * dLdout

    def train_Decoder_With_SGD(self, X_train, Y_train, learningRate = 0.05, nepochs = 20):
        """
        Plain SGD: one sgd_step per (X_train[i], Y_train[i]) pair, repeated for nepochs epochs.

        TODO evaluate losses and update learning rate if required
        """
        for epoch in range(nepochs):
            for i in range(len(Y_train)):
                self.sgd_step(X_train[i], Y_train[i], learningRate)
                if self.verbose:
                    print('W_in  %s' % (self.w_in,))
                    print('W_hh  %s' % (self.w_hh,))
                    print('W_out  %s' % (self.w_out,))




In [56]:
# Toy training pair: a single sentence of three time steps, each input being
# a 10-element vector of ones, with target index 2 at every step.
inSentence = [[[1] * 10 for _ in range(3)]]
expSentence = [[2] * 3]

In [60]:
# Dimensions of the toy problem.
vocabSize = 5
embSize = 10

# Stand-alone random matrices; RNNlayer draws its own weights and never reads these.
W_out = np.random.randn(vocabSize, embSize)
W_hh = np.random.randn(embSize, embSize)
W_in = np.random.randn(embSize)

# Build the decoder (input size = embSize, output size = vocabSize) and
# train it on the toy sentence pair.
a = RNNlayer(inputSize=embSize, outputSize=vocabSize)
a.train_Decoder_With_SGD(X_train=inSentence, Y_train=expSentence)


delta_o [[ 0.17372329  0.27635341 -0.85255749  0.21370922  0.18877157]
 [ 0.1713561   0.29425283 -0.84750086  0.22179058  0.16010134]
 [ 0.17963879  0.28972374 -0.84374007  0.20822507  0.16615246]]
W_in  [[ 0.23012427  0.22612692 -0.27236757  0.10315173  0.24677808  0.19527676
  -0.00404455 -0.08525427 -0.13945688 -0.00599491]
 [ 0.28998634  0.08015725 -0.12884331 -0.3209673  -0.16199022  0.2620389
  -0.0088029  -0.12080618  0.05124708  0.06730365]
 [-0.16309936 -0.27250664  0.14776729  0.05172962  0.14749153 -0.12136415
   0.06071042 -0.01352919  0.11993295 -0.06318021]
 [ 0.16185649 -0.29525843  0.16559741 -0.22615673 -0.08385462  0.11433559
  -0.20699537  0.00516743  0.26595867  0.09627161]
 [ 0.04581292  0.03070604  0.07468603  0.18897819 -0.00147322 -0.11271441
   0.11259599 -0.27170597 -0.07457931  0.22894256]
 [-0.15623144 -0.24825559 -0.2752629  -0.22506984  0.14155201 -0.00696731
   0.06611175 -0.07532389 -0.08176523 -0.18188338]
 [ 0.18585012 -0.21669664 -0.20116786 -0.131620

In [54]:
# Scratch check of the NumPy operations used in backPropTT.
# NOTE: this rebinds `a`, which the training cell above bound to the RNNlayer instance.

# Fancy-index decrement: subtracts 1 from a[0, 1] and a[1, 2] only.
a = np.array([[1,2,3],[4,5,6]])
a[np.arange(2),[1,2]] -= 1
# Single-argument print(...) prints the same on Python 2 and is valid on Python 3.
print(a[1].T)  # .T is a no-op on a 1-D row
print(np.outer(a[0],a[1].T))

# Matrix-vector product via ndarray.dot.
a = np.array([[1,2],[1,1]])
b = np.array([2,2])
print(a)
print(b)
print(a.dot(b))

[4 5 5]
[[ 4  5  5]
 [ 4  5  5]
 [12 15 15]]
[[1 2]
 [1 1]]
[2 2]
[6 4]
