In [8]:
""" Imports """
import re
from nltk.tokenize import word_tokenize, sent_tokenize
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

"""Global definitons"""
_start = 'S_START'
_end = 'S_END'

In [9]:
""" util definitions"""

def hyperbolic(net):
    return np.tanh(net)

def relu(net):
    return np.maximum(0,net)

def softmax(net):
    _exp = np.exp(net)
    return _exp/np.sum(_exp)

def predict(scores):
    return np.argmax(scores)

In [10]:
class WordItem:
    def __init__(self,word,count=0):
        self.word = word
        self.count = count

In [25]:
class RNNlayer:
    
    """ 
    RNN nodes for decoder
    
    hidden state at time step t of decoder is conditioned on hidden state at time step t-1,
    output at time step t-1 and input at time step t
    """
    
    def __init__(self, inputSize, outputSize, W_Embedding_french, idx, bptt_truncate = 5, hiddenDim = 10):
        """
        inputSize = dimensions of the input embedding 
        outputSize = vocabulary size
        hiddenDim = size of the hidden unit in RNN
        bptt_truncate = truncate the number of time steps we calculate the gradient during backpropagation
        """
        self.inputSize = inputSize
        self.outputSize = outputSize
        self.W_Embedding_french = W_Embedding_french
        self.hiddenDim = hiddenDim
        self. bptt_truncate = bptt_truncate
        self.idx = idx;
        
        self.w_in = np.random.uniform(-np.sqrt(1./inputSize), np.sqrt(1./inputSize),(hiddenDim, inputSize))
        self.w_hh = np.random.uniform(-np.sqrt(1./hiddenDim), np.sqrt(1./hiddenDim),(hiddenDim, hiddenDim))
        self.w_outH = np.random.uniform(-np.sqrt(1./hiddenDim), np.sqrt(1./hiddenDim),(inputSize, hiddenDim))
        self.w_out = np.random.uniform(-np.sqrt(1./hiddenDim), np.sqrt(1./hiddenDim),(outputSize, hiddenDim))
        
    def forwardProp(self, inSentence, expSent):
        """
        inSentence: word indices in input language vocabulary
        expSent: word indices in target language vocabulary
        """
        
        #Total number of time steps equal to number of words in the sentence
        T = len(inSentence)
        
        #Saving all hidden states and outputs during forward propagation
        _h = np.zeros((T,self.hiddenDim))
        _o = np.zeros((T,self.outputSize))
        
        #Initializing initial output as the start token
        _o[-1] = np.zeros(self.outputSize)
        _o[-1][idx] = 1;
        
        #For each time step calculating hidden state and output
        for t in np.arange(T):
            outIdx = predict(_o[t-1])
            _h[t] = hyperbolic(self.w_in.dot(inSentence[t]) + self.w_hh.dot(_h[t-1]) + self.w_outH.dot(self.W_Embedding_french[outIdx]))
            _o[t] = softmax(self.w_out.dot(_h[t]))
            
        return _o, _h
    
    def calculateLoss(self, inSentence, expSentence):
        
        #For each sentence
        o, h = self.forwardProp(inSentence, expSentence)
        #TODO recheck this part
        correctPred = o[np.arange(len(expSentence)), expSentence]
        #Loss for each sentence
        l = -1 * np.sum(np.log(correctPred))
        return l
    
    def calculateTotalLoss(self, inSentence, expSentences):
        
        L = 0.0
        for i in range(len(inSentence)):
            if len(inSentence[i]) == len(expSentences[i]) :
                L += self.calculateLoss(inSentence[i], expSentences[i])
            
        return L
    
    def backPropTT(self, inSentence, expSentence):
        
        # Total number of time steps equal to number of words in the sentence
        T = len(expSentence)
        
        # Performing forward propagation
        o, h = self.forwardProp(inSentence, expSentence)
        
        # Defining gradient variables
        dLdin = np.zeros(self.w_in.shape)
        dLdhh = np.zeros(self.w_hh.shape)
        dLdoutH = np.zeros(self.w_outH.shape)
        dLdout = np.zeros(self.w_out.shape)
        
        # Calculating the difference between output and actual output
        delta_o = o
        delta_o[np.arange(T), expSentence] -= 1
        #print 'delta_o', delta_o
        
        # Calculating gradients backwards through time
        for t in np.arange(T)[::-1]:
            #Output gradient is only dependent on time step t
            dLdout += np.outer(delta_o[t], h[t])
            
            # Initial delta calculation propagating gradients from output
            delta_t = self.w_out.T.dot(delta_o[t]) * (1 - (h[t] ** 2))
            
            # Backpropagation through time (for at most self.bptt_truncate steps)
            for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
                # print "Backpropagation step t=%d bptt step=%d " % (t, bptt_step)
                # Add to gradients at each previous step
                dLdhh += np.outer(delta_t, h[bptt_step-1])              
                dLdin += np.outer(delta_t, inSentence[bptt_step-1])
                dLdoutH += np.outer(delta_t, self.W_Embedding_french[predict(o[t-1])])
                # Update delta for next step dL/dz at t-1
                delta_t = self.w_hh.T.dot(delta_t) * (1 - h[bptt_step-1] ** 2)
            """TODO review backprop implementation"""
            
        return dLdin, dLdhh, dLdoutH, dLdout
        #return dLdin, dLdhh, dLdout
    
    def sgd_step(self, inSentence, expSentence, learningRate):
        
        """ Performs a single stochastic gradient step"""
        
        # Calculating gradients
        dLdin, dLdhh, dLdoutH, dLdout = self.backPropTT(inSentence, expSentence)
        #dLdin, dLdhh, dLdout = self.backPropTT(inSentence, expSentence)
        
        # Updating parameters
        self.w_in -= learningRate * dLdin
        self.w_hh -= learningRate * dLdhh
        self.w_outH -= learningRate * dLdoutH
        self.w_out -= learningRate * dLdout
        
    def train_Decoder_With_SGD(self, X_train, Y_train, learningRate = 0.05, nepochs = 200):
        """TODO evaluate losses and update learning rate if required"""
        loss = 100000000000
        for epoch in range(nepochs):
            for i in range(len(Y_train)):
                if len(X_train[i]) == len(Y_train[i]) :
                    self.sgd_step(X_train[i], Y_train[i], learningRate)
            newLoss = self.calculateTotalLoss(X_train, Y_train)
            print epoch, " ", newLoss
            #if newLoss > loss :
            #    break
            #loss = newLoss

In [16]:
""" Word preprocessing """
def dataset(_fi='/home/jazzycrazzy/PythonScripts/dataset.csv', _fo = 'testfile.txt'):
    file_in = open(_fi)
    #file_out = open(_fo,'wb')

    words = [] #stores unique words encountered in the document as WordItem objects
    _dict = {} #temporary dictionary to maintain count of each word
    
    _dict['UNK'] = 0

    for l in file_in:
        #file_out.write(l+'\n')
        l = _start+' '+l+' '+_end
        split = word_tokenize(l.decode('utf-8'))
        for w in split:
            if len(w)==0:
                continue
            elif len(w) > 15: #if word's length is greater than 15 counting it as unknown
                _dict['UNK'] += 1
                continue
            if w not in _dict:
                _dict[w] = 1
            _dict[w] += 1
            
    _vocab = {} #dictionary with words as keys and values as indices of them in 'word' list
    _vocab['UNK'] = len(words)
    words.append(WordItem('UNK',_dict['UNK']))
    for k,v in _dict.iteritems():
        if v > 9 and k != 'UNK':
        #if k != 'UNK':
            _vocab[k] = len(words)
            words.append(WordItem(k,v))
        else:
            words[0].count += 1
    
    #cleaning up unnecessary memory
    del _dict
    file_in.close()
    #file_out.close()
    
    return _vocab, words

def UnigramTable(_vocab, words):
    """ Calculates probabilities based on count of each word present"""
    pow = 0.75
    totalFreqPow = 0.0
    unigramTable = {}
    
    l = [words[i].count**pow for i in range(len(_vocab))]
    totalFreqPow = np.sum(l)
    
    for i in range(len(_vocab)):
        unigramTable[i] = (words[i].count**pow)/totalFreqPow
    
    del l
    return unigramTable

def hotVector(wordIndex,vocabSize):
    """ Returns hot vector representation of a word """
    hVector = np.zeros(vocabSize)
    hVector[wordIndex-1] = 1
    return hVector

def softmax(net):
    """ calculates softmax score - target score normalized with noise scores and calculated as probability"""
    _exp = np.exp(net)
    return _exp/np.sum(_exp)

def sigmoid(net):
    """ Applies sigmoid logistic function on net """
    return 1.0/(1+np.exp(-net))

def randomIdx(k, vocabSize, current):
    """ Returns k indices from with unigram table randomly with respect to each word's probablity """
    global _unigramTable1
    idxs = list(np.random.choice(vocabSize, k+1, False, p = _unigramTable1.values()))
    if current in idxs:
        idxs.remove(current)
    else:
        del idxs[-1]
    return idxs
    
def softmaxCostGradient(net, target):
    prob = softmax(net)
    print(prob)
    
    
def negSamplingCostGradient(out, context, emb, vocabSize, learningRate, W_Output, k = 10):
    
    errorHidden = np.zeros(shape=(emb.size,1))
    
    actOut = sigmoid(out[context])
    negSamples = randomIdx(k, vocabSize, context)
    _negSamples = [-out[sample] for sample in negSamples]
    
    # error for context word
    e = -np.log(actOut) - np.sum(np.log(sigmoid(np.array(_negSamples))))
    
    """ calculating gradients for output vectors for both target and negative samples
    calculating hidden layer error for each context word """
    # Updating output weight vector for context word
    delta = actOut - 1
    errorHidden += delta * W_Output[:,context:context+1]
    W_Output[:,context:context+1] -= learningRate * np.reshape(delta * emb,(emb.size,1))
    
    # Updating output weight vectors for negative sampling
    for sample in negSamples:
        delta = sigmoid(out[sample])
        errorHidden += delta * W_Output[:,sample:sample+1]
        W_Output[:,sample:sample+1] -= learningRate * np.reshape(delta * emb,(emb.size,1))
    
    return errorHidden,e    
    
def skipgram(target,contextWords, vocabSize, learningRate, W_Embedding, W_Output):
    
    """
    will be called on each window with
    target: Target word index
    contextWords: Arrray of integers representing context words
    """
    loss = 0
    k = 10 #Number of negative samples
    emb = W_Embedding[target]
    out = np.matmul(emb,W_Output) # [1 x EmbSize].[EmbSize x VocabSize]
    #print out.shape
    _predicted = []
    EH = np.zeros(shape=(emb.size,1))
    for context in contextWords:
        #predicted = hotVector(context, vocabSize)
        #softmaxCostGradient(out,context)
        _EH,_e = negSamplingCostGradient(out, context, emb, vocabSize, learningRate, W_Output, k)
        EH += _EH
        loss += _e
        #EH += sof
        
    #updating hidden layer input vector embedding
    W_Embedding[target] -= learningRate * EH.T[0]
    return loss

In [14]:
""" Creates word embeddings in vector space representation """

""" Feedforward Neural Net Language model """
#Input layer

#Projection layer

#Hidden layer

#Output layer

#Initialization
fin='/Users/preethikapachaiyappa/Documents/MachineLearning/Data/English-equalLength1.txt'#/home/jazzycrazzy/PythonScripts/dataset.csv'
fin1='/Users/preethikapachaiyappa/Documents/MachineLearning/Data/French-equalLength1.txt'
fout = 'testfile.txt'
fout1 = 'testfile1.txt'
_vocab, words = dataset(fin, fout)
_vocab_f, words_f = dataset(fin1, fout1)
_unigramTable = UnigramTable(_vocab, words)
_unigramTable1 = UnigramTable(_vocab_f, words_f)

learningRate = 0.1
vocabSize = len(words)
vocabSize_f = len(words_f)
emb_size = 10
win_size = 2
target = None
contextWords = []
epoch = 20

print _vocab
print _vocab_f


# No need of hidden layer since when the embedding matrix is multiplied with hot vector 
#it essentially gives that embedding row
W_Embedding = np.random.randn(vocabSize,emb_size) #Embedding matrix
W_Output = np.random.randn(emb_size,vocabSize) #Outputlayer weight matrix Emb_size x Vocab

W_Embedding_f = np.random.randn(vocabSize_f,emb_size) #Embedding matrix
W_Output_f = np.random.randn(emb_size,vocabSize_f) #Outputlayer weight matrix Emb_size x Vocab

for _ in np.arange(epoch):
    
    totalLoss = 0
    loss = 0
    
    fileIn = open(fin)
    for l in fileIn:
        l = _start+' '+l+' '+_end
        tokens = word_tokenize(l.decode('utf-8'))
        #print 'tokens',tokens
        for token in tokens:
            
            loss = 0
            contextWords = []
            
            if token in _vocab:
                target = _vocab[token]
                trgtIdx = tokens.index(token)
                cntxtIdxs = range(trgtIdx-win_size, trgtIdx+win_size+1)
                cntxtIdxs.remove(trgtIdx)
                for idx in cntxtIdxs:
                    #check for first word and last word and use UNK for context words for window where words not available
                    if idx >-1 and idx < len(tokens) and tokens[idx] in _vocab:
                        contextWords = np.append(contextWords, _vocab[tokens[idx]])
                    else:
                        contextWords = np.append(contextWords, _vocab['UNK'])
                loss += skipgram(target, contextWords, vocabSize, learningRate, W_Embedding, W_Output)
        totalLoss += loss
    print 'Total Loss:',totalLoss

{u'limited': 1154, u'four': 1, u'whose': 767, u'crises': 1152, u'poorest': 1155, u'Western': 3, u'under': 1156, u'risk': 1157, u'Iranian': 394, u'regional': 4, u'every': 1159, u'reforms': 768, u'affect': 769, u'bringing': 28, u'1930\u2019s': 5, u'school': 1160, u'companies': 770, u'solution': 771, u'Paul': 6, u'Europeans': 1161, u'force': 772, u'leaders': 1162, u'direct': 1163, u'budget': 264, u'second': 7, u'even': 774, u'established': 813, u'decisions': 1358, u'contributed': 8, u'above': 396, u'Yeltsin': 819, u'new': 775, u'increasing': 53, u'ever': 776, u'told': 397, u'incentives': 134, u'men': 56, u'here': 9, u'hundreds': 828, u'protection': 398, u'active': 779, u'path': 833, u'100': 780, u'study': 399, u'changed': 1164, u'Hamas': 10, u'credit': 782, u'military': 11, u'changes': 1165, u'secure': 400, u'campaign': 784, u'Rather': 785, u'highly': 401, u'brought': 13, u'total': 402, u'would': 1166, u'army': 786, u'China\u2019s': 403, u'voters': 645, u'negative': 404, u'call': 787, u'a



Total Loss: 37472.3951729
Total Loss: 33952.3828929
Total Loss: 34079.606341
Total Loss: 34181.0310067
Total Loss: 34241.8576626
Total Loss: 34040.1619295
Total Loss: 34256.56869
Total Loss: 34312.5718729
Total Loss: 34115.0493199
Total Loss: 34136.7114393
Total Loss: 34189.9331039
Total Loss: 34240.7381353
Total Loss: 34201.6724879
Total Loss: 34242.018254
Total Loss: 34091.8234161
Total Loss: 34080.5613835
Total Loss: 34244.2514971
Total Loss: 34308.7619727
Total Loss: 34206.0111802
Total Loss: 34258.6154221


In [17]:
contextWords = []
    
for _ in np.arange(epoch):
    
    totalLoss = 0
    loss = 0
    
    fileIn = open(fin1)
    for l in fileIn:
        l = _start+' '+l+' '+_end
        tokens = word_tokenize(l.decode('utf-8'))
        #print 'tokens',tokens
        for token in tokens:
            
            loss = 0
            contextWords = []
            
            if token in _vocab_f:
                target = _vocab_f[token]
                trgtIdx = tokens.index(token)
                cntxtIdxs = range(trgtIdx-win_size, trgtIdx+win_size+1)
                cntxtIdxs.remove(trgtIdx)
                for idx in cntxtIdxs:
                    #check for first word and last word and use UNK for context words for window where words not available
                    if idx >-1 and idx < len(tokens) and tokens[idx] in _vocab_f:
                        contextWords = np.append(contextWords, _vocab_f[tokens[idx]])
                    else:
                        contextWords = np.append(contextWords, _vocab_f['UNK'])
                loss += skipgram(target, contextWords, vocabSize_f, learningRate, W_Embedding_f, W_Output_f)
        totalLoss += loss
    print 'Total Loss:',totalLoss
                

print(W_Embedding_f)

idx = _vocab_f[_start]



Total Loss: 35995.6176593
Total Loss: 33123.2675223
Total Loss: 33207.6133279
Total Loss: 33075.6164044
Total Loss: 33446.590819
Total Loss: 33200.581148
Total Loss: 33237.2641109
Total Loss: 33067.18647
Total Loss: 33262.5368821
Total Loss: 33205.10318
Total Loss: 33428.5308258
Total Loss: 33320.2787369
Total Loss: 33252.7893135
Total Loss: 33175.4004516
Total Loss: 33211.1464751
Total Loss: 33311.5987104
Total Loss: 33381.1454684
Total Loss: 33284.8474257
Total Loss: 33268.6784086
Total Loss: 33157.5456106
[[ 0.75156866  0.2517222  -0.68399158 ...,  0.92442616  0.60871534
  -0.09872267]
 [-0.67502403 -1.50688523  0.64144321 ...,  0.03777165  0.5064405
   0.60732242]
 [ 0.12958977  0.20589492  1.04009858 ..., -0.8653133  -0.35369848
  -0.52664951]
 ..., 
 [-0.0231531   1.04862207  0.30375387 ..., -1.05858505 -0.15990741
  -0.05112539]
 [-0.0831807   1.36102096  0.89386512 ...,  0.30678219  1.15645574
  -0.30025071]
 [-0.17060401 -0.02922701  0.12065742 ..., -0.74238374 -0.576227
  -0.

In [None]:
#mean_list = W_Embedding.mean(0)
#print mean_list

#W_Embedding_new = W_Embedding - mean_list
#print W_Embedding_new

In [26]:
inSentence = []
expSentence = []

fileIn0 = open(fin)
for l in fileIn0 :
    #l = _start+' '+l+' '+_end
    tokens = word_tokenize(l.decode('utf-8'))
    inSent = []
    for token in tokens :
        target = ""
        if token not in _vocab : 
            target = _vocab['UNK']
        else : 
            target = _vocab[token]
        vec = W_Embedding[target]
        vec_list = vec.tolist()
        inSent.append(vec_list)
    inSentence.append(inSent)

fileIn1 = open(fin1)
for l in fileIn1 :
    #l = _start+' '+l+' '+_end
    tokens = word_tokenize(l.decode('utf-8'))
    expSent = []
    for token in tokens :
        target = ""
        if token not in _vocab_f : 
            target = _vocab_f['UNK']
        else : 
            target = _vocab_f[token]
        expSent.append(target)
    expSentence.append(expSent)

#print inSentence
#print expSentence
        
a = RNNlayer(10,vocabSize_f,W_Embedding_f,idx)
a.train_Decoder_With_SGD(inSentence, expSentence, 0.1, 25)


0   123745.986541
1   125267.999661
2   129562.649855
3   128628.700571
4   126392.519699
5   128751.585344
6   126271.729103
7   127132.250987
8   127833.522475
9   126623.776688
10   131706.67726
11   126457.866613
12   124988.975815
13   123397.555218
14   132079.42921
15   128461.236529
16   125680.370529
17   125065.295053
18   122556.686152
19   128573.841962
20   125839.119875
21   127053.354018
22   126303.789215
23   129499.81539
24   131367.347243


In [34]:
inSentence = []
input = "American Parliament"
#target = _vocab[input]
#vec = W_Embedding_new[target]
#inSentence.append(vec)
tokens = word_tokenize(input.decode('utf-8'))
inSent = []
for token in tokens :
    target = _vocab[token]
    vec = W_Embedding[target]
    vec_list = vec.tolist()
    inSent.append(vec_list)
inSentence.append(inSent)
print inSentence

o,h = a.forwardProp(inSentence[0],None)
print o
words1 = o.argmax(axis=1)
print words1
for i in range(len(words1)) :
    print words_f[words1[i]].word

[[[-0.023195924361487638, -0.14407989065954266, 0.49097463076905157, -0.43153845176902067, 0.9646324114327803, -0.22700313499949196, 0.3492607853411439, 0.494048916612572, 0.32866152141974336, -0.5031084153348868], [0.737518627046467, -0.1031075717148144, -0.2667072526006662, -0.6326892444716452, 0.8513651367537206, -0.19226917849216824, 0.6697243635830863, 0.9355203332930365, 0.6054522914283671, -0.8305105622508467]]]
[[  1.98514423e-01   4.71606115e-04   3.05549984e-04 ...,   3.62127774e-05
    1.03240760e-04   3.62231086e-05]
 [  1.36222423e-01   6.10040712e-06   1.42902140e-04 ...,   4.02458211e-05
    4.04570752e-05   1.61371244e-04]]
[  0 372]
UNK
de
