In [54]:
""" Imports """
import re
from nltk.tokenize import word_tokenize, sent_tokenize
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

"""Global definitons"""
_start = 'S_START'
_end = 'S_END'

In [55]:
""" util definitions"""

def hyperbolic(net):
    return np.tanh(net)

def relu(net):
    return np.maximum(0,net)

def softmax(net):
    _exp = np.exp(net)
    return _exp/np.sum(_exp)

def predict(scores):
    return np.argmax(scores)

In [56]:
class WordItem:
    def __init__(self,word,count=0):
        self.word = word
        self.count = count

In [57]:
class RNNlayer:
    
    """ 
    RNN nodes for decoder
    
    hidden state at time step t of decoder is conditioned on hidden state at time step t-1,
    output at time step t-1 and input at time step t
    """
    
    def __init__(self, inputSize, outputSize, bptt_truncate = 5, hiddenDim = 10):
        """
        inputSize = dimensions of the input embedding 
        outputSize = vocabulary size
        hiddenDim = size of the hidden unit in RNN
        bptt_truncate = truncate the number of time steps we calculate the gradient during backpropagation
        """
        self.inputSize = inputSize
        self.outputSize = outputSize
        self.hiddenDim = hiddenDim
        self. bptt_truncate = bptt_truncate
        
        self.w_in = np.random.uniform(-np.sqrt(1./inputSize), np.sqrt(1./inputSize),(hiddenDim, inputSize))
        self.w_hh = np.random.uniform(-np.sqrt(1./hiddenDim), np.sqrt(1./hiddenDim),(hiddenDim, hiddenDim))
        #self.w_outH = np.random.uniform(-np.sqrt(1./hiddenDim), np.sqrt(1./hiddenDim),(outputSize, hiddenDim))
        self.w_out = np.random.uniform(-np.sqrt(1./hiddenDim), np.sqrt(1./hiddenDim),(outputSize, hiddenDim))
        
    def forwardProp(self, inSentence, expSent):
        """
        inSentence: word indices in input language vocabulary
        expSent: word indices in target language vocabulary
        """
        
        #Total number of time steps equal to number of words in the sentence
        T = len(inSentence)
        
        #Saving all hidden states and outputs during forward propagation
        _h = np.zeros((T,self.hiddenDim))
        _o = np.zeros((T,self.outputSize))
        
        #Initializing initial output as the start token
        #_o[-1] = 
        
        #For each time step calculating hidden state and output
        for t in np.arange(T):
            #outIdx = predict(_o[t-1])
            _h[t] = hyperbolic(self.w_in.dot(inSentence[t]) + self.w_hh.dot(_h[t-1])) #+ self.w_outH[:,outIdx:outIdx+1])
            _o[t] = softmax(self.w_out.dot(_h[t]))
            
        return _o, _h
    
    def calculateLoss(self, inSentence, expSentence):
        
        #For each sentence
        o, h = self.forwardProp(inSentence, expSentence)
        #TODO recheck this part
        correctPred = o[np.arange(len(expSentence)), expSentence]
        #Loss for each sentence
        l = -1 * np.sum(np.log(correctPred))
        return l
    
    def calculateTotalLoss(self, inSentence, expSentences):
        
        L = 0.0
        for i in range(len(inSentence)):
            if len(inSentence[i]) == len(expSentences[i]) :
                L += self.calculateLoss(inSentence[i], expSentences[i])
            
        return L
    
    def backPropTT(self, inSentence, expSentence):
        
        # Total number of time steps equal to number of words in the sentence
        T = len(expSentence)
        
        # Performing forward propagation
        o, h = self.forwardProp(inSentence, expSentence)
        
        # Defining gradient variables
        dLdin = np.zeros(self.w_in.shape)
        dLdhh = np.zeros(self.w_hh.shape)
        #dLdoutH = np.zeros(self.w_outH.shape)
        dLdout = np.zeros(self.w_out.shape)
        
        # Calculating the difference between output and actual output
        delta_o = o
        delta_o[np.arange(T), expSentence] -= 1
        #print 'delta_o', delta_o
        
        # Calculating gradients backwards through time
        for t in np.arange(T)[::-1]:
            #Output gradient is only dependent on time step t
            dLdout += np.outer(delta_o[t], h[t])
            
            # Initial delta calculation propagating gradients from output
            delta_t = self.w_out.T.dot(delta_o[t]) * (1 - (h[t] ** 2))
            
            # Backpropagation through time (for at most self.bptt_truncate steps)
            for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
                # print "Backpropagation step t=%d bptt step=%d " % (t, bptt_step)
                # Add to gradients at each previous step
                dLdhh += np.outer(delta_t, h[bptt_step-1])              
                dLdin += np.outer(delta_t, inSentence[bptt_step-1])
                #dLdoutH += np.outer(delta_t, o[bptt_step-1])
                # Update delta for next step dL/dz at t-1
                delta_t = self.w_hh.T.dot(delta_t) * (1 - h[bptt_step-1] ** 2)
            """TODO review backprop implementation"""
            
        #return dLdin, dLdhh, dLdoutH, dLdout
        return dLdin, dLdhh, dLdout
    
    def sgd_step(self, inSentence, expSentence, learningRate):
        
        """ Performs a single stochastic gradient step"""
        
        # Calculating gradients
        #dLdin, dLdhh, dLdoutH, dLdout = self.backPropTT(inSentence, expSentence)
        dLdin, dLdhh, dLdout = self.backPropTT(inSentence, expSentence)
        
        # Updating parameters
        self.w_in -= learningRate * dLdin
        self.w_hh -= learningRate * dLdhh
        #self.w_outH -= learningRate * dLdoutH
        self.w_out -= learningRate * dLdout
        
    def train_Decoder_With_SGD(self, X_train, Y_train, learningRate = 0.05, nepochs = 200):
        """TODO evaluate losses and update learning rate if required"""
        loss = 100000000000
        for epoch in range(nepochs):
            for i in range(len(Y_train)):
                if len(X_train[i]) == len(Y_train[i]) :
                    self.sgd_step(X_train[i], Y_train[i], learningRate)
            newLoss = self.calculateTotalLoss(X_train, Y_train)
            print epoch, " ", newLoss
            if newLoss > loss :
                break
            loss = newLoss

In [58]:
""" Word preprocessing """
def dataset(_fi='/home/jazzycrazzy/PythonScripts/dataset.csv', _fo = 'testfile.txt'):
    file_in = open(_fi)
    #file_out = open(_fo,'wb')

    words = [] #stores unique words encountered in the document as WordItem objects
    _dict = {} #temporary dictionary to maintain count of each word
    
    _dict['UNK'] = 0

    for l in file_in:
        #file_out.write(l+'\n')
        l = _start+' '+l+' '+_end
        split = word_tokenize(l.decode('utf-8'))
        for w in split:
            if len(w)==0:
                continue
            elif len(w) > 15: #if word's length is greater than 15 counting it as unknown
                _dict['UNK'] += 1
                continue
            if w not in _dict:
                _dict[w] = 1
            _dict[w] += 1
            
    _vocab = {} #dictionary with words as keys and values as indices of them in 'word' list
    _vocab['UNK'] = len(words)
    words.append(WordItem('UNK',_dict['UNK']))
    for k,v in _dict.iteritems():
        #if v > 9 and k != 'UNK':
        if k != 'UNK':
            _vocab[k] = len(words)
            words.append(WordItem(k,v))
        else:
            words[0].count += 1
    
    #cleaning up unnecessary memory
    del _dict
    file_in.close()
    #file_out.close()
    
    return _vocab, words

def UnigramTable(_vocab, words):
    """ Calculates probabilities based on count of each word present"""
    pow = 0.75
    totalFreqPow = 0.0
    unigramTable = {}
    
    l = [words[i].count**pow for i in range(len(_vocab))]
    totalFreqPow = np.sum(l)
    
    for i in range(len(_vocab)):
        unigramTable[i] = (words[i].count**pow)/totalFreqPow
    
    del l
    return unigramTable

def hotVector(wordIndex,vocabSize):
    """ Returns hot vector representation of a word """
    hVector = np.zeros(vocabSize)
    hVector[wordIndex-1] = 1
    return hVector

def softmax(net):
    """ calculates softmax score - target score normalized with noise scores and calculated as probability"""
    _exp = np.exp(net)
    return _exp/np.sum(_exp)

def sigmoid(net):
    """ Applies sigmoid logistic function on net """
    return 1.0/(1+np.exp(-net))

def randomIdx(k, vocabSize, current):
    """ Returns k indices from with unigram table randomly with respect to each word's probablity """
    global _unigramTable
    idxs = list(np.random.choice(vocabSize, k+1, False, p = _unigramTable.values()))
    if current in idxs:
        idxs.remove(current)
    else:
        del idxs[-1]
    return idxs
    
def softmaxCostGradient(net, target):
    prob = softmax(net)
    print(prob)
    
    
def negSamplingCostGradient(out, context, emb, vocabSize, learningRate, W_Output, k = 10):
    
    errorHidden = np.zeros(shape=(emb.size,1))
    
    actOut = sigmoid(out[context])
    negSamples = randomIdx(k, vocabSize, context)
    _negSamples = [-out[sample] for sample in negSamples]
    
    # error for context word
    e = -np.log(actOut) - np.sum(np.log(sigmoid(np.array(_negSamples))))
    
    """ calculating gradients for output vectors for both target and negative samples
    calculating hidden layer error for each context word """
    # Updating output weight vector for context word
    delta = actOut - 1
    errorHidden += delta * W_Output[:,context:context+1]
    W_Output[:,context:context+1] -= learningRate * np.reshape(delta * emb,(emb.size,1))
    
    # Updating output weight vectors for negative sampling
    for sample in negSamples:
        delta = sigmoid(out[sample])
        errorHidden += delta * W_Output[:,sample:sample+1]
        W_Output[:,sample:sample+1] -= learningRate * np.reshape(delta * emb,(emb.size,1))
    
    return errorHidden,e    
    
def skipgram(target,contextWords, vocabSize, learningRate, W_Embedding, W_Output):
    
    """
    will be called on each window with
    target: Target word index
    contextWords: Arrray of integers representing context words
    """
    loss = 0
    k = 10 #Number of negative samples
    emb = W_Embedding[target]
    out = np.matmul(emb,W_Output) # [1 x EmbSize].[EmbSize x VocabSize]
    #print out.shape
    _predicted = []
    EH = np.zeros(shape=(emb.size,1))
    for context in contextWords:
        #predicted = hotVector(context, vocabSize)
        #softmaxCostGradient(out,context)
        _EH,_e = negSamplingCostGradient(out, context, emb, vocabSize, learningRate, W_Output, k)
        EH += _EH
        loss += _e
        #EH += sof
        
    #updating hidden layer input vector embedding
    W_Embedding[target] -= learningRate * EH.T[0]
    return loss

In [59]:
""" Creates word embeddings in vector space representation """

""" Feedforward Neural Net Language model """
#Input layer

#Projection layer

#Hidden layer

#Output layer

#Initialization
fin='/Users/preethikapachaiyappa/Documents/MachineLearning/Data/English-small.txt'#/home/jazzycrazzy/PythonScripts/dataset.csv'
fin1='/Users/preethikapachaiyappa/Documents/MachineLearning/Data/French-small.txt'
fout = 'testfile.txt'
fout1 = 'testfile1.txt'
_vocab, words = dataset(fin, fout)
_vocab_f, words_f = dataset(fin1, fout1)
_unigramTable = UnigramTable(_vocab, words)

learningRate = 0.1
vocabSize = len(words)
vocabSize_f = len(words_f)
emb_size = 10
win_size = 2
target = None
contextWords = []
epoch = 20

print _vocab
print _vocab_f


# No need of hidden layer since when the embedding matrix is multiplied with hot vector 
#it essentially gives that embedding row
W_Embedding = np.random.randn(vocabSize,emb_size) #Embedding matrix
W_Output = np.random.randn(emb_size,vocabSize) #Outputlayer weight matrix Emb_size x Vocab

for _ in np.arange(epoch):
    
    totalLoss = 0
    loss = 0
    
    fileIn = open(fin)
    for l in fileIn:
        l = _start+' '+l+' '+_end
        tokens = word_tokenize(l.decode('utf-8'))
        #print 'tokens',tokens
        for token in tokens:
            
            loss = 0
            contextWords = []
            
            if token in _vocab:
                target = _vocab[token]
                trgtIdx = tokens.index(token)
                cntxtIdxs = range(trgtIdx-win_size, trgtIdx+win_size+1)
                cntxtIdxs.remove(trgtIdx)
                for idx in cntxtIdxs:
                    #check for first word and last word and use UNK for context words for window where words not available
                    if idx >-1 and idx < len(tokens) and tokens[idx] in _vocab:
                        contextWords = np.append(contextWords, _vocab[tokens[idx]])
                    else:
                        contextWords = np.append(contextWords, _vocab['UNK'])
                loss += skipgram(target, contextWords, vocabSize, learningRate, W_Embedding, W_Output)
        totalLoss += loss
    print 'Total Loss:',totalLoss
                

print(W_Embedding)

{u'enjoy': 1, u'S_END': 34, u'have': 3, u'tired': 4, u'ran': 5, u'is': 6, u'am': 8, u'How': 10, u'see': 11, u'at': 12, u'want': 13, u'in': 7, u'go': 15, u'your': 17, u'speak': 18, u'are': 65, u'what': 20, u'her': 21, u'how': 26, u'sun': 23, u'friends': 49, u'day': 50, u'graduate': 16, u'write': 19, u'to': 22, u'of': 61, u'enjoys': 28, u'has': 30, u'beach': 31, u'?': 32, u'she': 39, u'dad': 2, u'be': 35, u'we': 36, u'good': 37, u'tomorrow': 24, u'read': 9, u'student': 41, u'birth': 46, u'here': 42, u'every': 44, u'food': 45, u'mom': 25, u'date': 47, u'president': 48, 'UNK': 0, u'come': 27, u'you': 29, u'died': 51, u'he': 52, u'me': 43, u'boy': 54, u'store': 60, u'I': 38, u'name': 55, u'shop': 56, u'did': 57, u'S_START': 58, u'work': 59, u'a': 53, u'can': 33, u'night': 40, u'the': 62, u'nice': 63, u'where': 64, u'left': 14}
{u'comment': 1, u'votre': 2, u'peux': 3, u'aller': 4, u'dipl\xf4m\xe9': 5, u'La': 6, u'appelez': 7, u'venue': 66, u'naissance': 9, u'allez': 10, u'peut': 11, u'au': 1



Total Loss: 548.577064146
Total Loss: 377.797238059
Total Loss: 388.049088995
Total Loss: 372.10636965
Total Loss: 385.26746931
Total Loss: 401.391704975
Total Loss: 385.30087946
Total Loss: 399.192853474
Total Loss: 381.443801897
Total Loss: 385.430852802
Total Loss: 398.984780239
Total Loss: 397.283081357
Total Loss: 398.250030996
Total Loss: 393.6448626
Total Loss: 404.114884112
Total Loss: 388.503000725
Total Loss: 386.54853421
Total Loss: 385.404129374
Total Loss: 392.885982835
Total Loss: 401.044350713
[[ -1.02463943e+00  -1.00421919e+00  -2.95574953e-02   7.67953492e-01
   -1.30218626e+00  -8.61392017e-01  -8.64616979e-01   1.54902026e+00
    7.80616089e-01   1.05430431e-01]
 [ -1.86139489e+00   9.10745822e-02   6.38293466e-01  -2.52006939e+00
    2.45838195e-01  -3.76749787e-01  -7.50167314e-01   9.59485177e-01
    4.09441406e-01  -9.95104751e-01]
 [ -2.63922584e-01   2.83264421e-01   4.05438001e-01  -4.50828581e-01
   -1.20673584e-01  -7.97935303e-01   1.14141033e+00  -1.84813

In [60]:
mean_list = W_Embedding.mean(0)
print mean_list

W_Embedding_new = W_Embedding - mean_list
print W_Embedding_new

[-0.77099557  0.40361636  0.46337849 -0.82341657 -0.4389885   0.31120532
  0.69379779 -0.12915396  0.07898444 -1.37713765]
[[ -2.53643865e-01  -1.40783556e+00  -4.92935987e-01   1.59137006e+00
   -8.63197760e-01  -1.17259733e+00  -1.55841477e+00   1.67817423e+00
    7.01631647e-01   1.48256808e+00]
 [ -1.09039933e+00  -3.12541783e-01   1.74914974e-01  -1.69665282e+00
    6.84826695e-01  -6.87955103e-01  -1.44396511e+00   1.08863914e+00
    3.30456964e-01   3.82032901e-01]
 [  5.07072983e-01  -1.20351944e-01  -5.79404910e-02   3.72587986e-01
    3.18314916e-01  -1.10914062e+00   4.47612534e-01  -5.56595458e-02
   -1.18909515e+00  -1.87722622e-01]
 [ -5.66171148e-01  -1.88970111e+00  -9.61587385e-01   2.40865495e-01
    2.76767437e-04  -4.19112658e-01   1.81969814e+00  -1.02707883e-01
    5.02766106e-01  -5.83115903e-01]
 [  5.49182207e-01   3.63308929e-01   1.27785678e+00   3.09481425e-02
   -5.36646461e-01  -4.22279582e-03  -3.25361627e-01  -4.72313987e-01
    1.41090829e+00   4.855913

In [61]:
inSentence = []
expSentence = []

fileIn0 = open(fin)
for l in fileIn0 :
    #l = _start+' '+l+' '+_end
    tokens = word_tokenize(l.decode('utf-8'))
    inSent = []
    for token in tokens :
        target = ""
        if token not in _vocab : 
            target = _vocab['UNK']
        else : 
            target = _vocab[token]
        vec = W_Embedding_new[target]
        vec_list = vec.tolist()
        inSent.append(vec_list)
    inSentence.append(inSent)

fileIn1 = open(fin1)
for l in fileIn1 :
    #l = _start+' '+l+' '+_end
    tokens = word_tokenize(l.decode('utf-8'))
    expSent = []
    for token in tokens :
        target = ""
        if token not in _vocab_f : 
            target = _vocab_f['UNK']
        else : 
            target = _vocab_f[token]
        expSent.append(target)
    expSentence.append(expSent)

#print inSentence
#print expSentence
        
a = RNNlayer(10,vocabSize_f)
a.train_Decoder_With_SGD(inSentence, expSentence, 0.1, 25)



0   676.343121879
1   559.678232753
2   466.106003846
3   403.321027958
4   363.39722914
5   337.228165175
6   303.39501963
7   291.722530273
8   280.906331054
9   260.910654161
10   242.253290972
11   224.447399452
12   251.125510319


In [68]:
inSentence = []
input = "dad has left"
#target = _vocab[input]
#vec = W_Embedding_new[target]
#inSentence.append(vec)
tokens = word_tokenize(input.decode('utf-8'))
inSent = []
for token in tokens :
    target = _vocab[token]
    vec = W_Embedding_new[target]
    vec_list = vec.tolist()
    inSent.append(vec_list)
inSentence.append(inSent)
print inSentence

o,h = a.forwardProp(inSentence[0],None)
print o
words1 = o.argmax(axis=1)
for i in range(len(words1)) :
    print words_f[words1[i]].word

[[[0.5070729833579013, -0.12035194421675688, -0.05794049099940912, 0.37258798584359093, 0.3183149159821348, -1.1091406192035431, 0.44761253365321696, -0.05565954576060961, -1.1890951527451588, -0.18772262211994373], [0.17297527561999215, 0.2856855845545315, -0.07538509154828271, 0.9250753737318159, 1.04194390939685, 0.6311654301418484, 1.4391045818256725, 0.0964575018490263, -0.6082232638144489, 0.2097407502281834], [0.9954792761758973, 0.6970142998677618, 0.05406795835104378, 0.769962601255683, -0.9664534127529022, -0.39047834148867144, 0.720218732148162, -0.31330909170275634, -0.561919700032848, 0.4569521874179211]]]
[[  2.00794550e-04   1.67446555e-05   9.71852397e-04   1.26703128e-03
    9.32719924e-05   3.34391420e-04   1.37314300e-03   1.82754577e-04
    2.28267731e-03   1.25485734e-04   9.75763515e-05   3.01249722e-03
    5.47627321e-05   1.41111660e-04   2.68871825e-04   4.50327984e-04
    3.69132929e-04   1.67834087e-03   3.85446407e-04   2.20911894e-02
    8.38569850e-04   2.