In [1]:
""" Imports """
import re
from nltk.tokenize import word_tokenize, sent_tokenize
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

"""Global definitons"""
_start = 'S_START'
_end = 'S_END'

In [2]:
""" util definitions"""

def hyperbolic(net):
    return np.tanh(net)

def relu(net):
    return np.maximum(0,net)

def softmax(net):
    _exp = np.exp(net)
    return _exp/np.sum(_exp)

def predict(scores):
    return np.argmax(scores)

In [3]:
class WordItem:
    def __init__(self,word,count=0):
        self.word = word
        self.count = count

In [22]:
class RNNlayer:
    
    """ 
    RNN nodes for decoder
    
    hidden state at time step t of decoder is conditioned on hidden state at time step t-1,
    output at time step t-1 and input at time step t
    """
    
    def __init__(self, inputSize, outputSize, W_Embedding_french, idx, bptt_truncate = 5, hiddenDim = 10):
        """
        inputSize = dimensions of the input embedding 
        outputSize = vocabulary size
        hiddenDim = size of the hidden unit in RNN
        bptt_truncate = truncate the number of time steps we calculate the gradient during backpropagation
        """
        self.inputSize = inputSize
        self.outputSize = outputSize
        self.W_Embedding_french = W_Embedding_french
        self.hiddenDim = hiddenDim
        self. bptt_truncate = bptt_truncate
        self.idx = idx;
        
        self.w_in = np.random.uniform(-np.sqrt(1./inputSize), np.sqrt(1./inputSize),(hiddenDim, inputSize))
        self.w_hh = np.random.uniform(-np.sqrt(1./hiddenDim), np.sqrt(1./hiddenDim),(hiddenDim, hiddenDim))
        self.w_outH = np.random.uniform(-np.sqrt(1./hiddenDim), np.sqrt(1./hiddenDim),(inputSize, hiddenDim))
        self.w_out = np.random.uniform(-np.sqrt(1./hiddenDim), np.sqrt(1./hiddenDim),(outputSize, hiddenDim))
        
    def forwardProp(self, inSentence, expSent):
        """
        inSentence: word indices in input language vocabulary
        expSent: word indices in target language vocabulary
        """
        
        #Total number of time steps equal to number of words in the sentence
        T = len(inSentence)
        
        #Saving all hidden states and outputs during forward propagation
        _h = np.zeros((T,self.hiddenDim))
        _o = np.zeros((T,self.outputSize))
        
        #Initializing initial output as the start token
        _o[-1] = np.zeros(self.outputSize)
        _o[-1][idx] = 1;
        
        #For each time step calculating hidden state and output
        for t in np.arange(T):
            outIdx = predict(_o[t-1])
            _h[t] = hyperbolic(self.w_in.dot(inSentence[t]) + self.w_hh.dot(_h[t-1]) + self.w_outH.dot(self.W_Embedding_french[outIdx]))
            _o[t] = softmax(self.w_out.dot(_h[t]))
            
        return _o, _h
    
    def calculateLoss(self, inSentence, expSentence):
        
        #For each sentence
        o, h = self.forwardProp(inSentence, expSentence)
        #TODO recheck this part
        correctPred = o[np.arange(len(expSentence)), expSentence]
        #Loss for each sentence
        l = -1 * np.sum(np.log(correctPred))
        return l
    
    def calculateTotalLoss(self, inSentence, expSentences):
        
        L = 0.0
        for i in range(len(inSentence)):
            if len(inSentence[i]) == len(expSentences[i]) :
                L += self.calculateLoss(inSentence[i], expSentences[i])
            
        return L
    
    def backPropTT(self, inSentence, expSentence):
        
        # Total number of time steps equal to number of words in the sentence
        T = len(expSentence)
        
        # Performing forward propagation
        o, h = self.forwardProp(inSentence, expSentence)
        
        # Defining gradient variables
        dLdin = np.zeros(self.w_in.shape)
        dLdhh = np.zeros(self.w_hh.shape)
        dLdoutH = np.zeros(self.w_outH.shape)
        dLdout = np.zeros(self.w_out.shape)
        
        # Calculating the difference between output and actual output
        delta_o = o
        delta_o[np.arange(T), expSentence] -= 1
        #print 'delta_o', delta_o
        
        # Calculating gradients backwards through time
        for t in np.arange(T)[::-1]:
            #Output gradient is only dependent on time step t
            dLdout += np.outer(delta_o[t], h[t])
            
            # Initial delta calculation propagating gradients from output
            delta_t = self.w_out.T.dot(delta_o[t]) * (1 - (h[t] ** 2))
            
            # Backpropagation through time (for at most self.bptt_truncate steps)
            for bptt_step in np.arange(max(0, t-self.bptt_truncate), t+1)[::-1]:
                # print "Backpropagation step t=%d bptt step=%d " % (t, bptt_step)
                # Add to gradients at each previous step
                dLdhh += np.outer(delta_t, h[bptt_step-1])              
                dLdin += np.outer(delta_t, inSentence[bptt_step-1])
                dLdoutH += np.outer(delta_t, self.W_Embedding_french[predict(o[t-1])])
                # Update delta for next step dL/dz at t-1
                delta_t = self.w_hh.T.dot(delta_t) * (1 - h[bptt_step-1] ** 2)
            """TODO review backprop implementation"""
            
        return dLdin, dLdhh, dLdoutH, dLdout
        #return dLdin, dLdhh, dLdout
    
    def sgd_step(self, inSentence, expSentence, learningRate):
        
        """ Performs a single stochastic gradient step"""
        
        # Calculating gradients
        dLdin, dLdhh, dLdoutH, dLdout = self.backPropTT(inSentence, expSentence)
        #dLdin, dLdhh, dLdout = self.backPropTT(inSentence, expSentence)
        
        # Updating parameters
        self.w_in -= learningRate * dLdin
        self.w_hh -= learningRate * dLdhh
        self.w_outH -= learningRate * dLdoutH
        self.w_out -= learningRate * dLdout
        
    def train_Decoder_With_SGD(self, X_train, Y_train, learningRate = 0.05, nepochs = 200):
        """TODO evaluate losses and update learning rate if required"""
        loss = 100000000000
        for epoch in range(nepochs):
            for i in range(len(Y_train)):
                if len(X_train[i]) == len(Y_train[i]) :
                    self.sgd_step(X_train[i], Y_train[i], learningRate)
            newLoss = self.calculateTotalLoss(X_train, Y_train)
            print epoch, " ", newLoss
            #if newLoss > loss :
            #    break
            #loss = newLoss

In [73]:
""" Word preprocessing """
def dataset(_fi='/home/jazzycrazzy/PythonScripts/dataset.csv', _fo = 'testfile.txt'):
    file_in = open(_fi)
    #file_out = open(_fo,'wb')

    words = [] #stores unique words encountered in the document as WordItem objects
    _dict = {} #temporary dictionary to maintain count of each word
    
    _dict['UNK'] = 0

    for l in file_in:
        #file_out.write(l+'\n')
        l = _start+' '+l+' '+_end
        split = word_tokenize(l.decode('utf-8'))
        for w in split:
            if len(w)==0:
                continue
            elif len(w) > 15: #if word's length is greater than 15 counting it as unknown
                _dict['UNK'] += 1
                continue
            if w not in _dict:
                _dict[w] = 1
            _dict[w] += 1
            
    _vocab = {} #dictionary with words as keys and values as indices of them in 'word' list
    _vocab['UNK'] = len(words)
    words.append(WordItem('UNK',_dict['UNK']))
    for k,v in _dict.iteritems():
        if v > 9 and k != 'UNK':
        #if k != 'UNK':
            _vocab[k] = len(words)
            words.append(WordItem(k,v))
        else:
            words[0].count += 1
    
    #cleaning up unnecessary memory
    del _dict
    file_in.close()
    #file_out.close()
    
    return _vocab, words

def UnigramTable(_vocab, words):
    """ Calculates probabilities based on count of each word present"""
    pow = 0.75
    totalFreqPow = 0.0
    unigramTable = {}
    
    l = [words[i].count**pow for i in range(len(_vocab))]
    totalFreqPow = np.sum(l)
    
    for i in range(len(_vocab)):
        unigramTable[i] = (words[i].count**pow)/totalFreqPow
    
    del l
    return unigramTable

def hotVector(wordIndex,vocabSize):
    """ Returns hot vector representation of a word """
    hVector = np.zeros(vocabSize)
    hVector[wordIndex-1] = 1
    return hVector

def softmax(net):
    """ calculates softmax score - target score normalized with noise scores and calculated as probability"""
    _exp = np.exp(net)
    return _exp/np.sum(_exp)

def sigmoid(net):
    """ Applies sigmoid logistic function on net """
    return 1.0/(1+np.exp(-net))

def randomIdx(k, vocabSize, current):
    """ Returns k indices from with unigram table randomly with respect to each word's probablity """
    global _unigramTable1
    idxs = list(np.random.choice(vocabSize, k+1, False, p = _unigramTable1.values()))
    if current in idxs:
        idxs.remove(current)
    else:
        del idxs[-1]
    return idxs
    
def softmaxCostGradient(net, target):
    prob = softmax(net)
    print(prob)
    
    
def negSamplingCostGradient(out, context, emb, vocabSize, learningRate, W_Output, k = 10):
    
    errorHidden = np.zeros(shape=(emb.size,1))
    
    actOut = sigmoid(out[context])
    negSamples = randomIdx(k, vocabSize, context)
    _negSamples = [-out[sample] for sample in negSamples]
    
    # error for context word
    e = -np.log(actOut) - np.sum(np.log(sigmoid(np.array(_negSamples))))
    
    """ calculating gradients for output vectors for both target and negative samples
    calculating hidden layer error for each context word """
    # Updating output weight vector for context word
    delta = actOut - 1
    errorHidden += delta * W_Output[:,context:context+1]
    W_Output[:,context:context+1] -= learningRate * np.reshape(delta * emb,(emb.size,1))
    
    # Updating output weight vectors for negative sampling
    for sample in negSamples:
        delta = sigmoid(out[sample])
        errorHidden += delta * W_Output[:,sample:sample+1]
        W_Output[:,sample:sample+1] -= learningRate * np.reshape(delta * emb,(emb.size,1))
    
    return errorHidden,e    
    
def skipgram(target,contextWords, vocabSize, learningRate, W_Embedding, W_Output):
    
    """
    will be called on each window with
    target: Target word index
    contextWords: Arrray of integers representing context words
    """
    loss = 0
    k = 10 #Number of negative samples
    emb = W_Embedding[target]
    out = np.matmul(emb,W_Output) # [1 x EmbSize].[EmbSize x VocabSize]
    #print out.shape
    _predicted = []
    EH = np.zeros(shape=(emb.size,1))
    for context in contextWords:
        #predicted = hotVector(context, vocabSize)
        #softmaxCostGradient(out,context)
        _EH,_e = negSamplingCostGradient(out, context, emb, vocabSize, learningRate, W_Output, k)
        EH += _EH
        loss += _e
        #EH += sof
        
    #updating hidden layer input vector embedding
    W_Embedding[target] -= learningRate * EH.T[0]
    return loss

In [72]:
""" Creates word embeddings in vector space representation """

""" Feedforward Neural Net Language model """
#Input layer

#Projection layer

#Hidden layer

#Output layer

#Initialization
fin='/Users/preethikapachaiyappa/Documents/MachineLearning/Data/English-equalLength2.txt'#/home/jazzycrazzy/PythonScripts/dataset.csv'
fin1='/Users/preethikapachaiyappa/Documents/MachineLearning/Data/French-equalLength2.txt'
fout = 'testfile.txt'
fout1 = 'testfile1.txt'
_vocab, words = dataset(fin, fout)
_vocab_f, words_f = dataset(fin1, fout1)
_unigramTable = UnigramTable(_vocab, words)
_unigramTable1 = UnigramTable(_vocab_f, words_f)

learningRate = 0.1
vocabSize = len(words)
vocabSize_f = len(words_f)
emb_size = 10
win_size = 4
target = None
contextWords = []
epoch = 20

#print _vocab
#print _vocab_f


# No need of hidden layer since when the embedding matrix is multiplied with hot vector 
#it essentially gives that embedding row
W_Embedding = np.random.randn(vocabSize,emb_size) #Embedding matrix
W_Output = np.random.randn(emb_size,vocabSize) #Outputlayer weight matrix Emb_size x Vocab

W_Embedding_f = np.random.randn(vocabSize_f,emb_size) #Embedding matrix
W_Output_f = np.random.randn(emb_size,vocabSize_f) #Outputlayer weight matrix Emb_size x Vocab

oldLoss = 10000
for _ in np.arange(epoch):
    
    totalLoss = 0
    loss = 0
    
    fileIn = open(fin)
    for l in fileIn:
        l = _start+' '+l+' '+_end
        tokens = word_tokenize(l.decode('utf-8'))
        #print 'tokens',tokens
        for token in tokens:
            
            loss = 0
            contextWords = []
            cntxtIdxs = []
            
            if token in _vocab:
                target = _vocab[token]
                trgtIdx = tokens.index(token)
                
                count = 0
                _idx = trgtIdx-1
                #print _idx
                while count < win_size and _idx >= 0:
                    if tokens[_idx] in _vocab:
                        cntxtIdxs = np.insert(cntxtIdxs,0,_idx)
                        count += 1
                    _idx -= 1
                #count = 0
                #_idx = trgtIdx + 1
                #while count < win_size and _idx < len(tokens):
                #    if tokens[_idx] in _vocab:
                #        cntxtIdxs = np.append(cntxtIdxs,_idx)
                #       count += 1
                #    _idx += 1
                    
                for idx in cntxtIdxs:
                    #print idx
                    #check for first word and last word and use UNK for context words for window where words not available
                    if idx >-1 and idx < len(tokens) and tokens[int(idx)] in _vocab:
                        contextWords = np.append(contextWords, _vocab[tokens[int(idx)]])
                    else:
                        contextWords = np.append(contextWords, _vocab['UNK'])
                #print contextWords
                loss += skipgram(target, contextWords, vocabSize, learningRate, W_Embedding, W_Output)
        totalLoss += loss
    print 'Total Loss:',totalLoss
    if totalLoss > oldLoss : 
        break;
    oldLoss = totalLoss



Total Loss: 1885.04176453
Total Loss: 1130.19097158
Total Loss: 1113.26582274
Total Loss: 1103.69266895
Total Loss: 1114.53043293


In [74]:
contextWords = []
    
oldLoss = 10000
for _ in np.arange(epoch):
    
    totalLoss = 0
    loss = 0
    
    fileIn = open(fin1)
    for l in fileIn:
        l = _start+' '+l+' '+_end
        tokens = word_tokenize(l.decode('utf-8'))
        #print 'tokens',tokens
        for token in tokens:
            loss = 0
            contextWords = []
            cntxtIdxs = []
            
            if token in _vocab_f:
                target = _vocab_f[token]
                trgtIdx = tokens.index(token)
                
                count = 0
                _idx = trgtIdx-1
                #print _idx
                while count < win_size and _idx >= 0:
                    if tokens[_idx] in _vocab_f:
                        cntxtIdxs = np.insert(cntxtIdxs,0,_idx)
                        count += 1
                    _idx -= 1
                #count = 0
                #_idx = trgtIdx + 1
                #while count < win_size and _idx < len(tokens):
                #    if tokens[_idx] in _vocab_f:
                #        cntxtIdxs = np.append(cntxtIdxs,_idx)
                #        count += 1
                #    _idx += 1
                    
                for idx in cntxtIdxs:
                    #print idx
                    #check for first word and last word and use UNK for context words for window where words not available
                    if idx >-1 and idx < len(tokens) and tokens[int(idx)] in _vocab_f:
                        contextWords = np.append(contextWords, _vocab_f[tokens[int(idx)]])
                    else:
                        contextWords = np.append(contextWords, _vocab['UNK'])
                #print contextWords
                loss += skipgram(target, contextWords, vocabSize_f, learningRate, W_Embedding_f, W_Output_f)
        totalLoss += loss
    print 'Total Loss:',totalLoss
    if totalLoss > oldLoss : 
        break;
    oldLoss = totalLoss
                

print(W_Embedding_f)

idx = _vocab_f[_start]



Total Loss: 2142.39385688
Total Loss: 1157.78182625
Total Loss: 1114.58410584
Total Loss: 1129.81236869
[[-1.56450208  2.82866243 -0.28745095 ..., -0.64462012 -0.36028558
   0.36278287]
 [ 0.99605632  2.89832133 -0.21634306 ..., -1.82747077  0.41848256
  -1.90786567]
 [-0.23089094  0.73134246 -0.08784773 ..., -1.45206524 -0.71318284
  -0.3407588 ]
 ..., 
 [-0.91927069  1.18149855 -0.18588161 ..., -0.77579122  0.21466177
  -0.0281404 ]
 [-1.41850957  1.80761586 -2.3898169  ..., -0.52572361  0.32095744
  -0.7096624 ]
 [-0.55791996  0.82140119 -1.63690938 ..., -0.30150419 -0.29358689
   1.13489885]]


In [None]:
#mean_list = W_Embedding.mean(0)
#print mean_list

#W_Embedding_new = W_Embedding - mean_list
#print W_Embedding_new

In [75]:
inSentence = []
expSentence = []

fileIn0 = open(fin)
for l in fileIn0 :
    #l = _start+' '+l+' '+_end
    tokens = word_tokenize(l.decode('utf-8'))
    inSent = []
    for token in tokens :
        target = ""
        if token not in _vocab : 
            target = _vocab['UNK']
        else : 
            target = _vocab[token]
        vec = W_Embedding[target]
        vec_list = vec.tolist()
        inSent.append(vec_list)
    inSentence.append(inSent)

fileIn1 = open(fin1)
for l in fileIn1 :
    #l = _start+' '+l+' '+_end
    tokens = word_tokenize(l.decode('utf-8'))
    expSent = []
    for token in tokens :
        target = ""
        if token not in _vocab_f : 
            target = _vocab_f['UNK']
        else : 
            target = _vocab_f[token]
        expSent.append(target)
    expSentence.append(expSent)

#print inSentence
#print expSentence
        
a = RNNlayer(10,vocabSize_f,W_Embedding_f,idx)
a.train_Decoder_With_SGD(inSentence, expSentence, 0.1, 25)


0   4211.83241621
1   4041.40150395
2   4024.99919122
3   3895.17035137
4   3849.28951216
5   3848.74189257
6   3843.09477941
7   3861.3252227
8   3814.03103458
9   3781.57461909
10   3785.93635126
11   3756.32625024
12   3786.64057914
13   3949.53674101
14   3794.04687334
15   3813.37644432
16   3641.3020529
17   3637.83255795
18   3677.91086653
19   3853.56982627
20   3668.86572163
21   3540.73737278
22   3521.07750759
23   3481.96691822
24   3426.68341248


In [79]:
print _vocab

inSentence = []
input = "assistance"
#target = _vocab[input]
#vec = W_Embedding_new[target]
#inSentence.append(vec)
tokens = word_tokenize(input.decode('utf-8'))
inSent = []
for token in tokens :
    target = _vocab[token]
    vec = W_Embedding[target]
    vec_list = vec.tolist()
    inSent.append(vec_list)
inSentence.append(inSent)
print inSentence

o,h = a.forwardProp(inSentence[0],None)
#print o
words1 = o.argmax(axis=1)
#print words1
for i in range(len(words1)) :
    print words_f[words1[i]].word

{u'limited': 1, u'all': 2, u'coach': 3, u'global': 4, u'9/11': 5, u'month': 6, u'appetite': 7, u'adjustment': 8, u'religious': 9, u'whose': 10, u'catastrophe': 11, u'zone': 12, u'passage': 13, u'literary': 14, u'to': 15, u'finally': 16, u'program': 17, u'under': 18, u'Not': 19, u'dominated': 221, u'include': 21, u'belonging': 22, u'risk': 23, u'very': 24, u'Political': 347, u'fan': 26, u'reforms': 27, u'affect': 28, u'screaming': 29, u'drag': 699, u'1930\u2019s': 31, u'Protocol': 32, u'level': 33, u'try': 34, u'race': 35, u'quick': 36, u'Turkey': 37, u'force': 38, u'leaders': 39, u'direct': 40, u'value': 475, u'replicated': 42, u'investment': 43, u'even': 44, u'will': 476, u'deliberate': 46, u'decisions': 47, u'assistance': 48, u'contributed': 49, u'debate': 351, u'access': 51, u'toll': 52, u'resilient': 53, u'new': 54, u';': 55, u'contributes': 56, u'niche': 58, u'proliferation': 59, u'never': 60, u'We': 356, u'here': 62, u'identifications': 64, u'protection': 65, u'English': 203, u'a