In [1]:
""" Imports """
import re
from nltk.tokenize import word_tokenize, sent_tokenize
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

"""Global definitons"""
_start = 'S_START'
_end = 'S_END'

In [2]:
class WordItem:
    """A vocabulary entry: a token together with a mutable occurrence counter."""

    def __init__(self, word, count=0):
        # Both attributes are plain public fields; callers update `count` in place.
        self.word, self.count = word, count

In [3]:
""" Word preprocessing """
def dataset(_fi='/home/jazzycrazzy/PythonScripts/dataset.csv', _fo = 'testfile.txt', min_count = 10, max_word_len = 15):
    """ Builds the vocabulary of a text file that holds one sentence per line.

    Every line is wrapped in the S_START / S_END markers and split with NLTK's
    word_tokenize. Tokens are counted; tokens longer than `max_word_len` and
    words seen fewer than `min_count` times are folded into the 'UNK' entry.

    Parameters:
        _fi          -- path of the input text file.
        _fo          -- unused; kept so existing calls keep working.
        min_count    -- minimum number of occurrences a word needs to get its
                        own vocabulary entry (default 10).
        max_word_len -- tokens longer than this are counted as 'UNK' (default 15).

    Returns:
        (_vocab, words) where `words` is a list of WordItem objects with
        words[0] always being 'UNK', and `_vocab` maps each kept word to its
        index in `words`.

    Raises:
        IOError / OSError if `_fi` cannot be opened.
    """
    counts = collections.Counter()
    unknown = 0  # occurrences that end up under 'UNK'

    # `with` guarantees the file is closed even if tokenization raises.
    with open(_fi) as file_in:
        for line in file_in:
            for token in word_tokenize(_start + ' ' + line + ' ' + _end):
                if len(token) == 0:
                    continue
                if len(token) > max_word_len:
                    unknown += 1
                else:
                    # Counter starts at 0, so the first occurrence counts as 1
                    # (the previous code started every word at 2).
                    counts[token] += 1

    # A literal 'UNK' token in the corpus shares the unknown bucket.
    unknown += counts.pop('UNK', 0)

    words = [WordItem('UNK', 0)]
    _vocab = {'UNK': 0}

    # Sort so that index assignment does not depend on dict/hash ordering.
    for word, freq in sorted(counts.items(), key=lambda kv: (-kv[1], kv[0])):
        if freq >= min_count:
            _vocab[word] = len(words)
            words.append(WordItem(word, freq))
        else:
            # Rare words contribute all of their occurrences to 'UNK'.
            unknown += freq

    words[0].count = unknown
    return _vocab, words

def UnigramTable(_vocab, words):
    """ Maps each vocabulary index to its smoothed unigram probability.

    The count of every word is raised to the power 0.75 and divided by the sum
    of all powered counts. Only the first len(_vocab) entries of `words` are
    used. Returns a dict {index: probability}.
    """
    exponent = 0.75
    vocabSize = len(_vocab)

    poweredCounts = []
    for idx in range(vocabSize):
        poweredCounts.append(words[idx].count ** exponent)
    normaliser = np.sum(poweredCounts)

    table = {}
    for idx in range(vocabSize):
        table[idx] = poweredCounts[idx] / normaliser
    return table

def hotVector(wordIndex,vocabSize):
    """ Returns the one-hot vector representation of a word.

    wordIndex -- 0-based vocabulary index (the indices stored in the
                 vocabulary start at 0, with 0 being 'UNK').
    vocabSize -- length of the returned vector.

    Raises IndexError when wordIndex is outside [0, vocabSize).
    """
    if not 0 <= wordIndex < vocabSize:
        # Without this check a negative index would silently wrap around.
        raise IndexError('wordIndex %r is outside [0, %r)' % (wordIndex, vocabSize))
    hVector = np.zeros(vocabSize)
    # Index directly: the former `wordIndex-1` put word 0 in the last slot.
    hVector[wordIndex] = 1
    return hVector

def softmax(net):
    """ Calculates softmax probabilities over all entries of `net`.

    The maximum entry is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps np.exp from overflowing to inf
    (which would turn the output into nan) for large scores.

    net -- scalar or array-like of scores; normalisation runs over the whole
           array. An empty input yields an empty array.
    """
    net = np.asarray(net)
    if net.size == 0:
        return np.exp(net)
    _exp = np.exp(net - np.max(net))
    return _exp/np.sum(_exp)

def sigmoid(net):
    """ Logistic function 1 / (1 + e^-net), applied element-wise to `net` """
    decay = np.exp(-net)
    return 1.0 / (1 + decay)

def randomIdx(k, vocabSize, current):
    """ Returns up to k distinct vocabulary indices, never including `current`.

    Indices are drawn without replacement according to the probabilities in
    the module-level `_unigramTable` (a dict {index: probability}).

    k         -- number of negative samples wanted.
    vocabSize -- number of vocabulary entries to draw from.
    current   -- index that must not be returned (the positive context word).

    One extra index is drawn so that `current` can be discarded if it shows
    up. When the vocabulary has fewer than k+1 entries, every index other
    than `current` is returned instead of raising.
    """
    global _unigramTable
    # Look probabilities up by index: dict.values() carries no ordering
    # guarantee on older Pythons and is not a sequence on Python 3.
    probabilities = [_unigramTable[i] for i in range(vocabSize)]
    sampleSize = min(k + 1, vocabSize)
    idxs = list(np.random.choice(vocabSize, sampleSize, replace=False, p=probabilities))
    if current in idxs:
        idxs.remove(current)
    elif len(idxs) > k:
        # `current` was not drawn, so drop the spare sample.
        del idxs[-1]
    return idxs
    
def softmaxCostGradient(net, target):
    """ Unfinished stub: evaluates the softmax of `net` and returns None.

    `target` is currently not used.
    """
    probabilities = softmax(net)
    return None
    
    
def negSamplingCostGradient(out, context, emb, vocabSize, learningRate, W_Output, k = 10):
    """ Performs one negative-sampling update for a single context word.

    out          -- output scores of the target word, indexed by vocabulary index.
    context      -- vocabulary index of the positive context word.
    emb          -- embedding vector of the target word.
    vocabSize    -- vocabulary size, forwarded to randomIdx.
    learningRate -- step size of the update applied to W_Output.
    W_Output     -- output weight matrix, one column per vocabulary index;
                    modified in place.
    k            -- number of negative samples requested from randomIdx.

    Returns the error to propagate to the embedding as an array of shape
    (emb.size, 1). Each column's contribution is accumulated before that
    column is updated.
    """
    # Callers may pass indices taken from a float array (np.append on an empty
    # list yields floats); NumPy rejects non-integer indices and slice bounds.
    context = int(context)
    embColumn = np.reshape(emb, (emb.size, 1))
    errorHidden = np.zeros(shape=(emb.size,1))

    negSamples = randomIdx(k, vocabSize, context)

    # Positive pair: the gradient of -log(sigmoid(x)) w.r.t. x is sigmoid(x) - 1.
    delta = sigmoid(out[context]) - 1
    errorHidden += delta * W_Output[:,context:context+1]
    W_Output[:,context:context+1] -= learningRate * (delta * embColumn)

    # Negative pairs: the gradient of -log(sigmoid(-x)) w.r.t. x is sigmoid(x).
    for sample in negSamples:
        sample = int(sample)
        delta = sigmoid(out[sample])
        errorHidden += delta * W_Output[:,sample:sample+1]
        W_Output[:,sample:sample+1] -= learningRate * (delta * embColumn)

    return errorHidden
    
def skipgram(target,contextWords, vocabSize, learningRate, W_Embedding, W_Output, k = 10):
    """ Trains on one window: a target word and its context words.

    target       -- vocabulary index of the target word.
    contextWords -- iterable of vocabulary indices of the context words.
    vocabSize    -- vocabulary size, forwarded to negSamplingCostGradient.
    learningRate -- step size for both weight matrices.
    W_Embedding  -- embedding matrix, one row per word; row `target` is
                    updated in place.
    W_Output     -- output weight matrix, updated in place by
                    negSamplingCostGradient.
    k            -- number of negative samples per context word (default 10).

    Returns None.
    """
    emb = W_Embedding[target]
    out = np.matmul(emb,W_Output) # [1 x EmbSize].[EmbSize x VocabSize]

    # Sum the hidden-layer error over all context words before touching the
    # embedding, so every context word sees the same target embedding.
    EH = np.zeros(shape=(emb.size,1))
    for context in contextWords:
        EH += negSamplingCostGradient(out, context, emb, vocabSize, learningRate, W_Output, k)

    #updating hidden layer input vector embedding
    W_Embedding[target] -= learningRate * EH.T[0]


In [4]:

""" Creates word embeddings in vector space representation """

""" Feedforward Neural Net Language model """
#Input layer

#Projection layer

#Hidden layer

#Output layer

#Initialization
fin='/home/jazzycrazzy/MTData/English/9-11-in-perspective.txt'#/home/jazzycrazzy/PythonScripts/dataset.csv'
fout = 'testfile.txt'
_vocab, words = dataset(fin, fout)
_unigramTable = UnigramTable(_vocab, words)

learningRate = 0.2
vocabSize = len(words)
emb_size = 10
win_size = 2
target = None
contextWords = []

#print _vocab


# No need of hidden layer since when the embedding matrix is multiplied with hot vector 
#it essentially gives that embedding row
W_Embedding = np.random.randn(vocabSize,emb_size) #Embedding matrix
W_Output = np.random.randn(emb_size,vocabSize) #Outputlayer weight matrix Emb_size x Vocab

# `with` closes the corpus file even if training raises part-way through.
with open(fin) as fileIn:
    for l in fileIn:
        l = _start+' '+l+' '+_end
        tokens = word_tokenize(l)
        # Single-argument form prints the same text on Python 2 and Python 3.
        print('tokens %s' % (tokens,))
        # enumerate gives the real position of every token; tokens.index()
        # would return the first occurrence for words that repeat in a line.
        for trgtIdx, token in enumerate(tokens):
            if token not in _vocab:
                continue
            target = _vocab[token]
            cntxtIdxs = [idx for idx in range(trgtIdx-win_size, trgtIdx+win_size+1) if idx != trgtIdx]
            # Start a fresh context list per target so windows of earlier
            # tokens are not trained again; a plain list keeps indices ints.
            contextWords = []
            for idx in cntxtIdxs:
                if idx >-1 and idx < len(tokens) and tokens[idx] in _vocab:
                    contextWords.append(_vocab[tokens[idx]])
                else:
                    # Positions outside the sentence and out-of-vocabulary
                    # tokens are both represented by 'UNK'.
                    contextWords.append(_vocab['UNK'])
            skipgram(target, contextWords, vocabSize, learningRate, W_Embedding, W_Output)


tokens ['S_START', '9/11', 'in', 'Perspective', 'S_END']
tokens ['S_START', 'NEW', 'YORK', '\xe2\x80\x93', 'It', 'was', 'a', 'decade', 'ago', 'that', '19', 'terrorists', 'took', 'control', 'of', 'four', 'planes', ',', 'flew', 'two', 'into', 'the', 'twin', 'towers', 'of', 'the', 'World', 'Trade', 'Center', ',', 'hit', 'the', 'Pentagon', 'with', 'a', 'third', ',', 'and', 'crashed', 'the', 'fourth', 'in', 'a', 'field', 'in', 'Pennsylvania', 'after', 'passengers', 'resisted', 'and', 'made', 'it', 'impossible', 'for', 'the', 'terrorists', 'to', 'complete', 'their', 'malevolent', 'mission', '.', 'S_END']




tokens ['S_START', 'In', 'a', 'matter', 'of', 'hours', ',', 'more', 'than', '3,000', 'innocent', 'people', ',', 'mostly', 'Americans', ',', 'but', 'also', 'people', 'from', '115', 'other', 'countries', ',', 'had', 'their', 'lives', 'suddenly', 'and', 'violently', 'taken', 'from', 'them', '.', 'S_END']
tokens ['S_START', 'September', '11', ',', '2001', ',', 'was', 'a', 'terrible', 'tragedy', 'by', 'any', 'measure', ',', 'but', 'it', 'was', 'not', 'a', 'historical', 'turning', 'point', '.', 'S_END']
tokens ['S_START', 'It', 'did', 'not', 'herald', 'a', 'new', 'era', 'of', 'international', 'relations', 'in', 'which', 'terrorists', 'with', 'a', 'global', 'agenda', 'prevailed', ',', 'or', 'in', 'which', 'such', 'spectacular', 'terrorist', 'attacks', 'became', 'commonplace', '.', 'S_END']
tokens ['S_START', 'On', 'the', 'contrary', ',', '9/11', 'has', 'not', 'been', 'replicated', '.', 'S_END']
tokens ['S_START', 'Despite', 'the', 'attention', 'devoted', 'to', 'the', '\xe2\x80\x9cGlobal', 'Wa



tokens ['S_START', 'But', 'what', 'may', 'be', 'most', 'important', ',', 'particularly', 'in', 'the', 'Arab', 'and', 'Islamic', 'communities', ',', 'is', 'to', 'end', 'any', 'acceptance', 'of', 'terrorism', '.', 'S_END']




tokens ['S_START', 'The', 'Nigerian', 'father', 'who', 'warned', 'the', 'US', 'embassy', 'in', 'Lagos', 'that', 'he', 'feared', 'what', 'his', 'own', 'son', 'might', 'do', '\xe2\x80\x93', 'before', 'that', 'same', 'young', 'man', 'attempted', 'to', 'detonate', 'a', 'bomb', 'aboard', 'a', 'flight', 'to', 'Detroit', 'on', 'Christmas', 'Day', '2009', '\xe2\x80\x93', 'is', 'an', 'example', 'of', 'just', 'this', '.', 'S_END']
tokens ['S_START', 'Only', 'when', 'more', 'parents', ',', 'teachers', ',', 'and', 'community', 'leaders', 'behave', 'likewise', 'will', 'recruitment', 'of', 'terrorists', 'dry', 'up', 'and', 'law-enforcement', 'authorities', 'receive', 'full', 'cooperation', 'from', 'the', 'populations', 'they', 'police', '.', 'S_END']
tokens ['S_START', 'Terrorism', 'must', 'lose', 'its', 'legitimacy', 'among', 'those', 'who', 'have', 'historically', 'supported', 'or', 'tolerated', 'it', 'before', 'it', 'will', 'lose', 'its', 'potency', '.', 'S_END']


In [None]:
"""print _unigramTable
print words[0].word,words[0].count
print _vocab.values()[:10]
print _vocab.keys()[:10]
print words[_vocab.get('UNK')].count

print _vocab
#print W_Embedding
fig = plt.figure()
plt.scatter(W_Embedding[:,0:1], W_Embedding[:,1:2], W_Embedding[:,2:3])
plt.show()"""