In [1]:
""" 
Here, as I am following the paper titled "word2vec Parameter Learning Explained" by Xin Ron,
it will be referred to as [1] from here onward in subsequent comments for the code.
"""
import theano
import theano.tensor as T
import numpy as np

Using gpu device 0: GeForce GT 645M (CNMeM is disabled, cuDNN 5105)


In [7]:
# dimension of the embedding vector
embeddingSize = 64
vocabSize = 10000

In [3]:
import dataset

In [4]:
words = dataset.produceWords()

In [5]:
words[:10]

['harry',
 'potter',
 'and',
 'the',
 'sorcerers',
 'stone',
 'by',
 'j',
 'k',
 'rowling']

## At this point in the code, we have a list 'words' of all words present in our text.
## Now,  we need to create a dataset from 'words' to use in training the model

In [6]:
import collections
len(collections.Counter(words))

23091

In [8]:
import collections
# CREATE DATASET
print "creating dataset..." 
def createDataset(words):
    """
    RETURN :  
    
    data- 
        A list of tags for each word, denoting its rank by count in the corpus of text availbale.
        By rank I mean a word's place when they have been sorted according to their count of occurences in the text.
        So for example, if the word 'I' has rank 20 and the word 'am' has rank 60, then a portion of the text 'I am'
        will be returned as [ 20, 60 ]
    """
    # I will use 0 for rank of words not in the top <vocabSize> words by their counts, and call them 'rare words'
    
    # here I use the symbol 'UNK' for the rare words
    counts = [['UNK', -1]]
    counts.extend(collections.Counter(words).most_common(vocabSize - 1))
    ranks = dict()
    for word, count in counts:
        ranks[word] = len(ranks)
    data = list()
    # for keeping count of rare words
    rareCount = 0
    for word in words:
        if word in ranks:
            index = ranks[word]
        else:
            index = 0  # ranks['UNK']
            rareCount += 1
        data.append(index)
    counts[0][1] = rareCount
    ranksToWords = dict(zip(ranks.values(), ranks.keys()))
    return data, counts, ranks, ranksToWords

creating dataset...


In [9]:
data, counts, ranks, ranksToWords = createDataset(words)
del words  # Hint to reduce memory.
print 'Most common words (+UNK)', counts[:5]

Most common words (+UNK) [['UNK', 21052], ('the', 51920), ('and', 27607), ('to', 26856), ('of', 21844)]


In [10]:
counts[-50:]

[('selfish', 4),
 ('comparing', 4),
 ('24', 4),
 ('25', 4),
 ('26', 4),
 ('23', 4),
 ('28', 4),
 ('29', 4),
 ('mystry', 4),
 ('harp', 4),
 ('conducted', 4),
 ('doctors', 4),
 ('theft', 4),
 ('sill', 4),
 ('forthcoming', 4),
 ('fans', 4),
 ('distraught', 4),
 ('kidnapped', 4),
 ('craved', 4),
 ('vindictive', 4),
 ('discipline', 4),
 ('dismally', 4),
 ('midsentence', 4),
 ('harassed', 4),
 ('coil', 4),
 ('accuse', 4),
 ('numbness', 4),
 ('enviously', 4),
 ('f', 4),
 ('lamb', 4),
 ('orblike', 4),
 ('bins', 4),
 ('acceleration', 4),
 ('mix', 4),
 ('whooshed', 4),
 ('relevant', 4),
 ('ushering', 4),
 ('bulk', 4),
 ('bull', 4),
 ('volunteer', 4),
 ('fleshy', 4),
 ('obliterated', 4),
 ('vegetables', 4),
 ('filius', 4),
 ('phial', 4),
 ('jack', 4),
 ('roomy', 4),
 ('represented', 4),
 ('swings', 4),
 ('noting', 4)]

In [11]:
print('Sample data', [ranksToWords[i] for i in data[:10]])
print('Sample data', [i for i in data[:10]])

('Sample data', ['harry', 'potter', 'and', 'the', 'sorcerers', 'stone', 'by', 'j', 'k', 'rowling'])
('Sample data', [7, 134, 2, 1, 2788, 347, 72, 5682, 6750, 6873])


In [12]:
import math
# class for a table which will be used to draw out negative samples.
class tableForNegativeSamples:
    def __init__(self, counts):
        # from Mikolov et al.'s original word2vec implementation, where they use 
        # a unigram distribution raised to power 3/4 to construct negative samples
        power = 0.75
        norm = sum([math.pow(t[1], power) for t in counts]) # Normalizing constants
        
        # tableSize should be big enough so that the minimum probability, i.e. 
        # (unigram)^(3/4) for a word multiplied by tableSize comes out to be atleast 1.
        tableSize = 1e8
        table = np.zeros(tableSize, dtype=np.uint16)

        p = 0 # Cumulative probability
        i = 0
        for word, count in counts:
            p += float(math.pow(count, power))/norm
            # fill the word in the table in the between the 
            # markings drawn out by cumulative probabilities
            while i < tableSize and float(i) / tableSize < p:
                table[i] = ranks[word]
                i += 1
        self.table = table

    def sample(self, k):
        indices = np.random.randint(low=0, high=len(self.table), size=k)
        return [self.table[i] for i in indices]

In [13]:
table = tableForNegativeSamples(counts)



In [14]:
table.sample(5)

[1067, 485, 5687, 954, 8710]

In [15]:
# come back and modify this function to include negative samples in a batch
import random

def generateBatch(positiveSampleSize, skipWindow, kNegativeSamples):
    """
    PARAMETERS : 
    positiveSampleSize - length of the window which will be sliding on the
                    continuous stream of words to generate a batch
    skipWindow - the number of context words to be considered on either 
                either side of the taget word
    
    RETURNS :
    batch - list of length = positiveSampleSize*(1 + kNegativeSamples)
            consisting of tuples (target, context) and including negative
            samples
    labels - list of 0s, 1s. 1 for positive sample, 0 for negative sample.
    """    
    global dataIndex
    assert positiveSampleSize % (2*skipWindow) == 0
    batch = []
    labels = []
    span = 2 * skipWindow + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[dataIndex])
        dataIndex = (dataIndex + 1) % len(data)
    for i in range(positiveSampleSize/(2*skipWindow)):
        context = skipWindow  # target label at the center of the buffer
        contextsToAvoid = [skipWindow]
        for j in range(2*skipWindow):
            while context in contextsToAvoid:
                context = random.randint(0, span-1)
            contextsToAvoid.append(context)
            positiveSample = (buffer[skipWindow], buffer[context])
            batch.append(positiveSample)
            labels.append(1)
            # attach negative samples
            negativeSamples = table.sample(kNegativeSamples)
            for i in range(kNegativeSamples):
                negativeSample = (buffer[skipWindow], negativeSamples[i])
                batch.append(negativeSample)
                labels.append(0)
        buffer.append(data[dataIndex])
        dataIndex = (dataIndex + 1) % len(data)
    return batch, labels

In [17]:
positiveSampleSize = 8
kNegativeSamples = 5
skipWindow = 1
batch, labels = generateBatch(positiveSampleSize=positiveSampleSize, skipWindow=skipWindow, 
                              kNegativeSamples = kNegativeSamples)

for i in range(len(batch)):
    print(batch[i][0], ranksToWords[batch[i][0]],
        '->', batch[i][1], ranksToWords[batch[i][1]])

(134, 'potter', '->', 2, 'and')
(134, 'potter', '->', 3, 'to')
(134, 'potter', '->', 204, 'few')
(134, 'potter', '->', 306, 'am')
(134, 'potter', '->', 207, 'mcgonagall')
(134, 'potter', '->', 6591, 'moodily')
(134, 'potter', '->', 7, 'harry')
(134, 'potter', '->', 1040, 'marble')
(134, 'potter', '->', 436, 'hadnt')
(134, 'potter', '->', 1651, 'warm')
(134, 'potter', '->', 37, 'them')
(134, 'potter', '->', 3042, 'pleasantly')
(2, 'and', '->', 1, 'the')
(2, 'and', '->', 69, 'would')
(2, 'and', '->', 41, 'been')
(2, 'and', '->', 17, 'at')
(2, 'and', '->', 3333, 'inner')
(2, 'and', '->', 2550, 'gargoyle')
(2, 'and', '->', 134, 'potter')
(2, 'and', '->', 101, 'going')
(2, 'and', '->', 48, 'who')
(2, 'and', '->', 912, 'reason')
(2, 'and', '->', 1557, 'list')
(2, 'and', '->', 199, 'let')
(1, 'the', '->', 2, 'and')
(1, 'the', '->', 323, 'quickly')
(1, 'the', '->', 65, 'do')
(1, 'the', '->', 34, 'all')
(1, 'the', '->', 202, 'three')
(1, 'the', '->', 1539, 'fresh')
(1, 'the', '->', 2788, 'sorce

## Hopefully, the core of the implementation is correct (the code below). Now what is needed is to pass data to this function in batches and design the training loop.

In [18]:
## SOME CONSTANTS 

# Will be modified as a global variable by the generateBatch function  
dataIndex = 0
# Experimental Theano Code (psuedo code)
batchSize = positiveSampleSize*(1 + kNegativeSamples)

###############################
### THE CORE IMPLEMENTATION ###
###############################

# the W matrix of the inputVectors as used in [1]
targetEmbeddings = theano.shared(np.random.uniform(-1, 1, (vocabSize, embeddingSize)))
# the W' matrix of the outputVectors as used in [1]
contextEmbeddings = theano.shared(np.random.normal(scale = 1.0/np.sqrt(vocabSize), 
                                                   size = (embeddingSize, vocabSize)))

# A |batchSize x 2| dimensional matrix, having (traget, context) pairs for
# a batch (including) -ve samples. This is the input to the training function .
targetContext = T.imatrix()

# the |batchSize x 1| vector, trainig labels (also an input to the training
# function), whether the context word matches the target word or not
isContext = T.bvector()

batchMatchScores = []

for i in range(batchSize):
    matchScore = T.dot(targetEmbeddings[targetContext[i][0],:], contextEmbeddings[:,targetContext[i][1]])
    batchMatchScores.append(matchScore)

objective = isContext*T.log(T.nnet.sigmoid(batchMatchScores)) + \
                (1 - isContext)*T.log(1-T.nnet.sigmoid(batchMatchScores))

loss = -T.mean(objective)

In [19]:
# TRAINING FUNCTION
from lasagne.updates import nesterov_momentum
updates = nesterov_momentum(loss, [targetEmbeddings, contextEmbeddings], learning_rate = 0.1, momentum = 0.9)
trainBatch = theano.function([targetContext, isContext], loss, updates = updates)

In [20]:
numberOfBatches = len(data)/(positiveSampleSize/(2*skipWindow))

In [21]:
numberOfBatches

271451

In [23]:
dataIndex = 0
for i in range(numberOfBatches):
    batch, labels = generateBatch(positiveSampleSize, skipWindow, kNegativeSamples)
    batch = np.asarray(batch, dtype = np.uint16)
    labels = np.asarray(labels, dtype = np.int8)
    trainBatch(batch, labels)
    print 'Batch {0} complete.'.format(i)

Batch 0 complete.
Batch 1 complete.
Batch 2 complete.
Batch 3 complete.
Batch 4 complete.
Batch 5 complete.
Batch 6 complete.
Batch 7 complete.
Batch 8 complete.
Batch 9 complete.
Batch 10 complete.
Batch 11 complete.
Batch 12 complete.
Batch 13 complete.
Batch 14 complete.
Batch 15 complete.
Batch 16 complete.
Batch 17 complete.
Batch 18 complete.
Batch 19 complete.
Batch 20 complete.
Batch 21 complete.
Batch 22 complete.
Batch 23 complete.
Batch 24 complete.
Batch 25 complete.
Batch 26 complete.
Batch 27 complete.
Batch 28 complete.
Batch 29 complete.
Batch 30 complete.
Batch 31 complete.
Batch 32 complete.
Batch 33 complete.
Batch 34 complete.
Batch 35 complete.
Batch 36 complete.
Batch 37 complete.
Batch 38 complete.
Batch 39 complete.
Batch 40 complete.
Batch 41 complete.
Batch 42 complete.
Batch 43 complete.
Batch 44 complete.
Batch 45 complete.
Batch 46 complete.
Batch 47 complete.
Batch 48 complete.
Batch 49 complete.
Batch 50 complete.
Batch 51 complete.
Batch 52 complete.
Bat

KeyboardInterrupt: 