In [6]:
import numpy as np
import random

from utils.gradcheck import gradcheck_numeric
from utils.utils import normalizeRows, softmax

import torch

from word2vec import *

%load_ext autoreload
%autoreload 2

Now it's time for the more complicated `negative sampling` loss. This is the loss function that we're going to train on our corpus. The code is going to be more complicated too, because we need to use vectorization.

In [7]:
def set_seed(s):
    """Seed Python's `random` module and NumPy's global RNG with the same value `s`."""
    # Same order as before: the stdlib generator first, then NumPy's.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(s)

## Negative sampling

In [8]:
# Setup for training: create an empty placeholder object (an instance of an
# ad-hoc class named 'dummy'); the sampling helpers defined in this cell are
# attached to it as attributes further down.
dataset = type('dummy', (), {})()

def dummySampleTokenIdx():
    """Return a random integer token index between 0 and 4, both inclusive."""
    lowest_idx, highest_idx = 0, 4
    return random.randint(lowest_idx, highest_idx)

def getRandomContext(C):
    """Draw a random center token and `2 * C` random context tokens.

    Tokens come from the fixed vocabulary "a".."e". The center token is
    drawn first, then the context tokens one by one.

    Returns:
        (center, context): a token string and a list of `2 * C` token strings.
    """
    vocab = ["a", "b", "c", "d", "e"]
    center = vocab[random.randint(0, 4)]
    context = []
    for _ in range(2 * C):
        context.append(vocab[random.randint(0, 4)])
    return center, context

# Attach the stand-in sampling helpers to the placeholder dataset object.
setattr(dataset, "sampleTokenIdx", dummySampleTokenIdx)
setattr(dataset, "getRandomContext", getRandomContext)

# Fix both RNG streams before drawing the toy embeddings.
random.seed(31415)
np.random.seed(9265)

# 10 random 3-d rows passed through normalizeRows; later cells use rows 0-4
# as center vectors and rows 5-9 as outside vectors.
dummy_vectors = normalizeRows(np.random.randn(10, 3))

# Token -> row index lookup for the five-word toy vocabulary.
dummy_tokens = {"a": 0, "b": 1, "c": 2, "d": 3, "e": 4}

In [9]:
dummy_vectors

array([[-0.96735714, -0.02182641,  0.25247529],
       [ 0.73663029, -0.48088687, -0.47552459],
       [-0.27323645,  0.12538062,  0.95374082],
       [-0.56713774, -0.27178229, -0.77748902],
       [-0.59609459,  0.7795666 ,  0.19221644],
       [-0.6831809 , -0.04200519,  0.72904007],
       [ 0.18289107,  0.76098587, -0.62245591],
       [-0.61517874,  0.5147624 , -0.59713884],
       [-0.33867074, -0.80966534, -0.47931635],
       [-0.52629529, -0.78190408,  0.33412466]])

In [10]:
def skipgram_v2(word2vecLossAndGradient=negSamplingLossAndGradient):
    """Run `skipgram` on the fixed toy example with freshly reset seeds.

    Both RNGs are re-seeded with 42 on every call. The example uses center
    word "c", outside words "a" and "b", window size 1, rows 0-4 of
    `dummy_vectors` as center vectors and rows 5-9 as outside vectors.

    Args:
        word2vecLossAndGradient: loss/gradient callable forwarded to
            `skipgram`; defaults to `negSamplingLossAndGradient`.

    Returns:
        Whatever `skipgram` returns for this example.
    """
    set_seed(42)
    split_row = 5  # first half: center vectors, second half: outside vectors
    toy_example = dict(
        currentCenterWord="c",
        windowSize=1,
        outsideWords=["a", "b"],
        word2Ind=dummy_tokens,
        centerWordVectors=dummy_vectors[:split_row, :],
        outsideVectors=dummy_vectors[split_row:, :],
        dataset=dataset,
        word2vecLossAndGradient=word2vecLossAndGradient,
    )
    return skipgram(**toy_example)

In [11]:
loss, gradCenterVecs, gradOutsideVectors = skipgram_v2()

In [12]:
loss

15.35112555003494

In [13]:
gradCenterVecs

array([[ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [-3.75386588, -3.18521139,  0.21341777],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ]])

In [14]:
gradOutsideVectors

array([[-0.30559455,  0.14022886,  1.06668785],
       [-0.32724523,  0.15016375,  1.14226025],
       [-0.22764219,  0.10445868,  0.79459256],
       [-0.42136814,  0.19335415,  1.47079939],
       [-1.12868414,  0.51792184,  3.93970919]])

### getNegativeSamples

In [15]:
set_seed(42)
negSampleWordIndices = getNegativeSamples(outsideWordIdx=0, 
                                          dataset=dataset, 
                                          K=10)

In [16]:
negSampleWordIndices

[2, 1, 1, 1, 4, 4, 3, 1, 1, 4]

How exactly do we generate those indices? We have 3 arguments: `outsideWordIdx, dataset, K`. We iterate over `range(K)` and call `dataset.sampleTokenIdx()`, which here simply generates a random number in `[0, 1, 2, 3, 4]`.

In actual training we use the class `StanfordSentiment`, which also has a method `sampleTokenIdx`. Its analysis is clearly outside the scope of this assignment.

In [17]:
dataset.sampleTokenIdx()

4

### gradCenterVec

Let's start with the theory but keep it simple here. To find the gradient of the first term we use the chain rule: the `grad` of `log`, multiplied by the `grad` of `sigmoid`, and finally by the `grad` of the vector product. It's easy to see the result: $-\sigma (1-\sigma) / \sigma \cdot \partial(u^T_o v_c) / \partial v_c = (\sigma - 1) u_o$, where $\sigma = \sigma(u^T_o v_c)$. We may use the same trick for the second term as well.

$$J_{neg-sample}(v_c, o, U) = -\log(\sigma(u^T_o v_c)) - \sum_{k}\log(\sigma(-u^T_k v_c))$$

$$ \frac{\partial J_{neg-sample}}{\partial v_{c}} = (\sigma(u^T_o v_c)-1)u_o - \sum_{k}(\sigma(-u^T_k v_c)-1)u_k $$

In [18]:
def negSamplingLossAndGradient_v2(centerWordVec, outsideWordIdx, 
                               outsideVectors, dataset, K=10):
    """Negative-sampling loss and its gradient w.r.t. the center vector.

    Step-by-step version for this notebook: only the loss and the gradient
    with respect to `centerWordVec` are computed; the gradient with respect
    to the outside vectors is returned as the placeholder 0.

    Args:
        centerWordVec: the center word vector v_c.
        outsideWordIdx: row index of the true outside word o in
            `outsideVectors`.
        outsideVectors: matrix whose rows are the outside vectors u.
        dataset: object passed through to `getNegativeSamples`.
        K: number of negative samples requested.

    Returns:
        (loss, gradCenterVec, 0)
    """
    negSampleWordIndices = getNegativeSamples(outsideWordIdx, dataset, K)

    uo = outsideVectors[outsideWordIdx]
    vc = centerWordVec
    # Rows of Uneg are the sampled vectors u_k; repeated indices give
    # repeated rows, so repeated samples are counted once per occurrence.
    Uneg = outsideVectors[negSampleWordIndices, :]

    # Each sigmoid is needed by both the loss and the gradient: compute once.
    sigPos = sigmoid(uo.dot(vc))        # sigma(u_o^T v_c)
    sigNeg = sigmoid(-Uneg.dot(vc))     # vector of sigma(-u_k^T v_c)

    loss = -np.log(sigPos) - np.sum(np.log(sigNeg))

    # Same as Uneg.T.dot(sigmoid(-Uneg.dot(vc)) - 1): a linear combination
    # of the rows of Uneg with coefficients (sigma(-u_k^T v_c) - 1).
    gradCenterVecs = (sigPos - 1) * uo - Uneg.T.dot(sigNeg - 1)

    return loss, gradCenterVecs, 0

In [19]:
loss, gradCenterVecs, gradOutsideVectors = \
    skipgram_v2(word2vecLossAndGradient=negSamplingLossAndGradient_v2)

In [20]:
loss

15.35112555003494

In [21]:
gradCenterVecs

array([[ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ],
       [-3.75386588, -3.18521139,  0.21341777],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ]])

#### vectorization of sum

Let's look in details what `Uneg.dot(vc)` is doing.

In [22]:
set_seed(42)
negSampleWordIndices = getNegativeSamples(outsideWordIdx=0, 
                                          dataset=dataset, 
                                          K=10)

In [23]:
negSampleWordIndices, len(negSampleWordIndices)

([2, 1, 1, 1, 4, 4, 3, 1, 1, 4], 10)

In [24]:
outsideVectors = dummy_vectors[5:, :]

In [25]:
# vectors u are rows of this matrix
outsideVectors.shape

(5, 3)

In [26]:
# rows of Uneg - vectors u with indices in negSampleWordIndices
Uneg = outsideVectors[negSampleWordIndices, :]

In [27]:
Uneg.shape

(10, 3)

In [28]:
# this should be equal to outsideVectors[2]
Uneg[0]

array([-0.61517874,  0.5147624 , -0.59713884])

In [29]:
# and indeed it's equal
outsideVectors[2]

array([-0.61517874,  0.5147624 , -0.59713884])

In [30]:
centerWordVectors=dummy_vectors[:5, :]
centerWordVec = centerWordVectors[2]

In [31]:
centerWordVec

array([-0.27323645,  0.12538062,  0.95374082])

In [32]:
vc = centerWordVec

So let's compare the dot products (which are used in our formulas) with the matrix product that we use in the code. As we can see, the result is the same. Why is that? Well, that's just another form of matrix multiplication:

$$Uv = \begin{bmatrix} u^T_1\\u^T_2 \\ ... \end{bmatrix} v = \begin{bmatrix} u^T_1 v\\u^T_2 v \\ ... \end{bmatrix}$$

In [33]:
[Uneg[i, :].dot(vc) for i in range(Uneg.shape[0])]

[-0.33688520543359074,
 -0.5482212337396852,
 -0.5482212337396852,
 -0.5482212337396852,
 0.3644357565240586,
 0.3644357565240586,
 -0.46612272591787834,
 -0.5482212337396852,
 -0.5482212337396852,
 0.3644357565240586]

In [34]:
Uneg.dot(vc)

array([-0.33688521, -0.54822123, -0.54822123, -0.54822123,  0.36443576,
        0.36443576, -0.46612273, -0.54822123, -0.54822123,  0.36443576])

#### gradCenterVecs

$$ \frac{\partial J_{neg-sample}}{\partial v_{c}} = (\sigma(u^T_o v_c)-1)u_o - \sum_{k}(\sigma(-u^T_k v_c)-1)u_k $$

Here we have even more involved vectorization: `Uneg.T.dot(sigmoid(-Uneg.dot(vc)) - 1)`. Why is it true? Well we already saw that `Uneg.dot(vc)` is vector with components $u^T_k v_c$. 

We now need a linear combination of the *rows* of $U$ with coefficients given by some vector $s$. To get that, we multiply the transposed $U$ by this vector: $U^T s = \sum_k s_k u_k$. So again we just use a form of matrix multiplication.

### gradOutsideVecs

Finally, let's try to analyze `gradOutsideVecs`. They can't be vectorized completely due to repetitions in `negSampleWordIndices`: the contributions of a repeated index must be accumulated, not overwritten.

$$ \frac{\partial J_{neg-sample}}{\partial u_{o}} = (\sigma(u^T_o v_c)-1) v_c $$

$$ \frac{\partial J_{neg-sample}}{\partial u_k} = -(\sigma(-u^T_k v_c)-1) v_c \ \ \ k=1, 2,...,K $$

The code below should be clear by now. To compute `gradOutsideVecs[o, :]` we don't need any vectorization at all. `Uneg.dot(vc)` is again the vector of $u^T_k v_c$. And now we have a little trick: if the same word index `k` was sampled at positions `i` and `j` (so rows `i` and `j` of `Uneg` are equal), we have to add both gradients to `gradOutsideVecs[k, :]`. To do this we need one more trick: we iterate over `zip(range(K), negSampleWordIndices)` (not over `negSampleWordIndices` alone), so that each sample position `i` is paired with its word index `k`.

In [53]:
def negSamplingLossAndGradient_v3(centerWordVec, outsideWordIdx, 
                               outsideVectors, dataset, K=10):
    """Negative-sampling loss and its gradient w.r.t. the outside vectors.

    Step-by-step version for this notebook: the gradient with respect to
    the center vector is returned as the placeholder 0. The loss is computed
    inside the function, so the result never depends on a `loss` variable
    left over in the notebook namespace.

    Args:
        centerWordVec: the center word vector v_c.
        outsideWordIdx: row index of the true outside word o in
            `outsideVectors`.
        outsideVectors: matrix whose rows are the outside vectors u.
        dataset: object passed through to `getNegativeSamples`.
        K: number of negative samples requested.

    Returns:
        (loss, 0, gradOutsideVecs), where gradOutsideVecs has the same shape
        as `outsideVectors`.
    """
    negSampleWordIndices = getNegativeSamples(outsideWordIdx, dataset, K)

    o = outsideWordIdx
    uo = outsideVectors[outsideWordIdx]
    vc = centerWordVec
    Uneg = outsideVectors[negSampleWordIndices, :]

    # Illustrative output: show which rows were sampled on this call.
    print(negSampleWordIndices)
    print(Uneg)

    sigPos = sigmoid(uo.dot(vc))        # sigma(u_o^T v_c)
    sigNeg = sigmoid(-Uneg.dot(vc))     # vector of sigma(-u_k^T v_c)

    loss = -np.log(sigPos) - np.sum(np.log(sigNeg))

    gradOutsideVecs = np.zeros_like(outsideVectors)
    gradOutsideVecs[o, :] = (sigPos - 1) * vc
    s = -(sigNeg - 1)
    # Pair each sample position i with its word index k; a word sampled
    # several times accumulates one contribution per occurrence.
    for i, k in zip(range(K), negSampleWordIndices):
        gradOutsideVecs[k, :] += s[i] * vc

    return loss, 0, gradOutsideVecs

In [58]:
_, _, gradOutsideVectors = \
    skipgram_v2(word2vecLossAndGradient=negSamplingLossAndGradient_v3)

[2, 1, 1, 1, 4, 4, 3, 1, 1, 4]
[[-0.61517874  0.5147624  -0.59713884]
 [ 0.18289107  0.76098587 -0.62245591]
 [ 0.18289107  0.76098587 -0.62245591]
 [ 0.18289107  0.76098587 -0.62245591]
 [-0.52629529 -0.78190408  0.33412466]
 [-0.52629529 -0.78190408  0.33412466]
 [-0.33867074 -0.80966534 -0.47931635]
 [ 0.18289107  0.76098587 -0.62245591]
 [ 0.18289107  0.76098587 -0.62245591]
 [-0.52629529 -0.78190408  0.33412466]]
[4, 0, 4, 4, 3, 3, 4, 2, 0, 3]
[[-0.52629529 -0.78190408  0.33412466]
 [-0.6831809  -0.04200519  0.72904007]
 [-0.52629529 -0.78190408  0.33412466]
 [-0.52629529 -0.78190408  0.33412466]
 [-0.33867074 -0.80966534 -0.47931635]
 [-0.33867074 -0.80966534 -0.47931635]
 [-0.52629529 -0.78190408  0.33412466]
 [-0.61517874  0.5147624  -0.59713884]
 [-0.6831809  -0.04200519  0.72904007]
 [-0.33867074 -0.80966534 -0.47931635]]


In [52]:
gradOutsideVectors

array([[-0.30559455,  0.14022886,  1.06668785],
       [-0.32724523,  0.15016375,  1.14226025],
       [-0.22764219,  0.10445868,  0.79459256],
       [-0.42136814,  0.19335415,  1.47079939],
       [-1.12868414,  0.51792184,  3.93970919]])

This is the end of assignment 2.