In an IPython Notebook, I can write mathematical expressions in LaTeX, which helps me understand my code better.

## q_3 word2vec.py

In [4]:
import numpy as np
import random

from q1_softmax import softmax
from q2_gradcheck import gradcheck_naive
from q2_sigmoid import sigmoid, sigmoid_grad

In [5]:
def normalizeRows(x):
    """
    Row normalization function

    Divide every row of a 2-D matrix by its Euclidean (L2) norm so that
    each row has unit length.

    Arguments:
    x -- numpy ndarray holding one vector per row

    Return:
    x -- a new array of the same shape with every row rescaled; the
         caller's array is not modified. An all-zero row has norm 0 and
         therefore produces NaN entries.
    """

    ### YOUR CODE HERE
    # keepdims=True keeps the norms as a column vector so that they
    # broadcast across the columns of each row.
    row_norms = np.sqrt((x ** 2).sum(axis=1, keepdims=True))
    x = x / row_norms
    ### END YOUR CODE

    return x

In [6]:
def test_normalize_rows():
    """Check normalizeRows against hand-computed unit-length rows."""
    print("Testing normalizeRows...")
    expected = np.array([[0.6, 0.8], [0.4472136, 0.89442719]])
    x = normalizeRows(np.array([[3.0, 4.0], [1, 2]]))
    print(x)
    assert np.allclose(x, expected, rtol=1e-05, atol=1e-06)
    print("test passed")


# Run the check as soon as the cell is executed.
test_normalize_rows()

Testing normalizeRows...
[[0.6        0.8       ]
 [0.4472136  0.89442719]]
test passed


## For the input arguments of the softmaxCostAndGradient function
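- `predicted` = $v_c$ (the predicted word vector); `outputVectors` = $U$, with one output vector $u_w$ per row
- $\hat{y}$ = softmax(np.dot(predicted, outputVectors.T))
- $\hat{y} - y$ is obtained by subtracting 1 from the `target` entry of $\hat{y}$
- cost = $-\log(\hat{y}_{target})$
- gradPred = $\frac{\partial CE(y, \hat{y})}{\partial v_c}$ = $U^T (\hat{y} - y)$ = np.dot($\hat{y} - y$, outputVectors)
- grad = $\frac{\partial CE(y, \hat{y})}{\partial U}$ = $(\hat{y} - y) v_c^T$, whose row $w$ is $\frac{\partial CE(y, \hat{y})}{\partial u_w}$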

In [11]:
def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Compute the cost and gradients for one predicted word vector
    and one target word as a building block for word2vec models,
    using the softmax prediction function and cross entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens;
            row w is scored against the predicted vector to get
            the softmax input for token w
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all the other word
           vectors
    """

    ### YOUR CODE HERE
    # The notation follows the written part of the assignment: y is the
    # one-hot vector with a 1 at the target index, and y_hat is the
    # softmax distribution over all tokens.
    scores = np.dot(predicted, outputVectors.T)
    y_hat = softmax(scores)

    # With a one-hot y, the cross entropy keeps only the target's term.
    cost = -np.log(y_hat[target])

    # Turn y_hat into (y_hat - y) in place: only the target entry of y
    # is non-zero, so only that entry changes.
    delta = y_hat
    delta[target] -= 1.

    # Matches the derivation: weight each output vector by its entry of
    # (y_hat - y) and add them up.
    gradPred = np.dot(delta, outputVectors)

    # Outer product: row w of grad is delta[w] * predicted. This is
    # equivalent to delta[:, np.newaxis] * predicted[np.newaxis, :].
    grad = np.outer(delta, predicted)
    ### END YOUR CODE

    return cost, gradPred, grad

`np.outer(a, b)` combines a of shape (M,) and b of shape (N,) into an (M, N) array, where out[i][j] = a[i] * b[j].



In [8]:
def getNegativeSamples(target, dataset, K):
    """ Samples K indexes which are not the target

    Draws from dataset.sampleTokenIdx() and throws away every draw that
    equals target until K indices have been kept. The same index may
    appear more than once in the result.

    Arguments:
    target -- index that must never be returned
    dataset -- object providing a sampleTokenIdx() method
    K -- number of indices to collect

    Return:
    list of the K accepted indices, in the order they were drawn
    """
    samples = []
    while len(samples) < K:
        candidate = dataset.sampleTokenIdx()
        if candidate != target:
            samples.append(candidate)
    return samples

## This part implements part (c) of the assignment problem

$J_{loss}$ = $-\log(\sigma(u_O^T v_C)) - \sum_{k=1}^K \log(\sigma(-u_k^T v_C))$

$\frac{\partial J_{loss}}{\partial v_C}$ = $(\sigma(u_O^T v_C)-1)u_O - \sum_{k=1}^K (\sigma(-u_k^T v_C)-1)u_k$

$\frac{\partial J_{loss}}{\partial u_O}$ = $[\sigma(u_O^T v_C) - 1]v_C$

$\frac{\partial J_{loss}}{\partial u_k}$ = $-[\sigma(-u_k^T v_C) - 1]v_C$

In [10]:
def negSamplingCostAndGradient(predicted, target, outputVectors, dataset, K=10):
    """ Negative sampling cost function for word2vec models

    Compute the cost and gradients for one predicted word vector
    and one target word as a building block for word2vec models,
    using the negative sampling technique. K is the sample size.

    With v = predicted, u_o = outputVectors[target] and u_k the output
    vector of the k-th negative sample:

        cost = -log(sigmoid(u_o . v)) - sum_k log(sigmoid(-u_k . v))

    Note: See test_word2vec below for dataset's initialization.

    Arguments:
    predicted -- numpy ndarray, predicted word vector
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- passed to getNegativeSamples to draw the K negative
            sample indices
    K -- number of negative samples

    Return:
    cost -- negative sampling cost for the word prediction
    gradPred -- the gradient with respect to the predicted word
           vector
    grad -- the gradient with respect to all output vectors, same
           shape as outputVectors; rows that were not sampled stay zero
    """

    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    ### YOUR CODE HERE
    negatives = indices[1:]

    scores = np.dot(outputVectors, predicted)
    pos_sig = sigmoid(scores[target])
    neg_sig = sigmoid(-scores[negatives])

    cost = -np.log(pos_sig) - np.log(neg_sig).sum()

    # d cost / d(u_o . v) = sigmoid(u_o . v) - 1
    pos_coef = pos_sig - 1.0
    # d cost / d(u_k . v) = 1 - sigmoid(-u_k . v). This is positive: the
    # negative-sample term enters every gradient with the opposite sign
    # of the target term.
    neg_coef = 1.0 - neg_sig

    gradPred = pos_coef * outputVectors[target] \
            + np.dot(neg_coef, outputVectors[negatives])

    grad = np.zeros_like(outputVectors)
    grad[target] = pos_coef * predicted

    # Accumulate with += because the same index can be sampled more
    # than once, and each occurrence contributes its own term.
    for k, coef in zip(negatives, neg_coef):
        grad[k] += coef * predicted
    ### END YOUR CODE

    return cost, gradPred, grad