In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np

## Initialization

In [2]:
def initialize_wrd_emb(vocab_size, emb_size):
    WRD_EMB = np.random.uniform(size=(vocab_size, emb_size))
    return WRD_EMB

def initialize_dense(input_size, output_size):
    W = np.random.uniform(size=(output_size, input_size))
    b = np.random.uniform(size=(output_size, 1))
    return W, b

def initialize_parameters(vocab_size, emb_size):
    WRD_EMB = initialize_wrd_emb(vocab_size, emb_size)
    W, b = initialize_dense(emb_size, vocab_size)
    
    parameters = {}
    parameters['WRD_EMB'] = WRD_EMB
    parameters['W'] = W
    parameters['b'] = b
    
    return parameters

## Forward Propagation

In [3]:
def ind_to_word_vecs(inds, parameters):
    """
    inds -- shape: (CBOW_N, number of examples)
    """
    WRD_EMB = parameters['WRD_EMB']
    word_vecs = np.take(WRD_EMB, inds, axis=0)
    word_vecs = word_vecs.reshape(WRD_EMB.shape[1], inds.shape[0], -1)
    
    assert(word_vecs.shape == (WRD_EMB.shape[1], inds.shape[0], inds.shape[1]))
    
    return word_vecs

def sum_(word_vecs):
    word_vecs_sum = np.sum(word_vecs, axis=1)
    word_vecs_sum = word_vecs_sum.reshape(word_vecs.shape[0], -1)
    
    assert(word_vecs_sum.shape == (word_vecs.shape[0], word_vecs.shape[2]))
    
    return word_vecs_sum

def linear_dense(word_vecs_sum, parameters):
    W, b = parameters['W'], parameters['b']
    Z = np.dot(W, word_vecs_sum) + b
    
    assert(Z.shape == (W.shape[0], word_vecs_sum.shape[1]))
    
    return W, b, Z

def softmax(Z):
    softmax_out = np.divide(np.exp(Z), np.sum(np.exp(Z), axis=0))
    
    assert(softmax_out.shape == Z.shape)
    
    return softmax_out

def forward_propagation(inds, parameters):
    word_vecs = ind_to_word_vecs(inds, parameters)
    word_vecs_sum = sum_(word_vecs)
    W, b, Z = linear_dense(word_vecs_sum, parameters)
    softmax_out = softmax(Z)
    
    caches = {}
    caches['inds'] = inds
    caches['word_vecs'] = word_vecs
    caches['word_vecs_sum'] = word_vecs_sum
    caches['W'] = W
    caches['b'] = b
    caches['Z'] = Z
    
    return softmax_out, caches

## Cost Function

In [4]:
def cross_entropy(softmax_out, Y):
    m = softmax_out.shape[1]
    cost = -(1 / m) * np.sum(np.sum(Y * np.log(softmax_out), axis=1), axis=0)
    return cost

## Backward Propagation

In [5]:
def softmax_backward(Y, caches):
    Z = caches['Z']
    dL_dZ = Z - Y
    
    assert(dL_dZ.shape == Z.shape)
    
    return dL_dZ

def dense_backward(dL_dZ, caches):
    W = caches['W']
    b = caches['b']
    word_vecs_sum = caches['word_vecs_sum']
    m = word_vecs_sum.shape[1]
    
    dL_dW = (1 / m) * np.dot(dL_dZ, word_vecs_sum.T)
    dL_db = (1 / m) * np.sum(dL_dZ, axis=1, keepdims=True)
    dL_dword_vecs_sum = np.dot(W.T, dL_dZ)

    assert(W.shape == dL_dW.shape)
    assert(b.shape == dL_db.shape)
    assert(word_vecs_sum.shape == dL_dword_vecs_sum.shape)
    
    return dL_dW, dL_db, dL_dword_vecs_sum

def sum_backward(dL_dword_vecs_sum, caches):
    word_vecs = caches['word_vecs']
    CBOW_N = word_vecs.shape[1]
    
    dL_dword_vecs = (1 / m) * np.ones((dL_dword_vecs_sum.shape[0], CBOW_N)) *\
        np.sum(dL_dword_vecs_sum, axis=1, keepdims=True)

    assert((word_vecs.shape[0], word_vecs.shape[1]) == dL_dword_vecs.shape[:2])
    
    return dL_dword_vecs

def backward_propagation(Y, caches):
    dL_dz = softmax_backward(Y, caches)
    dL_dW, dL_db, dL_dword_vecs_sum = dense_backward(dL_dz, caches)
    dL_dword_vecs = sum_backward(dL_dword_vecs_sum, caches)
    
    gradients = dict()
    gradients['dL_dW'] = dL_dW
    gradients['dL_db'] = dL_db
    gradients['dL_dword_vecs'] = dL_dword_vecs
    
    return gradients

def update_parameters(parameters, caches, gradients, learning_rate):
    CBOW_N = caches['inds'].shape[0]
    vocab_size, emb_size = parameters['WRD_EMB'].shape
    
    inds = caches['inds']
    updated_WRD_EMD = parameters['WRD_EMB'][inds.T, :] -\
        learning_rate * gradients['dL_dword_vecs'].T.reshape(1, CBOW_N, -1)
    parameters['WRD_EMB'][inds.flatten(), :] = updated_WRD_EMD.reshape(-1, emb_size)
    parameters['W'] -= learning_rate * gradients['dL_dW']
    parameters['b'] -= learning_rate * gradients['dL_db']
    

In [14]:
def cbow_model(X, Y, vocab_size, emb_size, learning_rate, epochs, parameters=None, print_cost=False):
    costs = []
    if parameters is None:
        parameters = initialize_parameters(vocab_size, emb_size)
    
    for i in range(epochs):
        softmax_out, caches = forward_propagation(X, parameters)
        cost = cross_entropy(softmax_out, Y)
        gradients = backward_propagation(Y, caches)
        update_parameters(parameters, caches, gradients, learning_rate)
        
        costs.append(cost)
        if print_cost and i % 10000 == 0:
            print("Cost after iterations {}: {}".format(i, np.squeeze(cost)))
        
    return parameters

### Toy data
Sentence: I(0) would(1) like(2) to(3) get(4) a(5) better(6) job(7).  
vocab_size = 8  
```
[0, 2] [1]  
[1, 3] [2]  
[2, 4] [3]  
[3, 5] [4]  
[4, 6] [5]  
[5, 7] [6]
```

In [15]:
input_len = 2
vocab_size = 8
m = 6
emb_size = 15
X = np.array([[0, 1, 2, 3, 4, 5],
              [2, 3, 4, 5, 6, 7]]) # 2 x 6
Y = np.array([1, 2, 3, 4, 5, 6]) # 1 x 6
Y_one_hot = np.zeros((vocab_size, m))  # 8 x 6
Y_one_hot[Y.flatten(), np.arange(6)] = 1

### Initialization Test

In [16]:
parameters = initialize_parameters(vocab_size, emb_size)

### Forward Probagation Test

In [17]:
softmax_out, caches = forward_propagation(X, parameters)

### Compute Cost Test

In [18]:
cost = cross_entropy(softmax_out, Y_one_hot)

### Backward Probagation Test

In [19]:
gradients = backward_propagation(Y_one_hot, caches)

### Model Test

In [24]:
parameters = cbow_model(X, Y_one_hot, vocab_size, emb_size, 0.005, 1000000, print_cost=True)

Cost after iterations 0: 2.4290657988060467
Cost after iterations 10000: 1.3812163351456015
Cost after iterations 20000: 1.35663956908022
Cost after iterations 30000: 1.344882725596848
Cost after iterations 40000: 1.3381822336402778
Cost after iterations 50000: 1.3337763353864123
Cost after iterations 60000: 1.3305434442142525
Cost after iterations 70000: 1.3279765663906762
Cost after iterations 80000: 1.3258173503540844
Cost after iterations 90000: 1.3239222294569237
Cost after iterations 100000: 1.322207007992102
Cost after iterations 110000: 1.320620596264673
Cost after iterations 120000: 1.3191312102651627
Cost after iterations 130000: 1.3177186338835392
Cost after iterations 140000: 1.3163697090703583
Cost after iterations 150000: 1.3150756435269835
Cost after iterations 160000: 1.3138303772572137
Cost after iterations 170000: 1.3126295796847738
Cost after iterations 180000: 1.3114700280534035
Cost after iterations 190000: 1.310349219012241
Cost after iterations 200000: 1.30926512

In [26]:
parameters = cbow_model(X, Y_one_hot, vocab_size, emb_size, 0.003, 1000000, parameters=parameters, print_cost=True)

Cost after iterations 0: 1.2771419272935438
Cost after iterations 10000: 1.2770941164666594
Cost after iterations 20000: 1.2770404331081637
Cost after iterations 30000: 1.276989081257757
Cost after iterations 40000: 1.2769388309767298
Cost after iterations 50000: 1.2768895546318006
Cost after iterations 60000: 1.2768412134762057
Cost after iterations 70000: 1.276793771659594
Cost after iterations 80000: 1.2767471946989106
Cost after iterations 90000: 1.276701451590308
Cost after iterations 100000: 1.2766565152104081
Cost after iterations 110000: 1.276612361885264
Cost after iterations 120000: 1.2765689707628818
Cost after iterations 130000: 1.2765263232265562
Cost after iterations 140000: 1.2764844024105098
Cost after iterations 150000: 1.2764431928192592
Cost after iterations 160000: 1.2764026800346553
Cost after iterations 170000: 1.276362850493022
Cost after iterations 180000: 1.2763236913163576
Cost after iterations 190000: 1.2762851901850338
Cost after iterations 200000: 1.2762473

In [27]:
parameters = cbow_model(X, Y_one_hot, vocab_size, emb_size, 0.004, 1000000, parameters=parameters, print_cost=True)

Cost after iterations 0: 1.2745963507750484
Cost after iterations 10000: 1.2745836509003459
Cost after iterations 20000: 1.2745707229444228
Cost after iterations 30000: 1.2745581899393552
Cost after iterations 40000: 1.2745459725070305
Cost after iterations 50000: 1.2745340528815519
Cost after iterations 60000: 1.274522416475609
Cost after iterations 70000: 1.274511050850203
Cost after iterations 80000: 1.274499945617501
Cost after iterations 90000: 1.2744890919925695
Cost after iterations 100000: 1.2744784823582793
Cost after iterations 110000: 1.2744681099349349
Cost after iterations 120000: 1.274457968546614
Cost after iterations 130000: 1.2744480524600938
Cost after iterations 140000: 1.2744383562746615
Cost after iterations 150000: 1.2744288748470085
Cost after iterations 160000: 1.2744196032399444
Cost after iterations 170000: 1.274410536687307
Cost after iterations 180000: 1.2744016705696382
Cost after iterations 190000: 1.2743930003973312
Cost after iterations 200000: 1.2743845

In [29]:
wrd_emb = parameters['WRD_EMB']

for i in range(8):
    for j in range(i + 1, 8):
        vec_i, vec_j = wrd_emb[i, :], wrd_emb[j, :]
        cos_sim = np.dot(vec_i, vec_j.T) / (np.linalg.norm(vec_i) * np.linalg.norm(vec_j))
        print('{}, {}: {}'.format(i, j, cos_sim))

0, 1: 0.4274763661178869
0, 2: 0.2705866913023904
0, 3: 0.4274763661178869
0, 4: -0.09015680723840407
0, 5: 0.2705866913023904
0, 6: 0.4274763661178869
0, 7: 0.4320663746882391
1, 2: 0.2795420242392431
1, 3: 1.0000000000000002
1, 4: 0.35118295197677085
1, 5: 0.27954202423924307
1, 6: 1.0
1, 7: 0.43450295360836166
2, 3: 0.27954202423924296
2, 4: -0.01701895476734841
2, 5: 1.0000000000000002
2, 6: 0.27954202423924307
2, 7: 0.5477257188544188
3, 4: 0.3511829519767708
3, 5: 0.279542024239243
3, 6: 0.9999999999999999
3, 7: 0.4345029536083616
4, 5: -0.017018954767348413
4, 6: 0.3511829519767708
4, 7: 0.1656207032022741
5, 6: 0.27954202423924307
5, 7: 0.5477257188544188
6, 7: 0.4345029536083616


In [30]:
wrd_emb

array([[ 0.28388168, -0.09018733, -0.1182425 , -0.6984883 , -0.34360891,
         0.47949579,  0.02677859, -0.29275589,  0.13519994,  0.13533125,
        -0.52338551, -0.32271853,  0.04203143, -0.07006129,  0.42923333],
       [ 0.51724136, -0.35558248, -0.17801584, -0.62767221, -0.092183  ,
         0.09996036,  0.18650278,  0.42500462, -0.20902071, -0.4160677 ,
        -0.44719754,  0.32435583,  0.15416826,  0.06118605,  0.1125002 ],
       [ 0.36451148,  0.16817632, -0.6901139 , -0.61564657, -0.07333198,
        -0.14828185,  0.59707969,  0.45132495,  0.49851355,  0.15252522,
         0.27534255, -0.19337081, -0.50989304,  0.39624888,  0.36182102],
       [ 0.51724136, -0.35558248, -0.17801584, -0.62767221, -0.092183  ,
         0.09996036,  0.18650278,  0.42500462, -0.20902071, -0.4160677 ,
        -0.44719754,  0.32435583,  0.15416826,  0.06118605,  0.1125002 ],
       [ 0.00631667, -0.72975983, -0.5577656 , -0.08515884, -0.23903792,
         0.1744345 , -0.02969118, -0.21422726, 