In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
from tqdm import tqdm_notebook

## Initialization

In [2]:
def initialize_wrd_emb(vocab_size, emb_size):
    """
    Create the word-embedding matrix.

    vocab_size -- number of rows (one per word id)
    emb_size -- number of columns (embedding dimensions)

    Returns an array of shape (vocab_size, emb_size) filled with standard
    normal samples scaled by 0.1.
    """
    embedding_shape = (vocab_size, emb_size)
    return 0.1 * np.random.randn(*embedding_shape)

def initialize_dense(input_size, output_size):
    """
    Create the weight matrix and bias vector of the dense layer.

    input_size -- number of input features (columns of W)
    output_size -- number of output units (rows of W and of b)

    Returns (W, b) with shapes (output_size, input_size) and (output_size, 1),
    both drawn from a standard normal scaled by 0.1. W is sampled before b.
    """
    weight_shape = (output_size, input_size)
    bias_shape = (output_size, 1)
    W = 0.1 * np.random.randn(*weight_shape)
    b = 0.1 * np.random.randn(*bias_shape)
    return W, b

def initialize_parameters(vocab_size, emb_size):
    """
    Build the full parameter dictionary of the model.

    vocab_size -- number of words in the vocabulary
    emb_size -- embedding dimensionality

    Returns a dict with keys 'WRD_EMB' (vocab_size, emb_size),
    'W' (vocab_size, emb_size) and 'b' (vocab_size, 1). The dense layer maps
    an embedding back to one score per vocabulary word.
    """
    embeddings = initialize_wrd_emb(vocab_size, emb_size)
    weights, bias = initialize_dense(emb_size, vocab_size)

    return {
        'WRD_EMB': embeddings,
        'W': weights,
        'b': bias,
    }

## Forward Propagation

In [3]:
def ind_to_word_vecs(inds, parameters):
    """
    Look up the embedding vector of every context word.

    inds -- integer word ids, shape: (CBOW_N, number of examples)
    parameters -- dict holding the embedding matrix under 'WRD_EMB',
                  shape: (vocab_size, emb_size)

    Returns word_vecs, shape: (emb_size, CBOW_N, number of examples), where
    word_vecs[:, c, j] is the embedding row of word inds[c, j].
    """
    WRD_EMB = parameters['WRD_EMB']
    # np.take yields (CBOW_N, m, emb_size). The embedding axis has to be MOVED
    # to the front: a plain reshape keeps the flat memory order and would mix
    # components of different words into the same vector.
    gathered = np.take(WRD_EMB, inds, axis=0)
    word_vecs = np.transpose(gathered, (2, 0, 1))

    assert(word_vecs.shape == (WRD_EMB.shape[1], inds.shape[0], inds.shape[1]))

    return word_vecs

def mean_(word_vecs):
    """
    Average the context word vectors of each example.

    word_vecs -- shape: (emb_size, CBOW_N, number of examples)

    Returns the mean over axis 1, shape: (emb_size, number of examples).
    """
    n_rows = word_vecs.shape[0]
    averaged = np.mean(word_vecs, axis=1).reshape(n_rows, -1)

    expected_shape = (n_rows, word_vecs.shape[2])
    assert(averaged.shape == expected_shape)

    return averaged

def linear_dense(word_vecs_mean, parameters):
    """
    Apply the dense layer: Z = W . word_vecs_mean + b.

    word_vecs_mean -- shape: (emb_size, number of examples)
    parameters -- dict providing 'W' and 'b'

    Returns (W, b, Z); W and b are handed back unchanged so the caller can
    cache them, and Z has shape (W.shape[0], number of examples).
    """
    W = parameters['W']
    b = parameters['b']
    Z = W.dot(word_vecs_mean) + b

    n_out, n_examples = W.shape[0], word_vecs_mean.shape[1]
    assert(Z.shape == (n_out, n_examples))

    return W, b, Z

def softmax(Z):
    """
    Column-wise softmax.

    Z -- scores, shape: (vocab_size, number of examples); each column holds
         the scores of one example.

    Returns an array of the same shape in which every column is a probability
    distribution over the rows (it sums to 1).
    """
    # Normalise over axis 0 (the vocabulary), not over the examples.
    # Subtracting the per-column maximum leaves the result unchanged but keeps
    # np.exp from overflowing. It also guarantees a denominator >= 1, so no
    # epsilon is needed.
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_scores = np.exp(shifted)
    softmax_out = exp_scores / np.sum(exp_scores, axis=0, keepdims=True)

    assert(softmax_out.shape == Z.shape)

    return softmax_out

def forward_propagation(inds, parameters):
    """
    Run the full forward pass: lookup -> mean -> dense -> softmax.

    inds -- context word ids, shape: (CBOW_N, number of examples)
    parameters -- dict with 'WRD_EMB', 'W' and 'b'

    Returns (softmax_out, caches). caches stores the inputs and the
    intermediate values ('inds', 'word_vecs', 'word_vecs_mean', 'W', 'b',
    'Z') for use by the backward pass.
    """
    vecs = ind_to_word_vecs(inds, parameters)
    averaged = mean_(vecs)
    W, b, Z = linear_dense(averaged, parameters)
    probabilities = softmax(Z)

    caches = {
        'inds': inds,
        'word_vecs': vecs,
        'word_vecs_mean': averaged,
        'W': W,
        'b': b,
        'Z': Z,
    }

    return probabilities, caches

## Cost Function

In [4]:
def cross_entropy(softmax_out, Y):
    """
    Mean cross-entropy between predictions and one-hot targets.

    softmax_out -- predicted probabilities, shape: (vocab_size, number of examples)
    Y -- targets, same shape as softmax_out

    Returns the cost averaged over the examples (axis 1 of softmax_out).
    0.001 is added inside the logarithm so that a zero probability does not
    produce -inf.
    """
    n_examples = softmax_out.shape[1]
    log_probs = np.log(softmax_out + 0.001)
    # Same two-stage reduction as before: first over examples, then over rows.
    total = (Y * log_probs).sum(axis=1).sum(axis=0)
    return -(1 / n_examples) * total

## Backward Propagation

In [5]:
def softmax_backward(Y, caches):
    """
    Gradient of the cross-entropy loss with respect to the scores Z.

    Y -- one-hot targets, shape: (vocab_size, number of examples)
    caches -- dict from the forward pass; only 'Z' is read here

    Returns dL_dZ = softmax(Z) - Y, same shape as Z.
    """
    Z = caches['Z']
    # The derivative of softmax + cross-entropy is (probabilities - targets),
    # not (raw scores - targets). The probabilities are recomputed from the
    # cached Z, column-wise over the vocabulary axis with the usual max shift
    # for numerical stability.
    shifted = Z - np.max(Z, axis=0, keepdims=True)
    exp_scores = np.exp(shifted)
    probabilities = exp_scores / np.sum(exp_scores, axis=0, keepdims=True)
    dL_dZ = probabilities - Y

    assert(dL_dZ.shape == Z.shape)

    return dL_dZ

def dense_backward(dL_dZ, caches):
    """
    Back-propagate through the dense layer Z = W . word_vecs_mean + b.

    dL_dZ -- gradient w.r.t. Z, shape: (vocab_size, number of examples)
    caches -- dict from the forward pass; reads 'W', 'b' and 'word_vecs_mean'

    Returns (dL_dW, dL_db, dL_dword_vecs_mean):
    dL_dW -- batch-averaged gradient, same shape as W
    dL_db -- batch-averaged gradient, same shape as b
    dL_dword_vecs_mean -- per-example gradient, same shape as word_vecs_mean
                          (not divided by the batch size)
    """
    W = caches['W']
    b = caches['b']
    word_vecs_mean = caches['word_vecs_mean']
    m = word_vecs_mean.shape[1]

    dL_dW = (1 / m) * np.dot(dL_dZ, word_vecs_mean.T)
    # np.mean already divides by m; multiplying by 1/m again would shrink the
    # bias gradient by an extra factor of m relative to dL_dW.
    dL_db = np.mean(dL_dZ, axis=1, keepdims=True)
    dL_dword_vecs_mean = np.dot(W.T, dL_dZ)

    assert(W.shape == dL_dW.shape)
    assert(b.shape == dL_db.shape)
    assert(word_vecs_mean.shape == dL_dword_vecs_mean.shape)

    return dL_dW, dL_db, dL_dword_vecs_mean

def mean_backward(dL_dword_vecs_mean, caches):
    """
    Back-propagate through the context-averaging step.

    dL_dword_vecs_mean -- gradient w.r.t. the averaged vectors,
                          shape: (emb_size, number of examples)
    caches -- dict from the forward pass; only the shape of 'word_vecs' is used

    Returns dL_dword_vecs, shape: (emb_size, CBOW_N): the gradient summed over
    the examples, divided by the batch size and by CBOW_N, and repeated for
    every context position.
    """
    word_vecs = caches['word_vecs']
    CBOW_N = word_vecs.shape[1]
    # Batch size comes from the incoming gradient itself; relying on a
    # notebook-level `m` breaks on a fresh kernel and would use the size of
    # the whole data set instead of the current batch.
    m = dL_dword_vecs_mean.shape[1]

    # NOTE(review): the result has no example axis, so every context slot
    # receives the same batch-averaged gradient. update_parameters expects
    # exactly this (emb_size, CBOW_N) layout.
    dL_dword_vecs = (1 / m) * (1 / CBOW_N) * np.ones((dL_dword_vecs_mean.shape[0], CBOW_N)) *\
        np.sum(dL_dword_vecs_mean, axis=1, keepdims=True)

    assert((word_vecs.shape[0], word_vecs.shape[1]) == dL_dword_vecs.shape[:2])

    return dL_dword_vecs

def backward_propagation(Y, caches):
    """
    Run the full backward pass: softmax -> dense -> mean.

    Y -- one-hot targets, shape: (vocab_size, number of examples)
    caches -- dict produced by forward_propagation

    Returns a dict of gradients with keys 'dL_dZ', 'dL_dW', 'dL_db' and
    'dL_dword_vecs'.
    """
    grad_Z = softmax_backward(Y, caches)
    grad_W, grad_b, grad_mean = dense_backward(grad_Z, caches)
    grad_word_vecs = mean_backward(grad_mean, caches)

    return {
        'dL_dZ': grad_Z,
        'dL_dW': grad_W,
        'dL_db': grad_b,
        'dL_dword_vecs': grad_word_vecs,
    }

def update_parameters(parameters, caches, gradients, learning_rate):
    """
    Apply one gradient-descent step, modifying `parameters` in place.

    parameters -- dict with 'WRD_EMB' (vocab_size, emb_size), 'W' and 'b'
    caches -- dict from the forward pass; only 'inds' (CBOW_N, number of
              examples) is read
    gradients -- dict with 'dL_dword_vecs' (emb_size, CBOW_N), 'dL_dW' and 'dL_db'
    learning_rate -- step size

    Only the embedding rows of words that occur in `inds` are touched; the
    word at context position c is moved along column c of 'dL_dword_vecs'.
    Returns None.
    """
    inds = caches['inds']
    CBOW_N = inds.shape[0]
    WRD_EMB = parameters['WRD_EMB']

    # (emb_size, CBOW_N) -> (1, CBOW_N, emb_size) so it broadcasts over examples.
    step = learning_rate * gradients['dL_dword_vecs'].T.reshape(1, CBOW_N, -1)
    # Rows are gathered with inds.T, giving (m, CBOW_N, emb_size) ...
    updated_rows = WRD_EMB[inds.T, :] - step
    # ... and must be written back with the SAME index array. Writing with
    # inds.flatten() enumerates the words in a different order and would
    # store updated vectors under the wrong word ids.
    # A word id occurring several times in the batch receives a single step
    # (the last write wins), as before.
    WRD_EMB[inds.T, :] = updated_rows

    parameters['W'] -= learning_rate * gradients['dL_dW']
    parameters['b'] -= learning_rate * gradients['dL_db']
    

In [10]:
def cbow_model(X, Y, vocab_size, emb_size, learning_rate, epochs, batch_size=256, parameters=None,
               print_cost=False, max_batches_per_epoch=1000):
    """
    Train the CBOW model with mini-batch gradient descent.

    X -- context word ids, shape: (CBOW_N, number of examples)
    Y -- one-hot targets, shape: (vocab_size, number of examples)
    vocab_size, emb_size -- used only to create fresh parameters
    learning_rate -- step size passed to update_parameters
    epochs -- number of passes of the outer loop
    batch_size -- number of examples (columns) per mini-batch
    parameters -- optional existing parameter dict to continue training from;
                  it is updated in place. A new one is created when None.
    print_cost -- when True, print the cost of the last processed batch every
                  25 epochs
    max_batches_per_epoch -- upper bound on the number of (shuffled) batches
                  visited per epoch; None visits all of them. The default of
                  1000 keeps the previous behaviour.

    Returns the trained parameter dict.
    """
    m = X.shape[1]
    if parameters is None:
        parameters = initialize_parameters(vocab_size, emb_size)

    batch_starts = list(range(0, m, batch_size))
    for epoch in tqdm_notebook(range(epochs)):
        # A different random subset/order of batches in every epoch.
        np.random.shuffle(batch_starts)

        # Stay None when there is no batch at all (e.g. empty X), so the cost
        # report below can be skipped instead of raising NameError.
        softmax_out = Y_batch = None
        for start in tqdm_notebook(batch_starts[:max_batches_per_epoch]):
            X_batch = X[:, start:start + batch_size]
            Y_batch = Y[:, start:start + batch_size]

            softmax_out, caches = forward_propagation(X_batch, parameters)
            gradients = backward_propagation(Y_batch, caches)
            update_parameters(parameters, caches, gradients, learning_rate)

        # The cost is only needed for the report, so it is computed on demand.
        # It reflects the last batch of the epoch, evaluated before that
        # batch's parameter update.
        if print_cost and epoch % 25 == 0 and softmax_out is not None:
            cost = cross_entropy(softmax_out, Y_batch)
            print("Cost after epoch {}: {}".format(epoch, np.squeeze(cost)))

    return parameters

### Toy data
Sentence: I(0) would(1) like(2) to(3) get(4) a(5) better(6) job(7).  
vocab_size = 8  
```
[0, 2] [1]  
[1, 3] [2]  
[2, 4] [3]  
[3, 5] [4]  
[4, 6] [5]  
[5, 7] [6]
```

In [7]:
# input_len = 2
# vocab_size = 8
# m = 6
# emb_size = 15
# X = np.array([[0, 1, 2, 3, 4, 5],
#               [2, 3, 4, 5, 6, 7]]) # 2 x 6
# Y = np.array([1, 2, 3, 4, 5, 6]) # 1 x 6
# Y_one_hot = np.zeros((vocab_size, m))  # 8 x 6
# Y_one_hot[Y.flatten(), np.arange(6)] = 1

### Initialization Test

In [8]:
# parameters = initialize_parameters(vocab_size, emb_size)

### Forward Propagation Test

In [9]:
# softmax_out, caches = forward_propagation(X, parameters)

### Compute Cost Test

In [10]:
# cost = cross_entropy(softmax_out, Y_one_hot)

### Backward Propagation Test

In [11]:
# gradients = backward_propagation(Y_one_hot, caches)

### Model Test

In [12]:
# parameters = cbow_model(X, Y_one_hot, vocab_size, emb_size, 0.005, 1000000, print_cost=True)

In [13]:
# parameters = cbow_model(X, Y_one_hot, vocab_size, emb_size, 0.003, 1000000, parameters=parameters, print_cost=True)

In [14]:
# parameters = cbow_model(X, Y_one_hot, vocab_size, emb_size, 0.004, 1000000, parameters=parameters, print_cost=True)

## Stack Overflow data

In [11]:
import pickle

# SECURITY: pickle.load can execute arbitrary code stored in the file, so
# only load pickles that come from a trusted source.
with open('pickles/X.pkl', 'rb') as file:
    X = pickle.load(file)

with open('pickles/Y.pkl', 'rb') as file:
    Y = pickle.load(file)

# Inputs and targets must agree on their last axis (one entry per example).
assert X.shape[-1] == Y.shape[-1], (
    "X and Y disagree on the number of examples: {} vs {}".format(X.shape[-1], Y.shape[-1])
)

In [12]:
# Model and data dimensions.
vocab_size, emb_size = 31853, 50
batch_size = 256
m = X.shape[-1]  # number of examples (last axis of X)

# Dense one-hot targets: column j has a single 1 in the row given by Y's j-th entry.
example_columns = np.arange(m)
Y_one_hot = np.zeros((vocab_size, m))
Y_one_hot[Y.flatten(), example_columns] = 1

In [None]:
# Train from freshly initialised parameters and report the cost while running.
parameters = cbow_model(
    X,
    Y_one_hot,
    vocab_size,
    emb_size,
    learning_rate=0.005,
    epochs=2000,
    batch_size=128,
    print_cost=True,
)

HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

In [None]:
WRD_EMB = parameters['WRD_EMB']

# The lookups in the cells below need both vocabulary mappings. They are
# loaded here so the notebook also works after Restart & Run All.
# SECURITY: pickle.load can execute arbitrary code stored in the file, so
# only load pickles that come from a trusted source.
with open('pickles/word_to_id.pkl', 'rb') as file:
    word_to_id = pickle.load(file)

with open('pickles/id_to_word.pkl', 'rb') as file:
    id_to_word = pickle.load(file)

In [None]:
def find_top_n_similar(word, wrd_emb, n=10):
    """
    Rank the vocabulary by cosine similarity to `word`.

    word -- key looked up in the notebook-level `word_to_id` mapping
    wrd_emb -- embedding matrix with one row per word id
    n -- number of ids to return

    Returns the row indices of the n most similar embeddings, most similar
    first. The query word's own row is not excluded.
    """
    query_vec = wrd_emb[word_to_id[word], :]
    row_norms = np.linalg.norm(wrd_emb, axis=1)
    query_norm = np.linalg.norm(query_vec)

    cos_sim = np.dot(wrd_emb, query_vec.T) / (row_norms * query_norm)

    # argsort is ascending: keep the last n entries, then reverse them.
    ascending = np.argsort(cos_sim)
    return ascending[-n:][::-1]

In [None]:
# Ids of the 20 embeddings closest to 'sort', most similar first.
inds = find_top_n_similar('sort', WRD_EMB, n=20)

In [None]:
# Map every id in `inds` to its entry in `id_to_word` and display the list.
[id_to_word[id_] for id_ in inds]

In [None]:
# Spot check: display the entry stored under id 156 in `id_to_word`.
id_to_word[156]

In [None]:
# Spot check: display the id stored for 'bfs' in `word_to_id`.
word_to_id['bfs']