In [13]:
import numpy as np

In [5]:
def tokenize(text):
    # obtains tokens with a least 1 alphabet
    pattern = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')
    return pattern.findall(text.lower())
def mapping(tokens):
    word_to_id = dict()
    id_to_word = dict()

    for i, token in enumerate(set(tokens)):
        word_to_id[token] = i
        id_to_word[i] = token

    return word_to_id, id_to_word
def generate_training_data(tokens, word_to_id, window_size):
    N = len(tokens)
    X, Y = [], []

    for i in range(N):
        nbr_inds = list(range(max(0, i - window_size), i)) + \
                   list(range(i + 1, min(N, i + window_size + 1)))
        for j in nbr_inds:
            X.append(word_to_id[tokens[i]])
            Y.append(word_to_id[tokens[j]])
            
    X = np.array(X)
    X = np.expand_dims(X, axis=0)
    Y = np.array(Y)
    Y = np.expand_dims(Y, axis=0)

    return X, Y

In [6]:
doc = "After the deduction of the costs of investing, " \
      "beating the stock market is a loser's game."

In [55]:
doc="V1 V2 V3 V1 V2 V3 V1 V2 V3 V4"

In [56]:
tokens = tokenize(doc)

In [57]:
tokens

['v1', 'v2', 'v3', 'v1', 'v2', 'v3', 'v1', 'v2', 'v3', 'v4']

In [58]:
word_to_id, id_to_word = mapping(tokens)

In [59]:
word_to_id

{'v2': 0, 'v4': 1, 'v1': 2, 'v3': 3}

In [60]:
id_to_word 

{0: 'v2', 1: 'v4', 2: 'v1', 3: 'v3'}

In [63]:
X, Y = generate_training_data(tokens, word_to_id, 2)

In [64]:
X

array([[2, 2, 0, 0, 0, 3, 3, 3, 3, 2, 2, 2, 2, 0, 0, 0, 0, 3, 3, 3, 3, 2,
        2, 2, 2, 0, 0, 0, 0, 3, 3, 3, 1, 1]])

In [65]:
Y

array([[0, 3, 2, 3, 2, 2, 0, 2, 0, 0, 3, 0, 3, 3, 2, 3, 2, 2, 0, 2, 0, 0,
        3, 0, 3, 3, 2, 3, 1, 2, 0, 1, 0, 3]])

In [66]:
X.shape

(1, 34)

In [67]:
vocab_size = len(id_to_word)

In [68]:
vocab_size

4

In [69]:
m = Y.shape[1]

In [70]:
m

34

In [71]:
Y.shape

(1, 34)

In [72]:
Y_one_hot = np.zeros((vocab_size, m))
Y_one_hot[Y.flatten(), np.arange(m)] = 1

In [73]:
Y_one_hot

array([[1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 0.,
        0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.,
        1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
        0., 0.],
       [0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.,
        1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0.,
        0., 0.],
       [0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1.,
        0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 0.,
        0., 1.]])

In [74]:
Y_one_hot[:,1]

array([0., 0., 0., 1.])

In [37]:
def initialize_wrd_emb(vocab_size, emb_size):
    """
    vocab_size: int. vocabulary size of your corpus or training data
    emb_size: int. word embedding size. How many dimensions to represent each vocabulary
    """
    WRD_EMB = np.random.randn(vocab_size, emb_size) * 0.01
    return WRD_EMB

def initialize_dense(input_size, output_size):
    """
    input_size: int. size of the input to the dense layer
    output_szie: int. size of the output out of the dense layer
    """
    W = np.random.randn(output_size, input_size) * 0.01
    return W

def initialize_parameters(vocab_size, emb_size):
    """
    initialize all the trianing parameters
    """
    WRD_EMB = initialize_wrd_emb(vocab_size, emb_size)
    W = initialize_dense(emb_size, vocab_size)
    
    parameters = {}
    parameters['WRD_EMB'] = WRD_EMB
    parameters['W'] = W
    
    return parameters

In [38]:
def ind_to_word_vecs(inds, parameters):
    """
    inds: numpy array. shape: (1, m)
    parameters: dict. weights to be trained
    """
    m = inds.shape[1]
    WRD_EMB = parameters['WRD_EMB']
    word_vec = WRD_EMB[inds.flatten(), :].T
    
    assert(word_vec.shape == (WRD_EMB.shape[1], m))
    
    return word_vec

def linear_dense(word_vec, parameters):
    """
    word_vec: numpy array. shape: (emb_size, m)
    parameters: dict. weights to be trained
    """
    m = word_vec.shape[1]
    W = parameters['W']
    Z = np.dot(W, word_vec)
    
    assert(Z.shape == (W.shape[0], m))
    
    return W, Z

def softmax(Z):
    """
    Z: output out of the dense layer. shape: (vocab_size, m)
    """
    softmax_out = np.divide(np.exp(Z), np.sum(np.exp(Z), axis=0, keepdims=True) + 0.001)
    
    assert(softmax_out.shape == Z.shape)

    return softmax_out

def forward_propagation(inds, parameters):
    word_vec = ind_to_word_vecs(inds, parameters)
    W, Z = linear_dense(word_vec, parameters)
    softmax_out = softmax(Z)
    
    caches = {}
    caches['inds'] = inds
    caches['word_vec'] = word_vec
    caches['W'] = W
    caches['Z'] = Z
    
    return softmax_out, caches

In [39]:
def cross_entropy(softmax_out, Y):
    """
    softmax_out: output out of softmax. shape: (vocab_size, m)
    """
    m = softmax_out.shape[1]
    cost = -(1 / m) * np.sum(np.sum(Y * np.log(softmax_out + 0.001), axis=0, keepdims=True), axis=1)
    return cost

In [40]:
def softmax_backward(Y, softmax_out):
    """
    Y: labels of training data. shape: (vocab_size, m)
    softmax_out: output out of softmax. shape: (vocab_size, m)
    """
    dL_dZ = softmax_out - Y
    
    assert(dL_dZ.shape == softmax_out.shape)
    return dL_dZ

def dense_backward(dL_dZ, caches):
    """
    dL_dZ: shape: (vocab_size, m)
    caches: dict. results from each steps of forward propagation
    """
    W = caches['W']
    word_vec = caches['word_vec']
    m = word_vec.shape[1]
    
    dL_dW = (1 / m) * np.dot(dL_dZ, word_vec.T)
    dL_dword_vec = np.dot(W.T, dL_dZ)

    assert(W.shape == dL_dW.shape)
    assert(word_vec.shape == dL_dword_vec.shape)
    
    return dL_dW, dL_dword_vec

def backward_propagation(Y, softmax_out, caches):
    dL_dZ = softmax_backward(Y, softmax_out)
    dL_dW, dL_dword_vec = dense_backward(dL_dZ, caches)
    
    gradients = dict()
    gradients['dL_dZ'] = dL_dZ
    gradients['dL_dW'] = dL_dW
    gradients['dL_dword_vec'] = dL_dword_vec
    
    return gradients

def update_parameters(parameters, caches, gradients, learning_rate):
    vocab_size, emb_size = parameters['WRD_EMB'].shape
    inds = caches['inds']
    WRD_EMB = parameters['WRD_EMB']
    dL_dword_vec = gradients['dL_dword_vec']
    m = inds.shape[-1]
    
    WRD_EMB[inds.flatten(), :] -= dL_dword_vec.T * learning_rate

    parameters['W'] -= learning_rate * gradients['dL_dW']

In [52]:
def skipgram_model_training(X, Y, vocab_size, emb_size, learning_rate, epochs, batch_size=256, parameters=None, print_cost=True, plot_cost=True):
    """
    X: Input word indices. shape: (1, m)
    Y: One-hot encodeing of output word indices. shape: (vocab_size, m)
    vocab_size: vocabulary size of your corpus or training data
    emb_size: word embedding size. How many dimensions to represent each vocabulary
    learning_rate: alaph in the weight update formula
    epochs: how many epochs to train the model
    batch_size: size of mini batch
    parameters: pre-trained or pre-initialized parameters
    print_cost: whether or not to print costs during the training process
    """
    costs = []
    m = X.shape[1]
    
    if parameters is None:
        parameters = initialize_parameters(vocab_size, emb_size)
    
    for epoch in range(epochs):
        epoch_cost = 0
        batch_inds = list(range(0, m, batch_size))
        np.random.shuffle(batch_inds)
        for i in batch_inds:
            X_batch = X[:, i:i+batch_size]
            Y_batch = Y[:, i:i+batch_size]

            softmax_out, caches = forward_propagation(X_batch, parameters)
            gradients = backward_propagation(Y_batch, softmax_out, caches)
            update_parameters(parameters, caches, gradients, learning_rate)
            cost = cross_entropy(softmax_out, Y_batch)
            epoch_cost += np.squeeze(cost)
            
        costs.append(epoch_cost)
        if print_cost and epoch % (epochs // 500) == 0:
            print("Cost after epoch {}: {}".format(epoch, epoch_cost))
        if epoch % (epochs // 100) == 0:
            learning_rate *= 0.98
            
#     if plot_cost:
#         plt.plot(np.arange(epochs), costs)
#         plt.xlabel('# of epochs')
#         plt.ylabel('cost')
    return parameters

In [75]:
paras = skipgram_model_training(X, Y_one_hot, vocab_size, 50, 0.05, 500, batch_size=10, parameters=None, print_cost=True)

Cost after epoch 0: 5.531989247275737
Cost after epoch 1: 5.531315821486185
Cost after epoch 2: 5.530713268671494
Cost after epoch 3: 5.530144281171688
Cost after epoch 4: 5.5295128244333895
Cost after epoch 5: 5.528881068672856
Cost after epoch 6: 5.528250690166484
Cost after epoch 7: 5.5276404906755365
Cost after epoch 8: 5.526905910239879
Cost after epoch 9: 5.526302170186733
Cost after epoch 10: 5.525526894920241
Cost after epoch 11: 5.5248199194242265
Cost after epoch 12: 5.5240466117118165
Cost after epoch 13: 5.523156460330774
Cost after epoch 14: 5.522281438830421
Cost after epoch 15: 5.521405191577828
Cost after epoch 16: 5.520341746911295
Cost after epoch 17: 5.5193188243755404
Cost after epoch 18: 5.518196018274261
Cost after epoch 19: 5.516977657924286
Cost after epoch 20: 5.515662487320852
Cost after epoch 21: 5.514153464163479
Cost after epoch 22: 5.5127029799687195
Cost after epoch 23: 5.511040425120801
Cost after epoch 24: 5.5091793344437345
Cost after epoch 25: 5.50728

In [101]:
for input_ind in range(vocab_size):
    input_word = id_to_word[input_ind]
    output_words = [id_to_word[output_ind] for output_ind in top_sorted_inds[::-1, input_ind]]
    print("{}'s neighbor words: {}".format(input_word, output_words))

v2's neighbor words: ['v1']
v4's neighbor words: ['v3']
v1's neighbor words: ['v3']
v3's neighbor words: ['v1']


In [99]:
X_test = np.arange(vocab_size)
X_test = np.expand_dims(X_test, axis=0)
softmax_test, _ = forward_propagation(X_test, paras)
top_sorted_inds = np.argsort(softmax_test, axis=0)[-1:,:]

In [100]:
top_sorted_inds

array([[2, 3, 3, 2]])

In [91]:
np.argsort(softmax_test, axis=0)[-4:,:]

array([[0, 2, 2, 3],
       [1, 1, 1, 1],
       [3, 0, 0, 0],
       [2, 3, 3, 2]])

In [95]:
X_test

array([[0, 1, 2, 3]])

In [98]:
id_to_word

{0: 'v2', 1: 'v4', 2: 'v1', 3: 'v3'}

In [105]:
paras['WRD_EMB'].shape

(4, 50)

In [106]:
paras['W'].shape

(4, 50)

In [107]:
paras

{'WRD_EMB': array([[-0.54569614, -0.0380432 , -0.41602529,  0.70900913, -0.01911173,
         -0.66307915,  0.1314658 , -0.92234908, -0.07331426, -0.64262143,
         -0.10510148,  0.76957309,  0.30280831,  0.18995917, -0.68691935,
         -0.39689627,  0.1847396 ,  0.72625545, -0.13159031, -0.37917101,
          0.13890421,  0.38502223, -0.61587931, -0.19813591, -0.61027395,
         -0.44750954,  0.42822442,  0.05292631,  0.02114448, -0.55452888,
          0.29286201,  0.49168148, -0.5034531 ,  0.4204554 , -0.21743596,
          0.15460933,  0.16580675, -0.56009311, -0.59979153, -0.09200651,
         -0.92023894, -0.56448016,  0.09391948, -0.58702173,  0.05672181,
          0.03275275, -0.47276156, -0.22793464, -0.76066527,  0.08153289],
        [ 0.81453394, -0.14318716,  0.20034493, -0.56222349, -0.04659272,
          0.53552213,  0.07397529,  0.89777691,  0.15077317,  0.37466312,
          0.04265845, -0.63062387,  0.03950082,  0.29916115,  0.43349633,
          0.10590618, -0.1