In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np

## Initialization

In [2]:
def initialize_wrd_emb(vocab_size, emb_size):
    WRD_EMB = np.random.uniform(size=(vocab_size, emb_size))
    return WRD_EMB

def initialize_dense(input_size, output_size):
    W = np.random.uniform(size=(output_size, input_size))
    b = np.random.uniform(size=(output_size, 1))
    return W, b

def initialize_parameters(vocab, emb_size):
    WRD_EMB = initialize_wrd_emb(vocab_size, emb_size)
    W, b = initialize_dense(emb_size, vocab_size)
    
    parameters = {}
    parameters['WRD_EMB'] = WRD_EMB
    parameters['W'] = W
    parameters['b'] = b
    
    return parameters

## Forward Propagation

In [3]:
def ind_to_word_vecs(inds, parameters):
    """
    inds -- shape: (CBOW_N, number of examples)
    """
    WRD_EMB = parameters['WRD_EMB']
    word_vecs = np.take(WRD_EMB, inds, axis=0)
    word_vecs = word_vecs.reshape(WRD_EMB.shape[1], inds.shape[0], -1)
    
    assert(word_vecs.shape == (WRD_EMB.shape[1], inds.shape[0], inds.shape[1]))
    
    return word_vecs

def average(word_vecs):
    word_vecs_avg = np.mean(word_vecs, axis=1)
    word_vecs_avg = word_vecs_avg.reshape(word_vecs.shape[0], -1)
    
    assert(word_vecs_avg.shape == (word_vecs.shape[0], word_vecs.shape[2]))
    
    return word_vecs_avg

def linear_dense(word_vecs_avg, parameters):
    W, b = parameters['W'], parameters['b']
    Z = np.dot(W, word_vecs_avg) + B
    
    assert(Z.shape == (W.shape[0], word_vecs_avg.shape[1]))
    
    return W, b, Z

def softmax(Z):
    softmax_out = np.divide(np.exp(Z), np.sum(np.exp(Z), axis=0))
    
    assert(softmax_out.shape == Z.shape)
    
    return softmax_out

def forward_propagation(inds, parameters):
    word_vecs = ind_to_word_vecs(inds, parameters)
    word_vecs_avg = average(word_vecs)
    W, b, Z = linear_dense(word_vecs_avg, parameters)
    softmax_out = softmax(Z)
    
    caches = {}
    caches['word_vecs'] = word_vecs
    caches['W'] = W
    caches['b'] = b
    
    return softmax_out

## Cost Function

In [None]:
def cross_entropy(softmax_out, Y):
    m = softmax_out.shapep[1]
    cost = -(1 / m) * np.sum(Y * np.log(softmax_out), axis=1)
    return cost

## Backward Propagation

In [25]:
import numpy as np

def word_embedding_layer(input_dim, output_dim, input_length):
    W = np.random.uniform(size=(input_dim, output_dim))
    return W

def dense(units, input_dim):
    W = np.random.uniform(size=(units, input_dim))
    b = np.random.uniform(size=(units, 1))
    return W, b

def softmax(Z):
    A = np.exp(Z)/np.sum(Z, axis=0)
    return A

def compute_cost(Y_pred, Y_true):
    m = Y_true.shape[0]
    return np.sum(-Y_true * np.log(Y_pred), axis=1)
    
def forward_pass(X, input_length, W_emb, W_dense, b_dense):
    emb_out = np.take(W_emb, X, axis=0).reshape(W_emb.shape[1], input_length, -1)
    print(emb_out.shape)
    mean_out = np.mean(emb_out, axis=1)
    print(mean_out.shape)
    dense_out = np.dot(W_dense, mean_out) + b_dense
    return dense_out

def compute_cost(dense_out, Y):
    m = Y.shape[1]
    cost = -(1/m) * np.sum(np.sum(Y * np.log(dense_out), axis=1), axis=0)
    assert(cost.shape == ())
    return cost

In [None]:
def softmax_backward(softmax_out, Y):
    dZ = softmax_out - Y
    return dZ # vocab_size x m

def dense_backward(dZ, W_dense, b_dense):
    m = W_dense.shape[1]
    dW = (1 / m) * dZ * W_dense.T  # 1 x (vocab_size x m) x (m x emb_size)
    db = (1 / m) * np.sum(dZ, axis=1)
    dA = 

### Toy data
Sentence: I(0) would(1) like(2) to(3) get(4) a(5) better(6) job(7).
vocab_size = 8
[0, 2] [1]
[1, 3] [2]
[2, 4] [3]
[3, 5] [4]
[4, 6] [5]
[5, 7] [6]

In [27]:
input_len = 2
vocab_size = 8
m = 6
emb_size = 15
X = np.array([[0, 1, 2, 3, 4, 5],
              [2, 3, 4, 5, 6, 7]]) # 2 x 6
Y = np.array([1, 2, 3, 4, 5, 6]) # 1 x 6
Y_one_hot = np.zeros((vocab_size, m))  # 8 x 6
Y_one_hot[Y.flatten(), np.arange(6)] = 1

W_emb = word_embedding_layer(vocab_size, emb_size, 2)
W_dense, b_dense = dense(vocab_size, emb_size)
dense_out = forward_pass(X, input_len, W_emb, W_dense, b_dense) # 8 x 6
softmax_out = softmax(dense_out) # 8 x 6
cost = compute_cost(softmax_out, Y_one_hot)
print(cost)

(15, 2, 6)
(15, 6)
-1.044234356730176


In [18]:
W_dense.shape

(8, 15)

input: (input_length, m) (4, 2)
w_emb: (vocab_size, embedding_size) (10, 3, 4 )
mean: (1, embedding_size)


In [30]:
Y_true = np.eye(5)[np.random.choice(5, 10)]
Y_hat = np.random.rand(10, 5)
Y_hat /= np.sum(Y_hat, axis=0)

In [37]:
cross_entropy(Y_hat, Y_true)

array([3.39684723, 4.07534568, 2.41349926, 2.5633011 , 2.00511614,
       2.18492226, 1.96788019, 2.22888813, 2.54548637, 2.24961894])

In [16]:
t = np.take(W, [[1, 3, 5, 7], [0, 1, 2, 3]], axis=0)

In [35]:
(-Y_true * np.log(Y_hat)).shape

(10, 5)

In [19]:
np.mean(t, axis=1).shape

(2, 3)

In [38]:
a = np.random.rand(10, 5)

In [40]:
a[::2]

array([[0.41495073, 0.19301629, 0.43419158, 0.96431508, 0.93404785],
       [0.06567073, 0.71629274, 0.4999399 , 0.58706599, 0.83155123],
       [0.96509983, 0.65287305, 0.09809129, 0.2517119 , 0.83942163],
       [0.21150526, 0.74722498, 0.53933994, 0.26333581, 0.83529272],
       [0.39386639, 0.88828519, 0.27778779, 0.47642352, 0.0420298 ]])

In [5]:
a = np.random.rand(10, 5)

In [6]:
a.shape

(10, 5)

In [8]:
np.sum(a, axis=1).shape

(10,)