In [1]:
import numpy as np
from Lab_support.utils2 import get_dict

In [2]:
N = 3
V = 5

In [3]:
W1 = np.array([[ 0.41687358,  0.08854191, -0.23495225,  0.28320538,  0.41800106],
               [ 0.32735501,  0.22795148, -0.23951958,  0.4117634 , -0.23924344],
               [ 0.26637602, -0.23846886, -0.37770863, -0.11399446,  0.34008124]])

W2 = np.array([[-0.22182064, -0.43008631,  0.13310965],
               [ 0.08476603,  0.08123194,  0.1772054 ],
               [ 0.1871551 , -0.06107263, -0.1790735 ],
               [ 0.07055222, -0.02015138,  0.36107434],
               [ 0.33480474, -0.39423389, -0.43959196]])

b1 = np.array([[ 0.09688219],
               [ 0.29239497],
               [-0.27364426]])

b2 = np.array([[ 0.0352008 ],
               [-0.36393384],
               [-0.12775555],
               [-0.34802326],
               [-0.07017815]])

In [4]:
print(f'V (vocabulary size): {V}')
print(f'N (embedding size): {N}')
print(f'size of W1: {W1.shape} (NxV)')
print(f'size of b1: {b1.shape} (Nx1)')
print(f'size of W2: {W2.shape} (VxN)')
print(f'size of b2: {b2.shape} (Vx1)')

V (vocabulary size): 5
N (embedding size): 3
size of W1: (3, 5) (NxV)
size of b1: (3, 1) (Nx1)
size of W2: (5, 3) (VxN)
size of b2: (5, 1) (Vx1)


In [5]:
words = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']
word2Ind, Ind2word = get_dict(words)

In [6]:
def get_windows(words, C):
    i = C
    while i < len(words) - C:
        center_word = words[i]
        context_words = words[(i - C):i] + words[(i+1):(i+C+1)]
        yield context_words, center_word
        i += 1

def word_to_one_hot_vector(word, word2Ind, V):
    one_hot_vector = np.zeros(V)
    one_hot_vector[word2Ind[word]] = 1
    return one_hot_vector

def context_words_to_vector(context_words, word2Ind, V):
    context_words_vectors = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]
    context_words_vectors = np.mean(context_words_vectors, axis=0)
    return context_words_vectors

def get_training_example(words, C, word2Ind, V):
    for context_words, center_word in get_windows(words, C):
        yield context_words_to_vector(context_words, word2Ind, V), word_to_one_hot_vector(center_word, word2Ind, V)

In [7]:
training_examples = get_training_example(words, 2, word2Ind, V)
training_examples

<generator object get_training_example at 0x108f11770>

In [8]:
x_array, y_array = next(training_examples)
x_array, y_array

(array([0.25, 0.25, 0.  , 0.5 , 0.  ]), array([0., 0., 1., 0., 0.]))

In [9]:
x = x_array.reshape((V,1))
y = y_array.reshape((V,1))

In [10]:
def relu(z):
    result = z.copy()
    result[result < 0] = 0
    return result

def softmax(z):
    e_z = np.exp(z)
    sum_e_z = np.sum(e_z)
    return e_z / sum_e_z

In [11]:
z1 = np.dot(W1, x) + b1
h = relu(z1)

In [12]:
z1

array([[ 0.36483875],
       [ 0.63710329],
       [-0.3236647 ]])

In [13]:
h

array([[0.36483875],
       [0.63710329],
       [0.        ]])

In [14]:
z2 = np.dot(W2,h) + b2
y_hat = softmax(z2)

In [15]:
z2

array([[-0.31973737],
       [-0.28125477],
       [-0.09838369],
       [-0.33512159],
       [-0.19919612]])

In [16]:
y_hat

array([[0.18519074],
       [0.19245626],
       [0.23107446],
       [0.18236353],
       [0.20891502]])

In [17]:
Ind2word[np.argmax(y_hat)]

'happy'

In [18]:
y

array([[0.],
       [0.],
       [1.],
       [0.],
       [0.]])

In [19]:
def cross_entropy_loss(y_predicted, y_actual):
    loss = -np.sum(y_actual*np.log(y_predicted))
    return loss

In [20]:
cross_entropy_loss(y_hat, y)

1.4650152923611106

In [21]:
grad_b2 = y_hat - y
grad_b2

array([[ 0.18519074],
       [ 0.19245626],
       [-0.76892554],
       [ 0.18236353],
       [ 0.20891502]])

In [22]:
grad_W2 = np.dot(y_hat - y, h.T)
grad_W2

array([[ 0.06756476,  0.11798563,  0.        ],
       [ 0.0702155 ,  0.12261452,  0.        ],
       [-0.28053384, -0.48988499,  0.        ],
       [ 0.06653328,  0.1161844 ,  0.        ],
       [ 0.07622029,  0.13310045,  0.        ]])

In [23]:
grad_b1 = relu(np.dot(W2.T, y_hat - y))
grad_b1

array([[0.        ],
       [0.        ],
       [0.17045858]])

In [24]:
grad_W1 = np.dot(relu(np.dot(W2.T, y_hat - y)), x.T)
grad_W1

array([[0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.04261464, 0.04261464, 0.        , 0.08522929, 0.        ]])

In [25]:
alpha = 0.03

In [26]:
W1 = W1 - alpha * grad_W1
b1 = b1 - alpha * grad_b1
W2 = W2 - alpha * grad_W2
b2 = b2 - alpha * grad_b2

In [27]:
W1

array([[ 0.41687358,  0.08854191, -0.23495225,  0.28320538,  0.41800106],
       [ 0.32735501,  0.22795148, -0.23951958,  0.4117634 , -0.23924344],
       [ 0.26509758, -0.2397473 , -0.37770863, -0.11655134,  0.34008124]])

In [28]:
z1_new = np.dot(W1,x) + b1
h_new = relu(z1_new)
z2_new = np.dot(W2,h) + b2
y_hat_new = softmax(z2_new)
y_hat_new

array([[0.18333224],
       [0.19046095],
       [0.23905783],
       [0.18055697],
       [0.20659202]])

In [29]:
cross_entropy_loss(y_hat_new,y)

1.4310498009586927

### Option 1

In [30]:
W1

array([[ 0.41687358,  0.08854191, -0.23495225,  0.28320538,  0.41800106],
       [ 0.32735501,  0.22795148, -0.23951958,  0.4117634 , -0.23924344],
       [ 0.26509758, -0.2397473 , -0.37770863, -0.11655134,  0.34008124]])

In [32]:
for word in word2Ind:
    word_embedding_vector = W1[:, word2Ind[word]]
    print(f'{word}: {word_embedding_vector}')

am: [0.41687358 0.32735501 0.26509758]
because: [ 0.08854191  0.22795148 -0.2397473 ]
happy: [-0.23495225 -0.23951958 -0.37770863]
i: [ 0.28320538  0.4117634  -0.11655134]
learning: [ 0.41800106 -0.23924344  0.34008124]


### Option 2

In [35]:
W2

array([[-0.22384758, -0.43362588,  0.13310965],
       [ 0.08265956,  0.0775535 ,  0.1772054 ],
       [ 0.19557112, -0.04637608, -0.1790735 ],
       [ 0.06855622, -0.02363691,  0.36107434],
       [ 0.33251813, -0.3982269 , -0.43959196]])

In [36]:
for word in word2Ind:
    word_embedding_vector = W2[word2Ind[word],:]
    print(f'{word}: {word_embedding_vector}')

am: [-0.22384758 -0.43362588  0.13310965]
because: [0.08265956 0.0775535  0.1772054 ]
happy: [ 0.19557112 -0.04637608 -0.1790735 ]
i: [ 0.06855622 -0.02363691  0.36107434]
learning: [ 0.33251813 -0.3982269  -0.43959196]


### Option 3

In [37]:
W3 = 0.5 * (W1.T + W2)

In [38]:
for word in word2Ind:
    word_embedding_vector = W3[word2Ind[word],:]
    print(f'{word}: {word_embedding_vector}')

am: [ 0.096513   -0.05313543  0.19910362]
because: [ 0.08560074  0.15275249 -0.03127095]
happy: [-0.01969057 -0.14294783 -0.27839106]
i: [0.1758808  0.19406324 0.1222615 ]
learning: [ 0.3752596  -0.31873517 -0.04975536]
