In [30]:
import re
import nltk

nltk.download('punkt')

import emoji
import numpy as np
from nltk.tokenize import word_tokenize
from utils2 import get_dict

import numpy as np

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\moidhassan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
emoji.__version__

'1.4.1'

### Data preprocessing

In [3]:
# Define a corpus
corpus = 'Who ❤️ "word embeddings" in 2020? I do!!!'

# Print original corpus
print(f'Corpus:  {corpus}')

Corpus:  Who ❤️ "word embeddings" in 2020? I do!!!


In [4]:
# Define the 'tokenize' function that will include the steps previously seen
def tokenize(corpus):
    data = re.sub(r'[,!?;-]+', '.', corpus)
    data = nltk.word_tokenize(data)  # tokenize string to words
    data = [ ch.lower() for ch in data
             if ch.isalpha()
             or ch == '.'
             or emoji.get_emoji_regexp().search(ch)
           ]
    return data

In [6]:
# Print new corpus
print(f'Corpus:  {corpus}')

# Save tokenized version of corpus into 'words' variable
words = tokenize(corpus)

# Print the tokenized version of the corpus
print(f'Words (tokens):  {words}')

Corpus:  Who ❤️ "word embeddings" in 2020? I do!!!
Words (tokens):  ['who', '❤️', 'word', 'embeddings', 'in', '.', 'i', 'do', '.']


In [7]:
# Run this with any sentence
tokenize("I am happy because I am learning")

['i', 'am', 'happy', 'because', 'i', 'am', 'learning']

### Sliding window of words

In [8]:
# Define the 'get_windows' function
def get_windows(words, C):
    i = C
    while i < len(words) - C:
        center_word = words[i]
        context_words = words[(i - C):i] + words[(i+1):(i+C+1)]
        yield context_words, center_word
        i += 1

In [9]:
# Print 'context_words' and 'center_word' for the new corpus with a 'context half-size' of 2
for x, y in get_windows(['i', 'am', 'happy', 'because', 'i', 'am', 'learning'], 2):
    print(f'{x}\t{y}')

['i', 'am', 'because', 'i']	happy
['am', 'happy', 'i', 'am']	because
['happy', 'because', 'am', 'learning']	i


In [10]:
# Print 'context_words' and 'center_word' for any sentence with a 'context half-size' of 1
for x, y in get_windows(tokenize(corpus), 1):
    print(f'{x}\t{y}')

['who', 'word']	❤️
['❤️', 'embeddings']	word
['word', 'in']	embeddings
['embeddings', '.']	in
['in', 'i']	.
['.', 'do']	i
['i', '.']	do


### Transforming words into vectors for training set

In [14]:
# Get 'word2Ind' and 'Ind2word' dictionaries for the tokenized corpus

# Define new corpus
corpus = 'I am happy because I am learning'

# Print new corpus
print(f'Corpus:  {corpus}')

# Save tokenized version of corpus into 'words' variable
words = tokenize(corpus)
word2Ind, Ind2word = get_dict(words)

print(word2Ind)

# Print value for the key 'i' within word2Ind dictionary
print("Index of the word 'i':  ",word2Ind['i'])

# Print 'Ind2word' dictionary
print(Ind2word)

# Print value for the key '2' within Ind2word dictionary
print("Word which has index 2:  ",Ind2word[2] )

# Save length of word2Ind dictionary into the 'V' variable
V = len(word2Ind)

# Print length of word2Ind dictionary
print("Size of vocabulary: ", V)

Corpus:  I am happy because I am learning
{'am': 0, 'because': 1, 'happy': 2, 'i': 3, 'learning': 4}
Index of the word 'i':   3
{0: 'am', 1: 'because', 2: 'happy', 3: 'i', 4: 'learning'}
Word which has index 2:   happy
Size of vocabulary:  5


In [16]:
# Define the 'word_to_one_hot_vector' function that will include the steps previously seen
def word_to_one_hot_vector(word, word2Ind, V):
    one_hot_vector = np.zeros(V)
    one_hot_vector[word2Ind[word]] = 1
    return one_hot_vector

In [20]:
# Print output of 'word_to_one_hot_vector' function for word 'happy'
print(word_to_one_hot_vector('happy', word2Ind, V))

# Print output of 'word_to_one_hot_vector' function for word 'learning'
print(word_to_one_hot_vector('learning', word2Ind, V))

[0. 0. 1. 0. 0.]
[0. 0. 0. 0. 1.]


#### Getting context word vectors

In [21]:
# Define list containing context words
context_words = ['i', 'am', 'because', 'i']

# Create one-hot vectors for each context word using list comprehension
context_words_vectors = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]

# Print one-hot vectors for each context word
context_words_vectors

[array([0., 0., 0., 1., 0.]),
 array([1., 0., 0., 0., 0.]),
 array([0., 1., 0., 0., 0.]),
 array([0., 0., 0., 1., 0.])]

In [22]:
# Compute mean of the vectors using numpy
np.mean(context_words_vectors, axis=0)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [23]:
# Define the 'context_words_to_vector' function that will include the steps previously seen
def context_words_to_vector(context_words, word2Ind, V):
    context_words_vectors = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]
    context_words_vectors = np.mean(context_words_vectors, axis=0)
    return context_words_vectors

In [24]:
# Print output of 'context_words_to_vector' function for context words: 'i', 'am', 'because', 'i'
context_words_to_vector(['i', 'am', 'because', 'i'], word2Ind, V)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [25]:
# Print output of 'context_words_to_vector' function for context words: 'am', 'happy', 'i', 'am'
context_words_to_vector(['am', 'happy', 'i', 'am'], word2Ind, V)

array([0.5 , 0.  , 0.25, 0.25, 0.  ])

### Building the training set

In [26]:
# Print corpus
words

['i', 'am', 'happy', 'because', 'i', 'am', 'learning']

In [27]:
# Print vectors associated to center and context words for corpus
for context_words, center_word in get_windows(words, 2):  # reminder: 2 is the context half-size
    print(f'Context words:  {context_words} -> {context_words_to_vector(context_words, word2Ind, V)}')
    print(f'Center word:  {center_word} -> {word_to_one_hot_vector(center_word, word2Ind, V)}')
    print()

Context words:  ['i', 'am', 'because', 'i'] -> [0.25 0.25 0.   0.5  0.  ]
Center word:  happy -> [0. 0. 1. 0. 0.]

Context words:  ['am', 'happy', 'i', 'am'] -> [0.5  0.   0.25 0.25 0.  ]
Center word:  because -> [0. 1. 0. 0. 0.]

Context words:  ['happy', 'because', 'am', 'learning'] -> [0.25 0.25 0.25 0.   0.25]
Center word:  i -> [0. 0. 0. 1. 0.]



In [28]:
# Define the generator function 'get_training_example'
def get_training_example(words, C, word2Ind, V):
    for context_words, center_word in get_windows(words, C):
        yield context_words_to_vector(context_words, word2Ind, V), word_to_one_hot_vector(center_word, word2Ind, V)

In [29]:
# Print vectors associated to center and context words for corpus using the generator function
for context_words_vector, center_word_vector in get_training_example(words, 2, word2Ind, V):
    print(f'Context words vector:  {context_words_vector}')
    print(f'Center word vector:  {center_word_vector}')
    print()

Context words vector:  [0.25 0.25 0.   0.5  0.  ]
Center word vector:  [0. 0. 1. 0. 0.]

Context words vector:  [0.5  0.   0.25 0.25 0.  ]
Center word vector:  [0. 1. 0. 0. 0.]

Context words vector:  [0.25 0.25 0.25 0.   0.25]
Center word vector:  [0. 0. 0. 1. 0.]



### ReLu and Softmax activation functions

In [31]:
# Define the 'relu' function that will include the steps previously seen
def relu(z):
    result = z.copy()
    result[result < 0] = 0
    return result

In [32]:
# Define a new vector and save it in the 'z' variable
z = np.array([[-1.25459881], [ 4.50714306], [ 2.31993942], [ 0.98658484], [-3.4398136 ]])

# Apply ReLU to it
relu(z)

array([[0.        ],
       [4.50714306],
       [2.31993942],
       [0.98658484],
       [0.        ]])

In [33]:
# Define the 'softmax' function that will include the steps previously seen
def softmax(z):
    e_z = np.exp(z)
    sum_e_z = np.sum(e_z)
    return e_z / sum_e_z

In [35]:
# Print softmax values for original vector
softmax([9, 8, 11, 10, 8.5])

print(np.sum(softmax([9, 8, 11, 10, 8.5])) == 1)

True


### Forward propagation

In [36]:
# Define the size of the word embedding vectors and save it in the variable 'N'
N = 3

# Define V. Remember this was the size of the vocabulary in the previous lecture notebooks
V = 5

In [37]:
# Define first matrix of weights
W1 = np.array([[ 0.41687358,  0.08854191, -0.23495225,  0.28320538,  0.41800106],
               [ 0.32735501,  0.22795148, -0.23951958,  0.4117634 , -0.23924344],
               [ 0.26637602, -0.23846886, -0.37770863, -0.11399446,  0.34008124]])

# Define second matrix of weights
W2 = np.array([[-0.22182064, -0.43008631,  0.13310965],
               [ 0.08476603,  0.08123194,  0.1772054 ],
               [ 0.1871551 , -0.06107263, -0.1790735 ],
               [ 0.07055222, -0.02015138,  0.36107434],
               [ 0.33480474, -0.39423389, -0.43959196]])

# Define first vector of biases
b1 = np.array([[ 0.09688219],
               [ 0.29239497],
               [-0.27364426]])

# Define second vector of biases
b2 = np.array([[ 0.0352008 ],
               [-0.36393384],
               [-0.12775555],
               [-0.34802326],
               [-0.07017815]])

print(f'V (vocabulary size): {V}')
print(f'N (embedding size / size of the hidden layer): {N}')
print(f'size of W1: {W1.shape} (NxV)')
print(f'size of b1: {b1.shape} (Nx1)')
print(f'size of W2: {W2.shape} (VxN)')
print(f'size of b2: {b2.shape} (Vx1)')

In [38]:
print(f'V (vocabulary size): {V}')
print(f'N (embedding size / size of the hidden layer): {N}')
print(f'size of W1: {W1.shape} (NxV)')
print(f'size of b1: {b1.shape} (Nx1)')
print(f'size of W2: {W2.shape} (VxN)')
print(f'size of b2: {b2.shape} (Vx1)')

V (vocabulary size): 5
N (embedding size / size of the hidden layer): 3
size of W1: (3, 5) (NxV)
size of b1: (3, 1) (Nx1)
size of W2: (5, 3) (VxN)
size of b2: (5, 1) (Vx1)


In [39]:
# Define the tokenized version of the corpus
words = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning']

# Get 'word2Ind' and 'Ind2word' dictionaries for the tokenized corpus
word2Ind, Ind2word = get_dict(words)

# Define the 'get_windows' function as seen in a previous notebook
def get_windows(words, C):
    i = C
    while i < len(words) - C:
        center_word = words[i]
        context_words = words[(i - C):i] + words[(i+1):(i+C+1)]
        yield context_words, center_word
        i += 1

# Define the 'word_to_one_hot_vector' function as seen in a previous notebook
def word_to_one_hot_vector(word, word2Ind, V):
    one_hot_vector = np.zeros(V)
    one_hot_vector[word2Ind[word]] = 1
    return one_hot_vector

# Define the 'context_words_to_vector' function as seen in a previous notebook
def context_words_to_vector(context_words, word2Ind, V):
    context_words_vectors = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]
    context_words_vectors = np.mean(context_words_vectors, axis=0)
    return context_words_vectors

# Define the generator function 'get_training_example' as seen in a previous notebook
def get_training_example(words, C, word2Ind, V):
    for context_words, center_word in get_windows(words, C):
        yield context_words_to_vector(context_words, word2Ind, V), word_to_one_hot_vector(center_word, word2Ind, V)

#### Training Example

In [44]:
# Save generator object in the 'training_examples' variable with the desired arguments
training_examples = get_training_example(words, 2, word2Ind, V)

# Get first values from generator
x_array, y_array = next(training_examples)

# Print context words vector
print(x_array)

# Print one hot vector of center word
print(y_array)

[0.25 0.25 0.   0.5  0.  ]
[0. 0. 1. 0. 0.]


In [45]:
# Copy vector
x = x_array.copy()

# Reshape it
x.shape = (V, 1)

# Print it
print(f'x:\n{x}\n')

# Copy vector
y = y_array.copy()

# Reshape it
y.shape = (V, 1)

# Print it
print(f'y:\n{y}')

x:
[[0.25]
 [0.25]
 [0.  ]
 [0.5 ]
 [0.  ]]

y:
[[0.]
 [0.]
 [1.]
 [0.]
 [0.]]


In [46]:
# Define the 'relu' function as seen in the previous lecture notebook
def relu(z):
    result = z.copy()
    result[result < 0] = 0
    return result

# Define the 'softmax' function as seen in the previous lecture notebook
def softmax(z):
    e_z = np.exp(z)
    sum_e_z = np.sum(e_z)
    return e_z / sum_e_z

In [73]:
def forward(w1,b1,w2,b2,x):
    
    z1 = np.dot(w1,x) + b1
    h = relu(z1)
    z2 = np.dot(w2,h) + b2
    y_hat = softmax(z2)

    return y_hat,h

def cross_entropy_loss(y_predicted, y_actual):
    # Fill the loss variable with your code
    loss = np.sum(-np.log(y_predicted)*y_actual)
    return loss

In [74]:
y_hat,h = forward(W1,b1,W2,b2,x)
# Print value of cross entropy loss for prediction and target value
print(cross_entropy_loss(y_hat, y))


1.4650152923611106


### Backpropagation

In [69]:
def backprop(y,y_hat,h,w1,b1,w2,b2,x,alpha = 0.03):

    grad_b2 = y_hat - y
    grad_W2 = np.dot(y_hat - y, h.T)
    grad_b1 = relu(np.dot(w2.T, y_hat - y))
    grad_W1 = np.dot(relu(np.dot(w2.T, y_hat - y)), x.T)

    W1_new = w1 - alpha * grad_W1
    W2_new = w2 - alpha * grad_W2
    b1_new = b1 - alpha * grad_b1
    b2_new = b2 - alpha * grad_b2

    return W1_new,b1_new,W2_new,b2_new
    
    

In [70]:
W1_new,b1_new,W2_new,b2_new = backprop(y,y_hat,h,W1,b1,W2,b2,x)

In [75]:
y_hat,h = forward(W1_new,b1_new,W2_new,b2_new,x)
# Print value of cross entropy loss for prediction and target value
print(cross_entropy_loss(y_hat, y))

1.4310498009586927


### Forward and backward prop with epochs=10

In [81]:
N = 3
V = 5
# Define first matrix of weights
W1 = np.array([[ 0.41687358,  0.08854191, -0.23495225,  0.28320538,  0.41800106],
               [ 0.32735501,  0.22795148, -0.23951958,  0.4117634 , -0.23924344],
               [ 0.26637602, -0.23846886, -0.37770863, -0.11399446,  0.34008124]])

# Define second matrix of weights
W2 = np.array([[-0.22182064, -0.43008631,  0.13310965],
               [ 0.08476603,  0.08123194,  0.1772054 ],
               [ 0.1871551 , -0.06107263, -0.1790735 ],
               [ 0.07055222, -0.02015138,  0.36107434],
               [ 0.33480474, -0.39423389, -0.43959196]])

# Define first vector of biases
b1 = np.array([[ 0.09688219],
               [ 0.29239497],
               [-0.27364426]])

# Define second vector of biases
b2 = np.array([[ 0.0352008 ],
               [-0.36393384],
               [-0.12775555],
               [-0.34802326],
               [-0.07017815]])

print(f'V (vocabulary size): {V}')
print(f'N (embedding size / size of the hidden layer): {N}')
print(f'size of W1: {W1.shape} (NxV)')
print(f'size of b1: {b1.shape} (Nx1)')
print(f'size of W2: {W2.shape} (VxN)')
print(f'size of b2: {b2.shape} (Vx1)')
for i in range(10):
    if i == 0:
        y_hat,h = forward(W1,b1,W2,b2,x)
        # Print value of cross entropy loss for prediction and target value
        print(f"loss at 1st iteration {cross_entropy_loss(y_hat, y)}")
    elif i == 9:
        y_hat,h = forward(W1,b1,W2,b2,x)
        print(f"loss at last iteration {cross_entropy_loss(y_hat, y)}")
    else:
        W1,b1,W2,b2 = backprop(y,y_hat,h,W1,b1,W2,b2,x)
        y_hat,h = forward(W1,b1,W2,b2,x)
        print(f"loss at {i+1} iteration {cross_entropy_loss(y_hat, y)}")

V (vocabulary size): 5
N (embedding size / size of the hidden layer): 3
size of W1: (3, 5) (NxV)
size of b1: (3, 1) (Nx1)
size of W2: (5, 3) (VxN)
size of b2: (5, 1) (Vx1)
loss at 1st iteration 1.4650152923611106
loss at 2 iteration 1.4310498009586927
loss at 3 iteration 1.397790348042014
loss at 4 iteration 1.3652380995335804
loss at 5 iteration 1.3333932111291222
loss at 6 iteration 1.302254837755199
loss at 7 iteration 1.2718211490382132
loss at 8 iteration 1.242089350518089
loss at 9 iteration 1.213055710269879
loss at last iteration 1.213055710269879


### Extracting word embeddings

In [82]:
y_hat,h = forward(W1,b1,W2,b2,x)
print(f"loss is {cross_entropy_loss(y_hat, y)}")

loss is 1.213055710269879


In [83]:
for i in range(V):
    print(Ind2word[i])

am
because
happy
i
learning


In [84]:
# loop through each word of the vocabulary
for word in word2Ind:
    # extract the column corresponding to the index of the word in the vocabulary
    word_embedding_vector = W1[:, word2Ind[word]]
    
    print(f'{word}: {word_embedding_vector}')

am: [0.41687358 0.32735501 0.25650771]
because: [ 0.08854191  0.22795148 -0.24833717]
happy: [-0.23495225 -0.23951958 -0.37770863]
i: [ 0.28320538  0.4117634  -0.13373108]
learning: [ 0.41800106 -0.23924344  0.34008124]


In [85]:
# loop through each word of the vocabulary
for word in word2Ind:
    # extract the column corresponding to the index of the word in the vocabulary
    word_embedding_vector = W2.T[:, word2Ind[word]]
    
    print(f'{word}: {word_embedding_vector}')

am: [-0.23745169 -0.45738218  0.13310965]
because: [0.06853979 0.05289671 0.1772054 ]
happy: [ 0.25198117  0.05213053 -0.1790735 ]
i: [ 0.05515313 -0.04704219  0.36107434]
learning: [ 0.31723504 -0.42491515 -0.43959196]


In [86]:
W3 = (W1+W2.T)/2

# loop through each word of the vocabulary
for word in word2Ind:
    # extract the column corresponding to the index of the word in the vocabulary
    word_embedding_vector = W3[:, word2Ind[word]]
    
    print(f'{word}: {word_embedding_vector}')

am: [ 0.08971095 -0.06501358  0.19480868]
because: [ 0.07854085  0.1404241  -0.03556588]
happy: [ 0.00851446 -0.09369452 -0.27839106]
i: [0.16917926 0.18236061 0.11367163]
learning: [ 0.36761805 -0.33207929 -0.04975536]
