In [11]:
import re
import nltk
import emoji
import numpy as np
from nltk.tokenize import word_tokenize

In [12]:
def get_dict(data):
    """
    Build word <-> index lookup tables for the vocabulary found in `data`.

    Input:
        data: iterable of hashable, mutually comparable tokens (e.g. a list
              of words); duplicates are allowed and are collapsed
    Output:
        word2Ind: returns dictionary mapping the word to its index
        Ind2word: returns dictionary mapping the index to its word
    """
    # Sorting the unique tokens makes the index assignment deterministic:
    # index i always belongs to the i-th word in sorted order.
    words = sorted(set(data))

    word2Ind = {word: idx for idx, word in enumerate(words)}
    Ind2word = {idx: word for idx, word in enumerate(words)}

    return word2Ind, Ind2word

## Data preparation

In the data preparation phase, starting with a corpus of text, you will:

- Clean and tokenize the corpus.

- Extract the pairs of context words and center word that will make up the training data set for the CBOW model. The context words are the features that will be fed into the model, and the center words are the target values that the model will learn to predict.

- Create simple vector representations of the context words (features) and center words (targets) that can be used by the neural network of the CBOW model.




In [25]:
def tokenize(corpus):
    """
    Turn a raw text string into a list of lowercase tokens.

    Every run of the characters , ! ? ; - is first collapsed into a single
    period, then the text is split with NLTK's word tokenizer and each
    resulting token is lowercased.

    Input:
        corpus: the text to tokenize (string)
    Output:
        list of lowercase token strings
    """
    normalized = re.sub(r'[,!?;-]+', '.', corpus)
    return [token.lower() for token in nltk.word_tokenize(normalized)]

In [52]:
corpus = 'Who ❤️ "word embeddings" in 2020? I do!!!'
# Bind the result to its own name. Assigning it to `tokenize` would replace the
# function with a list, so re-running this cell (or any later call to
# tokenize) would fail with "TypeError: 'list' object is not callable".
tokens = tokenize(corpus)
print(tokens)

['who', '❤️', '``', 'word', 'embeddings', "''", 'in', '2020', '.', 'i', 'do', '.']


## Sliding window of words

Now that you have transformed the corpus into a list of clean tokens, you can slide a window of words across this list. For each window you can extract a center word and the context words.


The first argument of the `get_windows` function defined below is a list of words (or tokens). The second argument, `C`, is the context half-size. Recall that for a given center word, the context words are the `C` words to its left and the `C` words to its right.

In [28]:
def get_windows(words, C):
    """
    Generate (center word, context words) pairs with a sliding window.

    Only positions that have C words on both sides are used as centers, so
    the first C and last C words of `words` never appear as a center word.

    Input:
        words: list of words (or tokens)
        C: context half-size, i.e. how many words are taken on each side
    Output:
        yields tuples (center_word, context_words), where context_words is
        the C words before the center followed by the C words after it
    """
    for position in range(C, len(words) - C):
        before = words[position - C:position]
        after = words[position + 1:position + 1 + C]
        yield words[position], before + after

In [48]:
# Demo: slide a window of half-size 2 over a seven-token sentence and print
# each center word (x) together with its context words (y).
for x,y in get_windows(["i","am","happy","because","i","am","learning"],2):
    print("Center word\n",x,"\nContext words\n", y,"\n")

Center word
 happy 
Context words
 ['i', 'am', 'because', 'i'] 

Center word
 because 
Context words
 ['am', 'happy', 'i', 'am'] 

Center word
 i 
Context words
 ['happy', 'because', 'am', 'learning'] 



## Transforming words into vectors for the training set

To finish preparing the training set, you need to transform the context words and center words into vectors.
### Mapping words to indices and indices to words

The center words will be represented as one-hot vectors, and the vectors that represent context words are also based on one-hot vectors.

To create one-hot word vectors, you can start by mapping each unique word to a unique integer (or index). We have provided a helper function, `get_dict`, that creates two Python dictionaries: one that maps words to integers and one that maps integers back to words.


In [87]:
# Build both lookup tables (word -> index and index -> word) from a small
# example list of tokens.
word2Ind, Ind2word = get_dict(["i","am","happy","because","i","am","learning"])
# Trailing expression: shows the word -> index dictionary as the cell output.
word2Ind

{'am': 0, 'because': 1, 'happy': 2, 'i': 3, 'learning': 4}

In [88]:
# Trailing expression: shows the index -> word dictionary as the cell output.
Ind2word

{0: 'am', 1: 'because', 2: 'happy', 3: 'i', 4: 'learning'}

In [89]:
def word_to_one_hot_vector(word, word2Ind, V):
    """
    Encode a single word as a one-hot vector.

    Input:
        word: the word to encode; it is looked up with word2Ind[word]
        word2Ind: dictionary mapping each word to its index
        V: length of the returned vector
    Output:
        NumPy array of V zeros with a single 1 at position word2Ind[word]
    """
    encoding = np.zeros(V)
    position = word2Ind[word]
    encoding[position] = 1
    return encoding

In [91]:
# Demo: one-hot encode "learning"; the vector length V is taken to be the
# number of entries in word2Ind.
word_to_one_hot_vector("learning",word2Ind,len(word2Ind))

array([0., 0., 0., 0., 1.])

In [96]:
def context_words_to_vector(context_words, word2Ind, V):
    """
    Represent a list of context words as the average of their one-hot vectors.

    Input:
        context_words: list of words; each is looked up in word2Ind
        word2Ind: dictionary mapping each word to its index
        V: length of each one-hot vector (and of the result)
    Output:
        element-wise mean of the one-hot vectors of the context words, so a
        word that occurs k times among n context words contributes k/n at
        its index
    """
    one_hot_rows = []
    for token in context_words:
        one_hot_rows.append(word_to_one_hot_vector(token, word2Ind, V))
    # axis=0 averages across the words, keeping one value per vocabulary index.
    return np.mean(one_hot_rows, axis=0)

In [97]:
# Demo: average the one-hot vectors of four context words; "am" occurs twice
# in the list, the other two words once each.
context_words_to_vector(['am', 'happy', 'i', 'am'], word2Ind, len(word2Ind))

array([0.5 , 0.  , 0.25, 0.25, 0.  ])