In [30]:
import ssl
# Make unverified HTTPS contexts the default, presumably so the nltk.download
# call in the next cell works on machines with a broken certificate store.
# WARNING: HTTPS connections that rely on the default context will no longer
# verify server certificates; keep this to trusted, local experimentation.
_create_unverified_https_context = getattr(ssl, '_create_unverified_context', None)
if _create_unverified_https_context is not None:
    # Patch only when this Python build exposes the private factory.
    ssl._create_default_https_context = _create_unverified_https_context

In [31]:
import re
import nltk

# Fetch the Punkt tokenizer data used by nltk's word_tokenize (called below).
# NOTE(review): recent NLTK releases reportedly look for 'punkt_tab' instead —
# confirm against the installed NLTK version.
nltk.download('punkt')

import emoji
import numpy as np
from nltk.tokenize import word_tokenize
from Lab_support.utils2 import get_dict

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/akshatgupta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [32]:
# Toy sentence mixing an emoji, quotation marks, digits and repeated
# punctuation; the following cells clean it one step at a time.
corpus = 'Who ❤️ "word embeddings" in 2020? I do!!!'

In [33]:
# Collapse every run of ',', '!', '?', ';' or '-' into a single full stop.
data = re.sub(r'[,!?;-]+', '.', corpus)
data

'Who ❤️ "word embeddings" in 2020. I do.'

In [34]:
# Split the cleaned string into tokens; note `data` is rebound from a str to a list,
# so this cell cannot be re-run on its own without re-running the previous one.
data = nltk.word_tokenize(data)
data

['Who',
 '❤️',
 '``',
 'word',
 'embeddings',
 "''",
 'in',
 '2020',
 '.',
 'I',
 'do',
 '.']

In [35]:
# Keep alphabetic tokens, full stops and tokens containing an emoji; lowercase
# everything that survives. Digits and quote tokens are dropped.
# emoji.emoji_count is used instead of emoji.get_emoji_regexp(), which was
# removed in emoji 2.0 and raises AttributeError on current installs.
data = [ch.lower() for ch in data
        if ch.isalpha()
        or ch == '.'
        or emoji.emoji_count(ch) > 0]
data


['who', '❤️', 'word', 'embeddings', 'in', '.', 'i', 'do', '.']

In [36]:
def tokenize(corpus):
    """Clean a raw string and split it into lowercase tokens.

    Steps, in order:
      1. every run of ',', '!', '?', ';' or '-' is replaced by one '.';
      2. the result is split with nltk.word_tokenize;
      3. only alphabetic tokens, '.' and tokens containing an emoji are
         kept, each lowercased.

    Args:
        corpus: the raw text as a single string.

    Returns:
        A list of token strings.
    """
    data = re.sub(r'[,!?;-]+', '.', corpus)
    data = nltk.word_tokenize(data)
    # emoji.get_emoji_regexp() was removed in emoji 2.0; emoji_count answers
    # the same "does this token contain an emoji?" question.
    data = [ch.lower() for ch in data
            if ch.isalpha()
            or ch == '.'
            or emoji.emoji_count(ch) > 0]
    return data

In [37]:
# Switch to a short, punctuation-free sentence; `words` is the token list
# reused by the vocabulary and windowing cells below.
corpus = 'I am happy because I am learning'
words = tokenize(corpus)
words

['i', 'am', 'happy', 'because', 'i', 'am', 'learning']

In [38]:
def get_windows(words, C):
    """Slide a window of half-size C over `words`.

    Only positions with C full neighbours on each side are visited, so the
    first and last C items are never used as the center.

    Args:
        words: a sliceable sequence of tokens (a list in this notebook).
        C: number of context items taken from each side of the center.

    Yields:
        (context_words, center_word) pairs, where context_words is the C
        items before the center followed by the C items after it.
    """
    center = C
    while center + C < len(words):
        before = words[center - C:center]
        after = words[center + 1:center + C + 1]
        yield before + after, words[center]
        center += 1

In [39]:
# Print each center word (y) followed by its context words (x), half-size 2.
for x,y in get_windows(words,2):
    print(f"{y}\t{x}")

happy	['i', 'am', 'because', 'i']
because	['am', 'happy', 'i', 'am']
i	['happy', 'because', 'am', 'learning']


In [40]:
# Same windowing on a fresh sentence, this time with half-size 3.
sentence = "Now it's your turn: try with your own sentence!"
for x,y in get_windows(tokenize(sentence),3):
    print(f"{y}\t{x}")

turn	['now', 'it', 'your', 'try', 'with', 'your']
try	['it', 'your', 'turn', 'with', 'your', 'own']
with	['your', 'turn', 'try', 'your', 'own', 'sentence']
your	['turn', 'try', 'with', 'own', 'sentence', '.']


In [41]:
# Build the word -> index and index -> word lookups from the token list
# (get_dict is imported from Lab_support.utils2).
word2Ind, Ind2word = get_dict(words)

In [42]:
# Display the index -> word lookup.
Ind2word

{0: 'am', 1: 'because', 2: 'happy', 3: 'i', 4: 'learning'}

In [43]:
# Display the word -> index lookup.
word2Ind

{'am': 0, 'because': 1, 'happy': 2, 'i': 3, 'learning': 4}

In [44]:
# Vocabulary size: the number of entries in the word -> index lookup.
V = len(word2Ind)
V

5

In [45]:
# Index assigned to 'happy' in the vocabulary.
n = word2Ind['happy']
n

2

In [46]:
# Hand-built one-hot vector for 'happy': V zeros with a 1 at position n.
center_word_vector_happy = np.zeros(V)
center_word_vector_happy[n] = 1

In [47]:
def word_to_one_hot_vector(word, word2Ind, V):
    """Encode `word` as a one-hot numpy vector of length V.

    Args:
        word: the token to encode; must be a key of `word2Ind`.
        word2Ind: mapping from token to its integer position.
        V: length of the returned vector (the vocabulary size).

    Returns:
        A 1-D float array of zeros with a single 1 at word2Ind[word].

    Raises:
        KeyError: if `word` is not in `word2Ind`.
    """
    encoded = np.zeros(V)
    position = word2Ind[word]
    encoded[position] = 1
    return encoded

In [48]:
# The helper should reproduce the hand-built vector for 'happy'.
word_to_one_hot_vector('happy', word2Ind, V)

array([0., 0., 1., 0., 0.])

In [49]:
# Second check with a different word.
word_to_one_hot_vector('learning', word2Ind, V)

array([0., 0., 0., 0., 1.])

In [50]:
# Context of the center word 'happy' for half-size 2 (first window printed above).
context_words = ['i', 'am', 'because', 'i']

In [51]:
# One-hot encode every context word; the transpose is displayed so that each
# column is one context word and each row one vocabulary entry.
context_words_vectors = [word_to_one_hot_vector(w, word2Ind, V) for w in context_words]
np.array(context_words_vectors).T

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 0.],
       [1., 0., 0., 1.],
       [0., 0., 0., 0.]])

In [52]:
# Average the one-hot vectors element-wise (across the context words).
np.mean(context_words_vectors, axis=0)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [53]:
def context_words_to_vector(context_words, word2Ind, V):
    """Average the one-hot encodings of the given context words.

    Args:
        context_words: iterable of tokens, each a key of `word2Ind`.
        word2Ind: mapping from token to its integer position.
        V: length of each one-hot vector (the vocabulary size).

    Returns:
        The element-wise mean of the one-hot vectors, so a word that occurs
        k times among n context words contributes k/n at its position.
    """
    one_hot_rows = []
    for token in context_words:
        one_hot_rows.append(word_to_one_hot_vector(token, word2Ind, V))
    return np.mean(one_hot_rows, axis=0)

In [54]:
# Same context as the manual average above; the result should match it.
context_words_to_vector(['i', 'am', 'because', 'i'], word2Ind, V)

array([0.25, 0.25, 0.  , 0.5 , 0.  ])

In [55]:
# Context of the second window (center word 'because').
context_words_to_vector(['am', 'happy', 'i', 'am'], word2Ind, V)

array([0.5 , 0.  , 0.25, 0.25, 0.  ])

In [62]:
# For every window, show the raw tokens next to their vector encodings.
# Note: the loop rebinds the module-level `context_words` defined earlier.
for context_words, center_word in get_windows(words, 2):  # reminder: 2 is the context half-size
    print(f'Context words:\t{context_words} -> {context_words_to_vector(context_words, word2Ind, V)}')
    print(f'Center word:\t{center_word} -> {word_to_one_hot_vector(center_word, word2Ind, V)}')
    print()

Context words:	['i', 'am', 'because', 'i'] -> [0.25 0.25 0.   0.5  0.  ]
Center word:	happy -> [0. 0. 1. 0. 0.]

Context words:	['am', 'happy', 'i', 'am'] -> [0.5  0.   0.25 0.25 0.  ]
Center word:	because -> [0. 1. 0. 0. 0.]

Context words:	['happy', 'because', 'am', 'learning'] -> [0.25 0.25 0.25 0.   0.25]
Center word:	i -> [0. 0. 0. 1. 0.]



In [59]:
def get_training_example(words, C, word2Ind, V):
    """Generate (context vector, center vector) training pairs.

    Args:
        words: token sequence to slide the window over.
        C: context half-size passed to get_windows.
        word2Ind: mapping from token to its integer position.
        V: length of the produced vectors (the vocabulary size).

    Yields:
        A 2-tuple per window: the averaged one-hot vector of the context
        words, then the one-hot vector of the center word.
    """
    for window in get_windows(words, C):
        context_words, center_word = window
        context_vector = context_words_to_vector(context_words, word2Ind, V)
        center_vector = word_to_one_hot_vector(center_word, word2Ind, V)
        yield context_vector, center_vector

In [61]:
# Iterate over the generated training pairs (context half-size 2) and print them.
for context_words_vector, center_word_vector in get_training_example(words, 2, word2Ind, V):
    print(f'Context words vector:\t{context_words_vector}')
    print(f'Center word vector:\t{center_word_vector}')
    print()

Context words vector:	[0.25 0.25 0.   0.5  0.  ]
Center word vector:	[0. 0. 1. 0. 0.]

Context words vector:	[0.5  0.   0.25 0.25 0.  ]
Center word vector:	[0. 1. 0. 0. 0.]

Context words vector:	[0.25 0.25 0.25 0.   0.25]
Center word vector:	[0. 0. 0. 1. 0.]

