In [None]:
!pip install keras
!pip install tensorflow
!pip install plot_keras_history
!pip install seaborn

In [None]:

from keras.utils import np_utils
from keras.preprocessing import sequence

import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda, Reshape
from keras.layers import Input
from keras.models import Model
from keras.layers import dot
from tensorflow.keras.activations import relu
from nltk import word_tokenize, sent_tokenize
from gensim.corpora.dictionary import Dictionary
import numpy as np

from keras.preprocessing.sequence import skipgrams
import gensim


Using TensorFlow backend.


In [None]:
# Download NLTK's 'punkt' tokenizer package. The word_tokenize / sent_tokenize
# helpers imported above are used in the data-preparation cell and presumably
# rely on this data being present — confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Data preparation: lower-case a small corpus, tokenize it, build the
# vocabulary and encode every sentence as a list of word ids.

AlotOftext = """Language users never choose words randomly, and language is essentially
non-random. Statistical hypothesis testing uses a null hypothesis, which
posits randomness. Hence, when we look at linguistic phenomena in corpora, 
the null hypothesis will never be true. Moreover, where there is enough
data, we shall (almost) always be able to establish that it is not true. In
corpus studies, we frequently do have enough data, so the fact that a relation 
between two phenomena is demonstrably non-random, does not support the inference 
that it is not arbitrary. We present experimental evidence
of how arbitrary associations between word frequencies and corpora are
systematically non-random. We review literature in which hypothesis testing 
has been used, and show how it has often led to unhelpful or misleading results.""".lower()

# Split the text into sentences, then every sentence into word tokens.
tokenized_text = [word_tokenize(sentence) for sentence in sent_tokenize(AlotOftext)]

# Build the id <-> token vocabulary from the tokenized sentences.
vocab = Dictionary(tokenized_text)
print(dict(vocab.items()))

# Sanity checks: token -> id, id -> token, and a whole sentence -> ids.
print(vocab.token2id['corpora'])
print(vocab[2])
sent0 = tokenized_text[0]
print(vocab.doc2idx(sent0))

# Register a dedicated padding token; it is added after all real words.
vocab.add_documents([['PAD']])
print(vocab.token2id['PAD'])

# Encode the corpus: one list of word ids per sentence.
corpusByWordID = [vocab.doc2idx(sentence) for sentence in tokenized_text]

# Hyper-parameters shared by the CBOW and skip-gram cells below.
vocab_size = len(vocab)  # includes the PAD entry
embed_size = 100
window_size = 2  # context window size

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(vocab.items())[:10])

{0: ',', 1: '.', 2: 'and', 3: 'choose', 4: 'essentially', 5: 'is', 6: 'language', 7: 'never', 8: 'non-random', 9: 'randomly', 10: 'users', 11: 'words', 12: 'a', 13: 'hypothesis', 14: 'null', 15: 'posits', 16: 'randomness', 17: 'statistical', 18: 'testing', 19: 'uses', 20: 'which', 21: 'at', 22: 'be', 23: 'corpora', 24: 'hence', 25: 'in', 26: 'linguistic', 27: 'look', 28: 'phenomena', 29: 'the', 30: 'true', 31: 'we', 32: 'when', 33: 'will', 34: '(', 35: ')', 36: 'able', 37: 'almost', 38: 'always', 39: 'data', 40: 'enough', 41: 'establish', 42: 'it', 43: 'moreover', 44: 'not', 45: 'shall', 46: 'that', 47: 'there', 48: 'to', 49: 'where', 50: 'arbitrary', 51: 'between', 52: 'corpus', 53: 'demonstrably', 54: 'do', 55: 'does', 56: 'fact', 57: 'frequently', 58: 'have', 59: 'inference', 60: 'relation', 61: 'so', 62: 'studies', 63: 'support', 64: 'two', 65: 'are', 66: 'associations', 67: 'evidence', 68: 'experimental', 69: 'frequencies', 70: 'how', 71: 'of', 72: 'present', 73: 'systematically',

In [None]:
# Create CBOW Training data
def generate_cbow_context_word_pairs(corpusByID, window_size, vocab_size, pad_id=None):
    """Build (context, target) training pairs for a CBOW model.

    For every word of every sentence, the context is the up-to
    ``2 * window_size`` neighbouring word ids (the word itself excluded),
    padded to exactly ``2 * window_size`` entries, and the target is the
    one-hot encoding of the word.

    Args:
        corpusByID: iterable of sentences, each a list of integer word ids.
        window_size: number of neighbours taken on each side of the word.
        vocab_size: number of classes used for the one-hot target.
        pad_id: id used to fill missing context positions. Defaults to
            ``vocab.token2id['PAD']`` from the notebook's global vocabulary.

    Returns:
        A pair ``(X, Y)`` of equally long lists: ``X[k]`` is the padded
        context returned by ``pad_sequences`` and ``Y[k]`` the one-hot target
        returned by ``to_categorical`` for the k-th word (each presumably
        with a leading dimension of 1 — callers index them with ``[0]``).
    """
    if pad_id is None:
        pad_id = vocab.token2id['PAD']

    context_length = window_size * 2
    X = []
    Y = []
    for sent in corpusByID:
        sentence_length = len(sent)
        for index, word in enumerate(sent):
            start = index - window_size
            end = index + window_size + 1

            # Neighbours inside the sentence, wrapped in a list because
            # pad_sequences expects a list of sequences.
            context_words = [[sent[i]
                              for i in range(start, end)
                              if 0 <= i < sentence_length and i != index]]

            # Pad on the left when the window runs off the sentence start,
            # otherwise on the right; a complete window is left unchanged.
            padding = 'pre' if start < 0 else 'post'
            x = sequence.pad_sequences(context_words, maxlen=context_length,
                                       padding=padding, value=pad_id)

            # Build the label from the *current* word for every position.
            # Previously interior words re-used the label computed for an
            # earlier boundary word, so most targets were wrong.
            y = np_utils.to_categorical([word], vocab_size)

            X.append(x)
            Y.append(y)

    return X, Y
            
# Build the CBOW training set, then print every pair as readable tokens.
X, Y = generate_cbow_context_word_pairs(corpusByWordID, window_size, vocab_size)

for x, y in zip(X, Y):
    context_tokens = [vocab[w] for w in x[0]]
    # Position of the first non-zero entry of the one-hot row = target word id.
    target_id = np.argwhere(y[0])[0][0]
    print('Context (X):', context_tokens, '-> Target (Y):', vocab[target_id])


In [None]:
# Define the CBOW model.
# NOTE: every `???` below is an intentional fill-in-the-blank placeholder; this
# cell does not run until each one is replaced, guided by the hint above it.
cbow = Sequential()
### hint: output_dim = the size of the embedding vectors (width of the embedding matrix)
### hint: input_length = the length of one training sample
cbow.add(Embedding(input_dim=vocab_size, output_dim=???, input_length=???))
# Take the mean of the embedded inputs over axis 1, then apply relu.
cbow.add(Lambda(lambda x: relu(K.mean(x, axis=1)), output_shape=(embed_size,)))
### hint: the total number of possible labels/words
### hint: activation='softmax' or 'sigmoid'
cbow.add(Dense(???, activation='???'))
### hint: loss='categorical_crossentropy' or 'binary_crossentropy'
cbow.compile(loss='???', optimizer='sgd')
cbow.summary()



In [None]:
# Train the CBOW model: each (context, target) sample is its own batch, and the
# per-sample losses are summed and printed once per epoch.

for epoch in range(1000):
    loss = sum((cbow.train_on_batch(x, y) for x, y in zip(X, Y)), 0.)
    print(epoch, loss)

In [None]:
## Save the word vectors in word2vec text format
# First line: "<number of rows> <vector size>", then one "<token> <values...>"
# line per word. The PAD entry is skipped explicitly so the declared row count
# (vocab_size - 1) matches the rows actually written.
vectors = cbow.get_weights()[0]  # presumably the Embedding layer's matrix — confirm
pad_id = vocab.token2id['PAD']
# The with-block closes the file even if a write fails.
with open('Cbow_vectors.txt', 'w') as f:
    f.write('{} {}\n'.format(vocab_size - 1, embed_size))
    for key in vocab:
        if key == pad_id:
            continue
        str_vec = ' '.join(map(str, list(vectors[key, :])))
        f.write('{} {}\n'.format(vocab[key], str_vec))

In [None]:
## Load the vectors back and validate
# The path must match the file written by the CBOW save cell.
w2v = gensim.models.KeyedVectors.load_word2vec_format('./Cbow_vectors.txt', binary=False)

# Only the last expression of a cell is displayed, so print the first query.
print(w2v.most_similar(positive=['that']))
w2v.most_similar(negative=['that'])

In [None]:
# Create skip-gram training data.

# One skipgrams(...) result per sentence, containing both positive and negative
# examples: element 0 holds the id pairs, element 1 the matching labels.
# NOTE(review): Keras documents word index 0 as a non-word that skipgrams
# skips; in this vocabulary id 0 is ',' — confirm this is acceptable.
skip_grams = [
    skipgrams(sentence_ids, vocabulary_size=vocab_size, window_size=2)
    for sentence_ids in corpusByWordID
]

# Show the first ten pairs of the first sentence with their labels.
first_sentence = skip_grams[0]
pairs, labels = first_sentence[0], first_sentence[1]
for i in range(10):
    left_id, right_id = pairs[i][0], pairs[i][1]
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
        vocab[left_id], left_id,
        vocab[right_id], right_id,
        labels[i]))

In [None]:
#define the skip-gram model

#define the model
input_word = Input((1,))
input_context_word = Input((1,))

word_embedding = Embedding(input_dim=vocab_size, output_dim=???,input_length=1,name='word_embedding')
context_embedding = Embedding(input_dim=vocab_size, output_dim=???,input_length=1,name='conotext_embedding')

word_embedding = word_embedding(input_word)
word_embedding_layer = Reshape((embed_size, 1))(word_embedding)

context_embedding = context_embedding(input_context_word)
context_embedding_layer = Reshape((embed_size, 1))(context_embedding)

# now perform the dot product operation word_embedding_vec * context_embedding_vec
dot_product = dot([???, ???], axes=1)
dot_product = Reshape((1,))(dot_product)

###hint:the total number of possible labels/words
###hint:activation='softmax' or 'sigmoid'
outputLayer = Dense(???, activation='???')(dot_product)

model = Model(input=[input_word, input_context_word], output=outputLayer)

###hint:loss='categorical_crossentropy' or 'binary_crossentropy'
model.compile(loss='???', optimizer='adam')

# view model summary
print(model.summary())

In [None]:
# Train the skip-gram model: one train_on_batch call per sentence, with the
# batch losses summed and printed once per epoch.
# Batch-local names are used on purpose so that the CBOW training data (X, Y)
# and the `labels` preview defined in earlier cells are not overwritten.

for epoch in range(1, 100):
    loss = 0
    for i, elem in enumerate(skip_grams):
        sentence_pairs, sentence_labels = elem[0], elem[1]
        if i % 10000 == 0:
            # `i` is the index of the sentence within skip_grams.
            print('Processed {} (skip_first, skip_second, relevance) pairs'.format(i))
        if not sentence_pairs:
            # No pairs were generated for this sentence: nothing to train on,
            # and the unzip below would fail on an empty list.
            continue
        # Split the [first, second] pairs into two parallel id columns (once).
        first_ids, second_ids = zip(*sentence_pairs)
        batch_inputs = [np.array(first_ids, dtype='int32'),
                        np.array(second_ids, dtype='int32')]
        batch_labels = np.array(sentence_labels, dtype='int32')
        loss += model.train_on_batch(batch_inputs, batch_labels)

    print('Epoch:', epoch, 'Loss:', loss)

In [None]:
# Get the trained weights; entry 0 is exported as the embedding matrix
# (presumably the 'word_embedding' layer's table — confirm).
weights = model.get_weights()
vectors = weights[0]

## Save the word vectors in word2vec text format
# First line: "<number of rows> <vector size>", then one "<token> <values...>"
# line per word. The PAD entry is skipped explicitly so the declared row count
# (vocab_size - 1) matches the rows actually written.
pad_id = vocab.token2id['PAD']
# The with-block closes the file even if a write fails.
with open('skipgram_vectors.txt', 'w') as f:
    f.write('{} {}\n'.format(vocab_size - 1, embed_size))
    for key in vocab:
        if key == pad_id:
            continue
        str_vec = ' '.join(map(str, list(vectors[key, :])))
        f.write('{} {}\n'.format(vocab[key], str_vec))

In [None]:
## Load the vectors back and validate
w2v = gensim.models.KeyedVectors.load_word2vec_format('./skipgram_vectors.txt', binary=False)

# Only the last expression of a cell is displayed, so print the first query.
print(w2v.most_similar(positive=['the']))
w2v.most_similar(negative=['the'])

In [None]:
#Exercise: modify the skip-gram model to share the same embedding layer between word and context
#Discussion: which is better? Why?