In [2]:
!pip install keras
!pip install tensorflow
!pip install plot_keras_history
!pip install seaborn

Collecting plot_keras_history
  Downloading https://files.pythonhosted.org/packages/2e/5a/8d60eae5d2624877fba35b63b70cea4edcf603c75883c004bb4715470d10/plot_keras_history-1.1.23.tar.gz
Collecting sanitize_ml_labels
  Downloading https://files.pythonhosted.org/packages/69/2d/4db7caa3f08982c2ea48f6d6c19079246403a8742c14ea70875f48569dd6/sanitize_ml_labels-1.0.12.tar.gz
Collecting compress_json
  Downloading https://files.pythonhosted.org/packages/c0/9d/1a79dbc3b9d69c57be363b4cc8bd6f278f919a0973f46c7bb831438c9333/compress_json-1.0.4.tar.gz
Building wheels for collected packages: plot-keras-history, sanitize-ml-labels, compress-json
  Building wheel for plot-keras-history (setup.py) ... [?25l[?25hdone
  Created wheel for plot-keras-history: filename=plot_keras_history-1.1.23-cp36-none-any.whl size=6405 sha256=21d260131d4082ea0ec435c0a4275ca5e055740d7f8df33ce25cf901e26663e8
  Stored in directory: /root/.cache/pip/wheels/c0/78/33/da5ed769fab5587fcdae95271e8d19106e3b92b3ae2d46382d
  Building 

In [4]:

from keras.utils import np_utils
from keras.preprocessing import sequence

import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda, Reshape
from keras.layers import Input
from keras.models import Model
from keras.layers import dot
from tensorflow.keras.activations import relu
from nltk import word_tokenize, sent_tokenize
from gensim.corpora.dictionary import Dictionary
import numpy as np

from keras.preprocessing.sequence import skipgrams
import gensim


In [5]:
# using nltk tokenizer.  
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
#Data Preparation 

AlotOftext = """Language users never choose words randomly, and language is essentially
non-random. Statistical hypothesis testing uses a null hypothesis, which
posits randomness. Hence, when we look at linguistic phenomena in corpora, 
the null hypothesis will never be true. Moreover, where there is enough
data, we shall (almost) always be able to establish that it is not true. In
corpus studies, we frequently do have enough data, so the fact that a relation 
between two phenomena is demonstrably non-random, does not support the inference 
that it is not arbitrary. We present experimental evidence
of how arbitrary associations between word frequencies and corpora are
systematically non-random. We review literature in which hypothesis testing 
has been used, and show how it has often led to unhelpful or misleading results.""".lower()



#Tokenize text
tokenized_text = [word_tokenize(sent) for sent in sent_tokenize(AlotOftext)]

#Create Vocab as a Dictionary
vocab = Dictionary(tokenized_text)
print(dict(vocab.items()))

print(vocab.token2id['corpora'])
print(vocab[2])
sent0 = tokenized_text[0]
print(vocab.doc2idx(sent0))

vocab.add_documents([['PAD']])
dict(vocab.items())
print(vocab.token2id['PAD'])

corpusByWordID = list()
for sent in  tokenized_text:
    corpusByWordID.append(vocab.doc2idx(sent))

vocab_size = len(vocab)
embed_size = 100
window_size = 2 # context window size

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(vocab.items())[:10])

{0: ',', 1: '.', 2: 'and', 3: 'choose', 4: 'essentially', 5: 'is', 6: 'language', 7: 'never', 8: 'non-random', 9: 'randomly', 10: 'users', 11: 'words', 12: 'a', 13: 'hypothesis', 14: 'null', 15: 'posits', 16: 'randomness', 17: 'statistical', 18: 'testing', 19: 'uses', 20: 'which', 21: 'at', 22: 'be', 23: 'corpora', 24: 'hence', 25: 'in', 26: 'linguistic', 27: 'look', 28: 'phenomena', 29: 'the', 30: 'true', 31: 'we', 32: 'when', 33: 'will', 34: '(', 35: ')', 36: 'able', 37: 'almost', 38: 'always', 39: 'data', 40: 'enough', 41: 'establish', 42: 'it', 43: 'moreover', 44: 'not', 45: 'shall', 46: 'that', 47: 'there', 48: 'to', 49: 'where', 50: 'arbitrary', 51: 'between', 52: 'corpus', 53: 'demonstrably', 54: 'do', 55: 'does', 56: 'fact', 57: 'frequently', 58: 'have', 59: 'inference', 60: 'relation', 61: 'so', 62: 'studies', 63: 'support', 64: 'two', 65: 'are', 66: 'associations', 67: 'evidence', 68: 'experimental', 69: 'frequencies', 70: 'how', 71: 'of', 72: 'present', 73: 'systematically',

In [8]:
# Create CBOW Training data
def generate_cbow_context_word_pairs(corpusByID, window_size, vocab_size):
    context_length = window_size*2
    X=[]
    Y=[]
    for sent in corpusByID:
        sentence_length = len(sent)
        for index, word in enumerate(sent):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([sent[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)
            if start<0:
                x = sequence.pad_sequences(context_words, maxlen=context_length,padding='pre',value=vocab.token2id['PAD'])
                y = np_utils.to_categorical(label_word, vocab_size)
                X.append(x)
                Y.append(y)
                continue
            if end>=sentence_length:
                x = sequence.pad_sequences(context_words, maxlen=context_length,padding='post',value=vocab.token2id['PAD'])
                y = np_utils.to_categorical(label_word, vocab_size)
                X.append(x)
                Y.append(y)
                continue
            else:
                X.append(sequence.pad_sequences(context_words, maxlen=context_length))
                Y.append(y)
                continue
           
    return X,Y
            
# Test this out for some samples


X,Y = generate_cbow_context_word_pairs(corpusByWordID, window_size, vocab_size) 
   
for x, y in zip(X,Y):
    print('Context (X):', [vocab[w] for w in x[0]], '-> Target (Y):', vocab[np.argwhere(y[0])[0][0]])


Context (X): ['PAD', 'PAD', 'users', 'never'] -> Target (Y): language
Context (X): ['PAD', 'language', 'never', 'choose'] -> Target (Y): users
Context (X): ['language', 'users', 'choose', 'words'] -> Target (Y): users
Context (X): ['users', 'never', 'words', 'randomly'] -> Target (Y): users
Context (X): ['never', 'choose', 'randomly', ','] -> Target (Y): users
Context (X): ['choose', 'words', ',', 'and'] -> Target (Y): users
Context (X): ['words', 'randomly', 'and', 'language'] -> Target (Y): users
Context (X): ['randomly', ',', 'language', 'is'] -> Target (Y): users
Context (X): [',', 'and', 'is', 'essentially'] -> Target (Y): users
Context (X): ['and', 'language', 'essentially', 'non-random'] -> Target (Y): users
Context (X): ['language', 'is', 'non-random', '.'] -> Target (Y): essentially
Context (X): ['is', 'essentially', '.', 'PAD'] -> Target (Y): non-random
Context (X): ['essentially', 'non-random', 'PAD', 'PAD'] -> Target (Y): .
Context (X): ['PAD', 'PAD', 'hypothesis', 'testing

In [14]:
#define the model
cbow = Sequential()
###hint:output_dim = the shape of embedding matrix
###hint:input_length = the length of training sample
cbow.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=1000))
cbow.add(Lambda(lambda x: relu(K.mean(x, axis=1)), output_shape=(embed_size,)))
###hint:the total numbser of possible labels/words
###hint:activation='softmax' or 'sigmoid'
cbow.add(Dense(88, activation='softmax'))
###hint:loss='categorical_crossentropy' or 'binary_crossentropy'
cbow.compile(loss='categorical_crossentropy', optimizer='sgd')
cbow.summary()



Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1000, 100)         8800      
_________________________________________________________________
lambda_2 (Lambda)            (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 88)                8888      
Total params: 17,688
Trainable params: 17,688
Non-trainable params: 0
_________________________________________________________________


In [15]:
#Train the model

for epoch in range(1000):
    loss = 0.
    for x, y in zip(X,Y):
        loss += cbow.train_on_batch(x, y)
    print(epoch, loss)

0 631.4400882720947
1 604.3475439548492
2 578.5113105773926
3 554.1824760437012
4 531.6189250946045
5 511.0349793434143
6 492.5742378234863
7 476.2855553627014
8 462.10508823394775
9 449.8623617887497
10 439.3364369869232
11 430.2709106206894
12 422.4196993112564
13 415.5548372268677
14 409.48642122745514
15 404.0564351081848
16 399.1408244371414
17 394.63965129852295
18 390.47264075279236
19 386.5723046064377
20 382.8869435787201
21 379.3732533454895
22 375.9937798976898
23 372.7201807498932
24 369.52367520332336
25 366.38195073604584
26 363.2725887298584
27 360.18074691295624
28 357.088960647583
29 353.9818605184555
30 350.8458740711212
31 347.6740506887436
32 344.44955348968506
33 341.1670643091202
34 337.8167779445648
35 334.3973294496536
36 330.8989690542221
37 327.3278577327728
38 323.6788774728775
39 319.9549552202225
40 316.15971636772156
41 312.29038083553314
42 308.3603159189224
43 304.3673740029335
44 300.32521879673004
45 296.2412641644478
46 292.1251283288002
47 287.978543

In [16]:
## Save the wordvectors
f = open('Cbow_vectors.txt' ,'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = cbow.get_weights()[0]
for key in vocab:
    str_vec = ' '.join(map(str, list(vectors[key, :])))
    f.write('{} {}\n'.format(vocab[key], str_vec))
f.close()

In [18]:
## Load the vectors back and validate
w2v = gensim.models.KeyedVectors.load_word2vec_format('./Cbow_vectors.txt', binary=False)

w2v.most_similar(positive=['that'])
w2v.most_similar(negative=['that'])

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  if np.issubdtype(vec.dtype, np.int):


[('essentially', 0.41536492109298706),
 ('how', 0.3250637650489807),
 ('and', 0.32494449615478516),
 ('used', 0.29641997814178467),
 ('literature', 0.2688229978084564),
 ('review', 0.26424601674079895),
 ('systematically', 0.2554593086242676),
 ('which', 0.23558300733566284),
 ('randomly', 0.23268665373325348),
 ('misleading', 0.22910144925117493)]

In [19]:
#Create Skipgram Training data 

# generate skip-grams with both positive and negative examples
skip_grams = [skipgrams(sent, vocabulary_size=vocab_size, window_size=2) for sent in corpusByWordID]

# view sample skip-grams
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(10):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
        vocab[pairs[i][0]], pairs[i][0],           
        vocab[pairs[i][1]], pairs[i][1], 
        labels[i]))

(never (7), been (75)) -> 0
(randomly (9), choose (3)) -> 0
(essentially (4), non-random (8)) -> 1
(non-random (8), essentially (4)) -> 1
(essentially (4), language (6)) -> 1
(randomly (9), are (65)) -> 0
(users (10), hence (24)) -> 0
(words (11), testing (18)) -> 0
(language (6), linguistic (26)) -> 0
(never (7), does (55)) -> 0


In [24]:
#define the skip-gram model

#define the model
input_word = Input((1,))
input_context_word = Input((1,))

word_embedding = Embedding(input_dim=vocab_size, output_dim=100,input_length=1,name='word_embedding')
context_embedding = Embedding(input_dim=vocab_size, output_dim=100,input_length=1,name='conotext_embedding')

word_embedding = word_embedding(input_word)
word_embedding_layer = Reshape((embed_size, 1))(word_embedding)

context_embedding = context_embedding(input_context_word)
context_embedding_layer = Reshape((embed_size, 1))(context_embedding)

# now perform the dot product operation word_embedding_vec * context_embedding_vec
dot_product = dot([word_embedding_layer, context_embedding_layer], axes=1)
dot_product = Reshape((1,))(dot_product)

###hint:the total number of possible labels/words
###hint:activation='softmax' or 'sigmoid'
outputLayer = Dense(1, activation='sigmoid')(dot_product)

model = Model(inputs=[input_word, input_context_word], outputs=outputLayer)

###hint:loss='categorical_crossentropy' or 'binary_crossentropy'
model.compile(loss='binary_crossentropy', optimizer='adam')

# view model summary
print(model.summary())

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 1, 100)       8800        input_7[0][0]                    
__________________________________________________________________________________________________
conotext_embedding (Embedding)  (None, 1, 100)       8800        input_8[0][0]                    
_______________________________________________________________________________________

In [25]:
#train the model

for epoch in range(1, 100):
    loss = 0
    for i, elem in enumerate(skip_grams):
        pair_first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        pair_second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [pair_first_elem, pair_second_elem]
        Y = labels
        if i % 10000 == 0:
            print('Processed {} (skip_first, skip_second, relevance) pairs'.format(i))
        loss += model.train_on_batch(X,Y)  

    print('Epoch:', epoch, 'Loss:', loss)

Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 1 Loss: 4.85259735584259
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 2 Loss: 4.843343555927277
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 3 Loss: 4.834865927696228
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 4 Loss: 4.825725853443146
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 5 Loss: 4.81542044878006
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 6 Loss: 4.803482413291931
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 7 Loss: 4.789441525936127
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 8 Loss: 4.772817015647888
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 9 Loss: 4.753122687339783
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 10 Loss: 4.72987025976181
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 11 Loss: 4.702575325965881
Processed 0 (skip_first, skip_sec

In [28]:
#get the embeding matrix
weights = model.get_weights()
## Save the wordvectors
f = open('skipgram_vectors.txt' ,'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = model.get_weights()[0]
for key in vocab:
    str_vec = ' '.join(map(str, list(vectors[key, :])))
    f.write('{} {}\n'.format(vocab[key], str_vec))
f.close()

In [29]:
## Load the vectors back and validate
w2v = gensim.models.KeyedVectors.load_word2vec_format('./skipgram_vectors.txt', binary=False)
w2v.most_similar(positive=['the'])
w2v.most_similar(negative=['the'])

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  if np.issubdtype(vec.dtype, np.int):


[('associations', 0.26163050532341003),
 ('has', 0.2538308799266815),
 ('when', 0.23669277131557465),
 ('not', 0.22886672616004944),
 ('non-random', 0.2182091474533081),
 ('word', 0.21514591574668884),
 ('of', 0.20937611162662506),
 ('how', 0.19063930213451385),
 ('be', 0.17910176515579224),
 ('results', 0.17082341015338898)]

In [None]:
#Excerise: modeify the skipegram_model to share the same embeding layer between word and context
#Discussion: which is better? Why?  