In [1]:
import numpy as np
import gensim
import keras.backend as K
from keras.models import Sequential
from keras.utils import np_utils
from keras.layers import Dense, Embedding, Lambda
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

In [2]:
# Read the corpus line by line, keeping only lines that contain at least two
# space characters (shorter lines give too little context for a CBOW window).
# The `with` block guarantees the file handle is closed once the lines are read.
with open('corona.txt', 'r') as data:
    corona_data = [text for text in data if text.count(' ') >= 2]

# Tokenizer that will map each distinct word to an integer id.
tokenize = Tokenizer()

In [3]:
# Build the word -> integer-id vocabulary from the raw text lines.
tokenize.fit_on_texts(corona_data)
# Replace each text line by its list of word ids.
# NOTE: this rebinds corona_data from raw strings to id sequences, so this
# cell is not idempotent -- re-run the loading cell before re-running it.
corona_data = tokenize.texts_to_sequences(corona_data)

In [4]:
# Sum of the lengths of all encoded sequences, i.e. the total number of token
# occurrences in the corpus.
# NOTE(review): despite its name this is not the number of distinct words
# (that would be len(tokenize.word_index)); it is used below as the embedding
# input size and the softmax width -- confirm that is intended.
total_vocab = sum(map(len, corona_data))
total_vocab

198

In [5]:
# Number of context tokens taken on each side of a target word.
window_size = 2
# Distinct words known to the tokenizer, plus one because Keras' Tokenizer
# starts word ids at 1 and keeps id 0 free (used for padding).
word_count = 1 + len(tokenize.word_index)

In [6]:
def cbow_model(data, window_size, total_vocab):
    """Yield one CBOW training pair per token in ``data``.

    For every position in every sequence the surrounding tokens (up to
    ``window_size`` on each side, excluding the token itself) form the
    context and the token at that position is the prediction target.

    Args:
        data: iterable of token-id sequences, e.g. the result of
            ``Tokenizer.texts_to_sequences``.
        window_size: number of neighbouring tokens taken on each side of the
            target position.
        total_vocab: number of classes for the one-hot target; it must be
            larger than every token id that occurs in ``data``.

    Yields:
        ``(context, target)`` tuples. ``context`` is the padded context-id
        array of shape ``(1, 2 * window_size)`` (``pad_sequences`` left-pads
        with zeros by default, so windows cut off at a sequence edge still
        have full width). ``target`` is the one-hot array of shape
        ``(1, total_vocab)``.
    """
    total_len = window_size * 2
    for text in data:
        text_len = len(text)
        for idx, word in enumerate(text):
            # Clamp the window to the sequence so only valid indices are visited.
            begin = max(idx - window_size, 0)
            end = min(idx + window_size + 1, text_len)

            # Index the sequence with [] (calling it raised TypeError) and
            # skip the target position itself.
            context_w = [[text[i] for i in range(begin, end) if i != idx]]
            target = [word]

            # `maxlen` is the pad_sequences keyword for the output width.
            c = sequence.pad_sequences(context_w, maxlen=total_len)
            final_target = np_utils.to_categorical(target, total_vocab)

            yield (c, final_target)

In [7]:
# CBOW network: look up a 100-dimensional embedding for each of the
# 2 * window_size context ids, average those embeddings over the context
# axis, then score every class with a softmax layer.
model = Sequential([
    Embedding(input_dim=total_vocab, output_dim=100, input_length=window_size * 2),
    Lambda(lambda x: K.mean(x, axis=1), output_shape=(100,)),
    Dense(total_vocab, activation='softmax'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 100)            19800     
                                                                 
 lambda (Lambda)             (None, 100)               0         
                                                                 
 dense (Dense)               (None, 198)               19998     
                                                                 
Total params: 39,798
Trainable params: 39,798
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Configure the model for training with categorical cross-entropy (targets are
# one-hot vectors) and the Adam optimizer.
# NOTE(review): no fit / train_on_batch call is visible between this cell and
# the cell that reads the weights, so the exported embeddings appear to be the
# random initialisation -- confirm whether a training cell is missing.
model.compile(loss='categorical_crossentropy', optimizer = 'adam')

In [9]:
# Start the word2vec text-format export. The first line of that format is
# "<number of vectors> <vector dimensionality>".
# - The path is 'vectors.txt' so it matches the file the loading cell reads.
# - The declared count is the number of words actually written afterwards
#   (one row per entry of tokenize.word_index); a larger count makes gensim
#   fail with an unexpected end-of-file unless the load is capped.
# The handle is deliberately left open: the export loop in a later cell writes
# the rows and closes it.
testfile = open('vectors.txt', 'w')
testfile.write('{} {}\n'.format(len(tokenize.word_index), 100))

8

In [10]:
# Take the first weight array of the model; with the layer stack built above
# the first layer is the Embedding, so this is presumably its
# (input_dim, 100) lookup matrix with one row per token id.
weights = model.get_weights()[0]
weights

array([[ 0.00507251,  0.0244944 ,  0.00256344, ..., -0.00014775,
        -0.02308915,  0.01272514],
       [-0.01192098,  0.04134912,  0.00154469, ...,  0.02686591,
         0.00136548,  0.00067942],
       [-0.01928465,  0.01018773,  0.03719164, ...,  0.00619762,
         0.00544707, -0.00177746],
       ...,
       [-0.01549976, -0.04142914,  0.00382031, ...,  0.04257217,
        -0.01160747,  0.00997703],
       [-0.00408662,  0.02358354, -0.0170808 , ..., -0.04474369,
         0.04081687,  0.01577801],
       [ 0.01908514, -0.04749229, -0.04895231, ...,  0.0428634 ,
        -0.0422344 ,  0.02018325]], dtype=float32)

In [11]:
# Write one "<word> <v1> <v2> ..." line per vocabulary word, using the row of
# the weight matrix addressed by that word's integer id.
for word, row_idx in tokenize.word_index.items():
    components = [str(value) for value in weights[row_idx]]
    testfile.write('{} {}\n'.format(word, ' '.join(components)))
# All rows written: flush and release the export file.
testfile.close()

In [14]:
# Load the exported vectors as gensim KeyedVectors from the plain-text
# (binary=False) word2vec format; limit=100 caps loading at the first 100
# vectors in the file.
op = gensim.models.KeyedVectors.load_word2vec_format('vectors.txt', binary = False, limit=100)

In [17]:
# List the words whose vectors are closest (by cosine similarity) to 'virus'.
op.most_similar(positive=['virus'])

[('number', 0.30579182505607605),
 ('reproductive', 0.22632810473442078),
 ('has', 0.21565236151218414),
 ('symptomatic', 0.16818612813949585),
 ('higher', 0.16382279992103577),
 ('prior', 0.1621086299419403),
 ('days', 0.14873236417770386),
 ('making', 0.14731886982917786),
 ('direct', 0.1466849446296692),
 ('cases', 0.14031724631786346)]