In [1]:
from vocabulary import Vocabulary
from word2vec import Word2Vec
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
import re
from itertools import islice

In [2]:
# Location of the training corpus. When this is None the tokens come from the
# short built-in sample paragraph further down instead of a file.
path = './data/timemachine.txt'

if path is not None:
    # Read the whole file, trim surrounding whitespace, lower-case it and
    # break it into lines before tokenising.
    with open(path, 'r', encoding='utf-8') as corpus_file:
        lines = corpus_file.read().strip().lower().split('\n')

    # Tokenise each line on single spaces and flatten into one token list.
    words = [token for line in lines for token in line.split(' ')]

else:
    # Adjacent string literals are concatenated by the parser, producing the
    # same single paragraph as explicit '+' concatenation would.
    text = (
        'Machine learning is the study of computer algorithms that '
        'improve automatically through experience. It is seen as a '
        'subset of artificial intelligence. Machine learning algorithms '
        'build a mathematical model based on sample data, known as '
        'training data, in order to make predictions or decisions without '
        'being explicitly programmed to do so. Machine learning algorithms '
        'are used in a wide variety of applications, such as email filtering '
        'and computer vision, where it is difficult or infeasible to develop '
        'conventional algorithms to perform the needed tasks.'
    )
    words = text.lower().split(' ')

# The vocabulary is built from `path` directly (not from `words`).
# NOTE(review): Vocabulary(None) is what the sample-text branch would pass;
# confirm that Vocabulary supports it.
vocabulary = Vocabulary(path)
w2v = Word2Vec(
    vocabulary,
    dim=300,
    window_size=2,
    lr=0.01,
    random_state=10,
    K=2,
    distribute=False,
)

# Report the number of tokens available for training.
print(len(words))

55308


In [None]:
# Training configuration.
n_epochs = 50        # number of full passes over the (truncated) token list
log_every = 10       # print the accumulated loss every `log_every` epochs
max_words = 20_000   # upper bound on the number of tokens used for training

cost = []            # total loss of each epoch, in training order
words = words[:max_words]

# Offsets of the context words relative to the centre word, e.g.
# [-2, -1, 1, 2] for a window size of 2. They are the same for every
# position, so they are built once instead of once per token.
half_window = w2v.window_size
offsets = [k for k in range(-half_window, half_window + 1) if k != 0]

for epoch in range(n_epochs):
    total_loss = 0
    # Start/stop `half_window` tokens in from each end so that every centre
    # word has a complete context window on both sides.
    for i in range(half_window, len(words) - half_window):
        center_word = words[i]
        window = [words[i + k] for k in offsets]

        # One update step: forward pass, gradients, parameter update.
        # `n_s_i` is handed unchanged from forward() to backward()
        # (presumably the negative-sample indices; confirm in word2vec.py).
        loss, n_s_i = w2v.forward(window, center_word)
        grads = w2v.backward(window, center_word, n_s_i)
        w2v.update(grads)
        total_loss += loss

    # `total_loss` is the raw sum over all positions; it is not averaged.
    # The epoch number in the message is zero-based.
    if (epoch + 1) % log_every == 0:
        print(f'Cost epoch {epoch}th: ', np.round(total_loss, decimals=3))
    cost.append(total_loss)

# Loss curve: one point per epoch.
fig, ax = plt.subplots()
ax.plot(cost, '-o')
ax.set_xlabel('Epoch')
ax.set_ylabel('Total loss')
ax.set_title('Training loss per epoch')
plt.show()

Cost epoch 9th:  71244.14
Cost epoch 19th:  59670.442


In [None]:
# Query words: one list for the file-based corpus, another for the built-in
# sample paragraph, selected by whether `path` is set.
targets = (
    ['time', 'travel', 'lamp', 'edge', 'found']
    if path is not None
    else ['machine', 'learning', 'data', 'algorithm', 'computer']
)

top_k = 5  # how many of the highest-scoring words to show per query

for target in targets:
    softmax = w2v.predict(target)

    # Indices of the `top_k` largest scores, highest first.
    indexes = np.argsort(softmax)[::-1][:top_k].astype(int)

    # Print "<score> <word>" for each hit while collecting the words.
    probable_words = []
    for index in indexes:
        word = w2v.vocabulary[index]
        print(softmax[index], word)
        probable_words.append(word)

    print(target)
    print(probable_words)
    # Separator line followed by one blank line.
    print('-' * 100, end='\n\n')

In [None]:
pca = PCA(n_components=2)
u_pca = pca.fit_transform(w2v.U.T)

text = [w2v.vocabulary.index_to_word[key] for key in np.arange(0, len(w2v.vocabulary)-1)]
text = text[:50]


%matplotlib inline

plt.figure(figsize=(25, 10))
for i, (x, y) in enumerate(u_pca):
    if i >= len(text):
        break
    else:
        plt.text(x, y, s=text[i], 
                 fontsize=30)

plt.axis('off')
plt.show()