# How to Train Your Own Word Vector Embeddings with Keras

### Loading Libraries

In [11]:
# !conda install -n MLAT tensorflow-gpu -y

In [12]:
# !conda install -n MLAT tensorflow -y

In [1]:
# Numerical Computing
import numpy as np
from numpy.random import choice

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Warnings
import warnings

# Time, Collection & Path
from time import time
from pathlib import Path
from pprint import pprint
import os, tarfile, sys, json
from collections import Counter

# SciPy
from scipy.spatial.distance import cdist

# SpaCy
import spacy
from spacy.lang.en import English

# Gensim
# from gensim.models.word2vec import LineSentence
# from gensim.models.phrases import Phrases, Phraser

# TensorFlow
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import Callback, TensorBoard
from tensorflow.keras.layers import Input, Dense, Reshape, Dot, Embedding
from tensorflow.keras.preprocessing.sequence import skipgrams, make_sampling_table

#


In [2]:
# gpu_devices = tf.config.experimental.list_physical_devices('GPU')

# if gpu_devices:
#     print('Using GPU')
#     tf.config.experimental.set_memory_growth(gpu_devices[0], True)
# else:
#     print('Using CPU')

In [3]:
# Seed NumPy's global RNG so the np.random.choice draws in this notebook repeat across runs.
np.random.seed(42)

# Use seaborn's 'white' plot style for all figures.
sns.set_style('white')

In [4]:
# Relative directory holding the input corpus and everything this notebook writes
# (TensorBoard files, data.h5, the saved model).
results_path = Path('results', 'financial_news')

In [5]:
# Text file with the analogy test set read by get_analogies().
analogy_path = Path('data', 'analogies-en.txt')

In [6]:
def format_time(t):
    """Format a duration in seconds as a zero-padded ``HH:MM:SS`` string.

    Args:
        t: Elapsed time in seconds (int or float).

    Returns:
        str: ``'HH:MM:SS'``; the hours field widens beyond two digits if needed.
    """
    # Round to whole seconds *before* splitting into fields. Rounding each
    # field separately while formatting lets a remainder such as 59.6 s
    # render as ':60' instead of carrying over into the minutes.
    total_seconds = int(round(t))
    m, s = divmod(total_seconds, 60)
    h, m = divmod(m, 60)
    return f'{h:02d}:{m:02d}:{s:02d}'

### `word2vec` - skipgram Architecture using Keras

#### Settings

In [7]:
# Corpus settings.
LANGUAGE = 'en'  # NOTE(review): not referenced by any visible cell — confirm it is still needed

SAMPLE_SIZE=.5    # presumably the share of sentences sampled for training — confirm where it is consumed

NGRAMS = 3 # n-gram order; selects the `articles_{NGRAMS}_grams.txt` input file

MIN_FREQ = 10  # presumably the minimum token count for vocabulary membership — confirm

In [8]:
# Model / training hyperparameters.
SAMPLING_FACTOR = 1e-4  # base factor for make_sampling_table (divided by 10 where the table is built)

WINDOW_SIZE = 3  # `window_size` passed to skipgrams()

EMBEDDING_SIZE = 300  # `output_dim` of the embedding layer

EPOCHS = 1  # `epochs` passed to model.fit()

BATCH_SIZE = 2500  # `batch_size` passed to model.fit()

In [9]:
VALID_SET = 10      # number of validation words used for the nearest-neighbor printout
VALID_WINDOW = 150  # validation ids are drawn from range(VALID_WINDOW)
NN = 10             # number of nearest neighbors printed per validation word

# Draw VALID_SET distinct ids (no replacement) to monitor during training.
valid_examples = np.random.choice(VALID_WINDOW, size=VALID_SET, replace=False)

In [10]:
# Input corpus file; the name depends on the configured n-gram order.
FILE_NAME = f'articles_{NGRAMS}_grams.txt'

file_path = results_path / FILE_NAME

In [11]:
# Directory for TensorBoard logs and the embedding metadata file.
tb_path = results_path / 'tensorboard'

# mkdir(exist_ok=True) is already a no-op for an existing directory, so a
# separate exists() check beforehand is redundant.
tb_path.mkdir(parents=True, exist_ok=True)

### Building Data Set

In [13]:
# Read the whole corpus and split it into one entry per line; each line is
# treated as a sentence below.
sentences = file_path.read_text().split('\n')

# Line count before filtering, used to report how many sentences are dropped.
n = len(sentences)

In [14]:
# Maximum number of whitespace-separated tokens a sentence may have to be kept.
max_length = 50

In [16]:
# Keep only the sentences with at most `max_length` whitespace-separated tokens.
sentences = [
    sentence
    for sentence in sentences
    if len(sentence.split()) <= max_length
]

In [17]:
# Report how many sentences the length filter dropped (n is the pre-filter count).
print(f'Removed {n-len(sentences):,.0f} sentences containing more than {max_length} tokens')

In [18]:
# Draw a random SAMPLE_SIZE share of the sentences (without replacement) and
# flatten them into one list of tokens.
words = ' '.join(np.random.choice(sentences,
                                  size=int(SAMPLE_SIZE * len(sentences)),
                                  replace=False)).split()

# Build the vocabulary the following cells rely on (`token_to_id`,
# `id_to_token`, `data`); without this step they raise NameError on a fresh
# kernel. Tokens seen at least MIN_FREQ times get ids 1..V in descending
# frequency order (the rank ordering make_sampling_table assumes); id 0 is
# reserved for all remaining tokens ('UNK').
token_counts = [(token, count) for token, count in Counter(words).most_common()
                if count >= MIN_FREQ]
id_to_token = {0: 'UNK'}
id_to_token.update({rank: token
                    for rank, (token, _) in enumerate(token_counts, start=1)})
token_to_id = {token: idx for idx, token in id_to_token.items()}

# Encode the corpus as a sequence of ids; out-of-vocabulary tokens map to 0.
data = [token_to_id.get(word, 0) for word in words]

In [19]:
# Number of distinct ids; used as the sampling-table size and the embedding `input_dim`.
vocab_size = len(token_to_id)

In [20]:
# Display the vocabulary size.
vocab_size

In [21]:
# Count how often each id occurs in the encoded corpus; reset_index() turns the
# ids into a regular column next to their counts.
s = pd.Series(data).value_counts().reset_index()

s.columns = ['id', 'count']
# Attach the token string for every id.
s['token'] = s.id.map(id_to_token)

In [22]:
# Ten most frequent ids with their tokens.
s.sort_values('count', ascending=False).head(10)

In [23]:
# Total number of tokens in the encoded corpus.
s['count'].sum()

In [24]:
# Write one token per line, ordered by id, as label metadata for the
# TensorBoard embedding projector. header=False matters: a single-column
# metadata file must not contain a header row, otherwise every label is
# shifted by one relative to its embedding.
s.sort_values('id').token.dropna().to_csv(tb_path / 'meta.tsv', index=False, header=False)

#### Analogies to ID

In [25]:
def get_analogies(path=None):
    """Load the analogy test set as a four-column DataFrame.

    Lines starting with ':' (category headers) are dropped; every remaining
    line is split on whitespace into the columns ``a``, ``b``, ``c``, ``d``.

    Args:
        path: Optional file to read. Defaults to the notebook-level
            ``analogy_path``.

    Returns:
        pandas.DataFrame: One row per analogy with columns ``a``-``d``; the
        index keeps each row's position in the file.
    """
    source = analogy_path if path is None else path
    # `read_csv(..., squeeze=True)` was removed in pandas 2.0; squeezing the
    # single-column frame afterwards works on old and new versions alike.
    lines = pd.read_csv(source, header=None).squeeze('columns')
    analogies = lines[~lines.str.startswith(':')].str.split(expand=True)
    analogies.columns = list('abcd')
    return analogies

In [26]:
# Load the analogy quadruples and preview the first rows.
analogies = get_analogies()

analogies.head()

In [27]:
# Translate every analogy word into its vocabulary id; words missing from
# `token_to_id` become NaN.
analogies_id = analogies.apply(lambda column: column.map(token_to_id))

# Share of analogies whose four words are all in the vocabulary.
analogies_id.notnull().all(axis=1).sum() / len(analogies_id)

### Generate Sampling Probabilities

In [28]:
# Scale the token counts by the sampling table for several sampling factors
# and compare the results on a log-scaled y-axis.
df = s['count'].to_frame('freq')

factors = [1, 1e-2, 1e-4, 1e-6, 1e-8]

for factor in factors:
    df[factor] = df.freq.mul(make_sampling_table(vocab_size, sampling_factor=factor))

df[factors].plot(logy=True, xlim=(0, 60000));

In [29]:
# Sampling table handed to skipgrams(); note the factor used is SAMPLING_FACTOR / 10.
sampling_table = make_sampling_table(vocab_size, sampling_factor=SAMPLING_FACTOR/10)

In [30]:
# Plot the sampling table values by position (i.e. by id).
pd.Series(sampling_table).plot(title='Skip-Gram Sampling Probabilities')
plt.tight_layout();

#### Generating Target-Context Word Pairs

In [31]:
# Generate (target, context) id pairs and their labels from the encoded
# corpus, using the sampling table built above and shuffling the result.
# NOTE(review): per the Keras `skipgrams` docs, negative_samples=1.0 should add
# as many negative pairs as positive ones — confirm against the label counts.
pairs, labels = skipgrams(sequence=data,
                          vocabulary_size=vocab_size,
                          window_size=WINDOW_SIZE,
                          sampling_table=sampling_table,
                          negative_samples=1.0,
                          shuffle=True)

print('{:,d} pairs created'.format(len(pairs)))

In [32]:
# Split the pairs into two int32 id arrays (first and second element of each
# pair) and store the labels compactly as int8.
target_word, context_word = np.array(pairs, dtype=np.int32).T
labels = np.array(labels, dtype=np.int8)

# The list of pairs is no longer needed once the arrays exist.
del pairs

In [33]:
# First five target ids.
target_word[:5]

In [34]:
# Preview the first five training examples side by side.
# Note: this rebinds `df`, which held the sampling-factor table earlier.
df = pd.DataFrame({'target': target_word[:5], 
                   'context': context_word[:5], 
                   'label': labels[:5]})

df

In [35]:
# Number of examples per label value.
pd.Series(labels).value_counts()

In [36]:
# Persist the id -> token mapping and the training pairs to data.h5 so they can
# be reloaded without regenerating them.
with pd.HDFStore(results_path / 'data.h5') as store:
    store.put('id_to_token', pd.Series(id_to_token))
    store.put('pairs', pd.DataFrame({'target' : target_word,
                                     'context': context_word, 
                                     'labels': labels}))

In [37]:
# Reload the stored mapping and pairs. After this cell `id_to_token` is a
# pandas Series (as stored) and the three training inputs are DataFrame columns.
with pd.HDFStore(results_path / 'data.h5') as store:
    id_to_token = store['id_to_token']
    pairs = store['pairs']

target_word, context_word, labels = pairs.target, pairs.context, pairs.labels

### Defining Keras Model Components

#### Scalar Input Variables

In [38]:
# Each example feeds one target id and one context id (inputs of shape (1,)).
input_target = Input((1,), name='target_input')

input_context = Input((1,), name='context_input')

#### Shared Embedding Layer

In [39]:
# Single embedding layer (vocab_size x EMBEDDING_SIZE) that is applied to both
# the target and the context input below.
embedding = Embedding(input_dim=vocab_size,
                      output_dim=EMBEDDING_SIZE,
                      input_length=1,
                      name='embedding_layer')

In [40]:
# Look up both ids in the shared embedding and reshape each result to
# (EMBEDDING_SIZE, 1) per example.
target = embedding(input_target)
target = Reshape((EMBEDDING_SIZE, 1), name='target_embedding')(target)

context = embedding(input_context)
context = Reshape((EMBEDDING_SIZE, 1), name='context_embedding')(context)

#### Creating Similarity Measure

In [41]:
# Dot product of the two embeddings along the embedding axis ...
dot_product = Dot(axes=1)([target, context])

# ... reshaped to a single value per example.
dot_product = Reshape((1,), name='similarity')(dot_product)

#### Sigmoid Output Layer

In [42]:
# One sigmoid unit turns the similarity value into the predicted label probability.
output = Dense(units=1, activation='sigmoid', name='output')(dot_product)

#### Compiling Training Model

In [43]:
# Training model: (target id, context id) -> sigmoid output, trained with
# binary cross-entropy and RMSprop.
model = Model(inputs=[input_target, input_context], outputs=output)

model.compile(loss='binary_crossentropy', optimizer='rmsprop')

#### Displaying Architecture

In [44]:
# Print the layers and parameter counts of the training model.
model.summary()

#### Validation Model

In [45]:
# Normalized dot product of the same two embedding tensors; used only for
# evaluation (nearest-neighbor lookups), not for training.
similarity = Dot(normalize=True, 
                 axes=1, 
                 name='cosine_similarity')([target, context])

In [46]:
# Evaluation model on the same inputs and embedding layer; outputs the normalized similarity.
validation_model = Model(inputs=[input_target, input_context], outputs=similarity)

In [47]:
# Print the layers and parameter counts of the evaluation model.
validation_model.summary()

### Creating Keras Callbacks

#### Nearest Neighbors & Analogies

In [48]:
# Analogies whose four words are all in the vocabulary, as integer ids.
test_set = analogies_id.dropna().astype(int)

# One id array per analogy position (columns a, b, c and the expected answer d).
a, b, c, actual = (test_set[column].values for column in test_set.columns)

# Column vector so it broadcasts against a (n_analogies, vocab) ranking matrix.
actual = actual[:, np.newaxis]

n_analogies = len(test_set)

In [50]:
class EvalCallback(Callback):
    """Keras callback that prints embedding-quality diagnostics during training.

    Two checks are reported, both reading notebook-level state (`embedding`,
    `validation_model`, `valid_examples`, `id_to_token`, `a`/`b`/`c`/`actual`):

    * nearest neighbors of the validation words - at the start and at the end
      of training;
    * top-1/5/10 analogy accuracy - at the start of training and after every
      epoch.
    """

    # `logs=None` instead of `logs={}`: a mutable default is shared between
    # calls, and the hooks never read `logs` anyway.
    def on_train_begin(self, logs=None):
        self.eval_nn()
        self.test_analogies()

    def on_train_end(self, logs=None):
        self.eval_nn()

    def on_epoch_end(self, batch, logs=None):
        # The first positional argument of this hook is the epoch index in the
        # Keras Callback API; the parameter name is kept for compatibility.
        self.test_analogies()

    @staticmethod
    def test_analogies():
        """Print the share of analogies whose answer ranks in the top 1, 5 and 10."""
        print('\nAnalogy Accuracy:\n\t', end='')
        if n_analogies == 0:
            # Nothing to score; avoids dividing by zero below.
            print('no analogies with all four words in the vocabulary')
            return
        embeddings = embedding.get_weights()[0]
        # For "a is to b as c is to ?", the expected answer lies near b - a + c.
        target = embeddings[c] + embeddings[b] - embeddings[a]
        # Per analogy: all vocabulary ids ordered by ascending cosine distance.
        neighbors = np.argsort(cdist(target, embeddings, metric='cosine'))
        # Column position of the expected id within each ranking (0 = closest).
        match_id = np.argwhere(neighbors == actual)[:, 1]
        print('\n\t'.join(['Top {}: {:.2%}'.format(i, (match_id < i).sum() / n_analogies) for i in [1, 5, 10]]))

    def eval_nn(self):
        """Print up to NN most similar tokens for each validation word."""
        print('\n{} Nearest Neighbors:'.format(NN))
        for valid_id in valid_examples:
            valid_word = id_to_token[valid_id]
            similarity = self._get_similarity(valid_id).reshape(-1)
            # Rank by descending similarity and skip position 0, which is
            # expected to be the query word itself.
            nearest = (-similarity).argsort()[1:NN + 1]
            # Iterate over what is actually available instead of range(NN), so
            # a vocabulary with fewer than NN + 1 entries does not raise.
            neighbors = [id_to_token[idx] for idx in nearest]
            print('{}:\t{}'.format(valid_word, ', '.join(neighbors)))

    @staticmethod
    def _get_similarity(valid_word_idx):
        """Return the validation model's similarity of one id to every vocabulary id."""
        target = np.full(shape=vocab_size, fill_value=valid_word_idx)
        context = np.arange(vocab_size)
        return validation_model.predict([target, context])


evaluation = EvalCallback()

#### Tensorboard Callback

In [51]:
# TensorBoard callback writing the graph and, every epoch, the embedding
# weights; 'embedding_layer' is labelled with the tokens in meta.tsv.
# Note: the training cell currently passes only `evaluation`, so this callback
# is created but not used.
tensorboard = TensorBoard(log_dir=str(tb_path),
                          write_graph=True,
                          embeddings_freq=1,
                          embeddings_metadata={'embedding_layer': 
                                               str(tb_path / 'meta.tsv')})

### Training Model

In [52]:
# Train on the (target, context) -> label examples. Only the evaluation
# callback is active; the TensorBoard callback is left disabled (see the
# commented-out line).
# NOTE: despite its name, `loss` holds whatever model.fit() returns, not a scalar loss.
loss = model.fit(x=[target_word, context_word],
                 y=labels,
                 shuffle=True,
                 batch_size=BATCH_SIZE,
                 epochs=EPOCHS,
                 #callbacks=[evaluation, tensorboard] # uncomment if tensorboard bug is fixed
                 callbacks=[evaluation]
                )

In [53]:
# Save the trained model to the results directory in HDF5 format.
model.save(str(results_path / 'skipgram_model.h5'))

### Visualizing Embeddings using Tensorboard

In [54]:
%load_ext tensorboard

In [55]:
%tensorboard --logdir results/financial_news/tensorboard/train