# How to Train Your Own Word Vector Embeddings Using Gensim

### Loading Libraries

In [2]:
# Numerical Computing
import numpy as np
from numpy.linalg import norm

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Warnings
import warnings

# Path, Time & Collection
from time import time
from collections import Counter
from pathlib import Path

# SciPy
from scipy.spatial.distance import cdist, cosine

# Gensim
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.word2vec import LineSentence

# Scikit-Learn
from sklearn.decomposition import IncrementalPCA

In [3]:
# Seed NumPy's global RNG so the np.random.choice draws further down are repeatable.
np.random.seed(42)

# Seaborn theme applied to every matplotlib figure in the notebook.
sns.set_style('whitegrid')

# NOTE(review): this silences *all* warnings issued from here on, including
# deprecation notices from the libraries used below - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Display floats with thousands separators and two decimals in pandas output.
pd.set_option('float_format', '{:,.2f}'.format)

In [4]:
# Root folder of the financial-news data set.
news_path = Path('data') / 'fin_news'

# Sub-folder holding the corpus text files.
data_path = news_path.joinpath('data')

# Analogy questions file used to score the embeddings.
analogy_path = Path('data') / 'analogies-en.txt'

In [5]:
def format_time(t):
    """Format a duration given in seconds as a zero-padded 'HH:MM:SS' string.

    Parameters
    ----------
    t : int or float
        Elapsed time in seconds.

    Returns
    -------
    str
        The duration rounded to the nearest whole second, e.g. '01:01:01'.
    """
    # Round once, up front. Rounding each field separately while formatting
    # lets 59.6 s render as '00:00:60' instead of carrying into the minutes.
    total_seconds = int(round(t))
    m, s = divmod(total_seconds, 60)
    h, m = divmod(m, 60)
    return f'{h:02d}:{m:02d}:{s:02d}'

### Model Configuration

In [6]:
# Output folder for saved models, word vectors and the accuracy log.
gensim_path = news_path / 'gensim'

# mkdir(exist_ok=True) is already idempotent; a separate exists() check adds
# nothing and only opens a check-then-act race.
gensim_path.mkdir(parents=True, exist_ok=True)

In [7]:
NGRAMS = 3              # selects the corpus file 'articles_{NGRAMS}_grams.txt'
MIN_FREQ = 100          # passed to Word2Vec as min_count; also the cut-off when sampling tokens
WINDOW_SIZE = 5         # passed to Word2Vec as window
EMBEDDING_SIZE = 300    # passed to Word2Vec as size (vector dimensionality)
NEGATIVE_SAMPLES = 20   # passed to Word2Vec as negative
EPOCHS = 1              # passed to Word2Vec as iter for the initial training run

In [8]:
# Name of the corpus file inside data_path, derived from the NGRAMS setting.
FILE_NAME = f'articles_{NGRAMS}_grams.txt'

### Sentence Generator

In [10]:
# Full path of the corpus text file.
sentence_path = data_path.joinpath(FILE_NAME)

# gensim iterator over the corpus file (one sentence per line, whitespace-separated tokens).
sentences = LineSentence(str(sentence_path))

#### Training `word2vec` Model

In [11]:
# Reference point for the timing; the reported duration covers training and both saves.
start = time()

# Skip-gram model (sg=1) with negative sampling, configured from the notebook constants.
model = Word2Vec(
    sentences,
    sg=1,
    size=EMBEDDING_SIZE,
    window=WINDOW_SIZE,
    min_count=MIN_FREQ,
    negative=NEGATIVE_SAMPLES,
    iter=EPOCHS,
    alpha=0.05,
    workers=8,
)

# Persist the full model so that training can be resumed from it later.
model.save(str(gensim_path.joinpath('word2vec.model')))

# Persist the word vectors on their own as well.
# NOTE(review): written with gensim's own save(), so despite the '.bin' suffix this is
# presumably not the word2vec binary format - confirm before loading it elsewhere.
model.wv.save(str(gensim_path.joinpath('word_vectors.bin')))
print('Duration:', format_time(time() - start))

### Evaluating Results

In [12]:
# Maps the section names used in the analogy results to short labels for tables and plots.
cat_dict = {'capital-common-countries': 'Capitals',
            'capital-world': 'Capitals RoW',
            'city-in-state': 'City-State',
            'currency': 'Currency',
            'family': 'Family',  # was misspelled 'Famliy', which showed up on the plot axis
            'gram1-adjective-to-adverb': 'Adj-Adverb',
            'gram2-opposite': 'Opposite',
            'gram3-comparative': 'Comparative',
            'gram4-superlative': 'Superlative',
            'gram5-present-participle': 'Pres. Part.',
            'gram6-nationality-adjective': 'Nationality',
            'gram7-past-tense': 'Past Tense',
            'gram8-plural': 'Plural',
            'gram9-plural-verbs': 'Plural Verbs',
            'total': 'Total'}

In [13]:
def accuracy_by_category(acc, detail=True):
    """Summarise analogy results per section and return the totals.

    Parameters
    ----------
    acc : iterable of dict
        One entry per section, each with the keys 'section', 'correct' and
        'incorrect'; the latter two are sized collections of analogy items.
    detail : bool, default True
        When True, print the per-section table sorted by accuracy (best first).

    Returns
    -------
    list
        [correct, incorrect, average] for the row whose section is 'total'.
    """
    rows = [(section['section'], len(section['correct']), len(section['incorrect']))
            for section in acc]
    results = pd.DataFrame(rows, columns=['category', 'correct', 'incorrect'])

    # Share of correctly answered analogies among all attempted ones.
    attempted = results.correct + results.incorrect
    results['average'] = results.correct / attempted

    if detail:
        print(results.sort_values('average', ascending=False))

    total_row = results[results.category == 'total']
    return total_row[['correct', 'incorrect', 'average']].squeeze().tolist()

In [15]:
# Score the freshly trained vectors on the analogy questions, ignoring case.
detailed_accuracy = model.wv.accuracy(
    analogy_path.as_posix(),
    case_insensitive=True,
)

In [16]:
# Per-section table is printed by the helper; the totals come back as [correct, wrong, average].
summary = accuracy_by_category(detailed_accuracy)

base_report = 'Base Accuracy: Correct {:,.0f} | Wrong {:,.0f} | Avg {:,.2%}\n'
print(base_report.format(*summary))

In [17]:
# Analogy query 'woman + king - man': the 20 closest tokens with their similarity scores.
most_sim = model.wv.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=20,
)

pd.DataFrame(data=most_sim, columns=['token', 'similarity'])

In [18]:
# Count token occurrences across the corpus.
# The file is streamed line by line with an explicit UTF-8 encoding rather than read
# into one string with the platform default encoding; splitting per line yields the
# same whitespace-separated tokens as splitting the whole text.
counter = Counter()
with sentence_path.open(encoding='utf-8') as corpus_file:
    for corpus_line in corpus_file:
        counter.update(corpus_line.split())

In [19]:
# Token frequency table, most frequent token first.
most_common = pd.DataFrame(counter.most_common(), columns=['token', 'count'])

# Keep tokens above the frequency cut-off and attach sampling weights proportional to
# log-frequency (normalised to sum to 1). .assign() returns a new frame, so the column
# is not written into a filtered slice (the original triggered SettingWithCopyWarning,
# hidden only by the global warnings filter).
most_common = (most_common[most_common['count'] > MIN_FREQ]
               .assign(p=lambda df: np.log(df['count']) / np.log(df['count']).sum()))

In [20]:
# Draw ten tokens at random, weighted by p, and collect each one's nearest neighbours.
sampled_tokens = np.random.choice(most_common.token, size=10, p=most_common.p)

similars = pd.DataFrame({
    token: [neighbour for neighbour, _ in model.wv.most_similar(token)]
    for token in sampled_tokens
})
similars.T

### Continuing Training

In [21]:
# Accuracy history; the first entry is the summary computed before this loop.
accuracies = [summary]

best_accuracy = summary[-1]

for i in range(1, 10):
    start = time()
    # One additional pass over the corpus per iteration.
    model.train(sentences, epochs=1, total_examples=model.corpus_count)
    # Evaluate exactly like the other evaluation cells (string path, explicit
    # case_insensitive) so the per-epoch numbers are comparable; the original
    # passed a raw Path object here.
    detailed_accuracy = model.wv.accuracy(analogy_path.as_posix(), case_insensitive=True)
    accuracies.append(accuracy_by_category(detailed_accuracy, detail=False))
    print(f'{i:02} | Duration: {format_time(time() - start)} | Accuracy: {accuracies[-1][-1]:.2%} ')
    # Checkpoint only when this epoch sets a new best overall accuracy.
    if accuracies[-1][-1] > best_accuracy:
        model.save(str(gensim_path / f'word2vec_{i:02}.model'))
        model.wv.save(str(gensim_path / f'word_vectors_{i:02}.bin'))
        best_accuracy = accuracies[-1][-1]
    # Rewrite the log after every epoch so partial results survive an interruption.
    (pd.DataFrame(accuracies,
                  columns=['correct', 'wrong', 'average'])
     .to_csv(gensim_path / 'accuracies.csv', index=False))
# Vectors as they stand after the last epoch, whether or not it was the best one.
model.wv.save(str(gensim_path / 'word_vectors_final.bin'))

### Evaluating Best Model

In [22]:
# Overall analogy accuracy per evaluation round, numbered from 1, as a line chart.
accuracy_history = pd.DataFrame(accuracies, columns=['correct', 'wrong', 'average'])
accuracy_history.index = list(range(1, len(accuracies) + 1))
accuracy_history.average.plot();

In [23]:
# Load the checkpoint with the highest overall analogy accuracy instead of a hard-coded
# epoch number, which only exists on disk if that epoch happened to set a new best.
# accuracies[0] belongs to the initially trained model ('word2vec.model'); accuracies[i]
# for i >= 1 belongs to the i-th extra epoch, saved as 'word2vec_{i:02}.model' only when
# it beat every earlier round. np.argmax returns the first maximum, which is exactly
# the round that was written to disk.
best_round = int(np.argmax([acc[-1] for acc in accuracies]))
best_model_file = 'word2vec.model' if best_round == 0 else f'word2vec_{best_round:02}.model'
best_model = Word2Vec.load((gensim_path / best_model_file).as_posix())

In [24]:
# Score the reloaded best checkpoint on the analogy questions, ignoring case.
detailed_accuracy = best_model.wv.accuracy(
    analogy_path.as_posix(),
    case_insensitive=True,
)

In [25]:
# Per-section table is printed by the helper; the totals come back as [correct, wrong, average].
summary = accuracy_by_category(detailed_accuracy)

# These figures describe the best checkpoint, so label them accordingly (the line
# was copied from the baseline cell and still read 'Base Accuracy').
print('Best Accuracy: Correct {:,.0f} | Wrong {:,.0f} | Avg {:,.2%}\n'.format(*summary))

In [26]:
# One (section, #correct, #incorrect) record per analogy section.
section_rows = [(section['section'], len(section['correct']), len(section['incorrect']))
                for section in detailed_accuracy]

# Relabel the sections for display, add the accuracy ratio, and index by the label.
results = (
    pd.DataFrame(section_rows, columns=['category', 'correct', 'incorrect'])
    .assign(category=lambda df: df.category.map(cat_dict),
            average=lambda df: df.correct / (df.correct + df.incorrect))
    .rename(columns=str.capitalize)
    .set_index('Category')
)

# Keep the overall row separately so that only the individual sections get plotted.
total = results.loc['Total']
results = results.drop('Total')

In [27]:
# Analogy query 'woman + king - man' against the best checkpoint: 20 closest tokens.
most_sim = best_model.wv.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=20,
)

pd.DataFrame(data=most_sim, columns=['token', 'similarity'])

In [28]:
# Two panels: analogy results per category (left) and the closest analogy matches (right).
fig, axes = plt.subplots(ncols=2, figsize=(16, 5))
left_panel, right_panel = axes[0], axes[1]

# Stacked bars with the number of correct and incorrect answers per category.
results[['Correct', 'Incorrect']].plot.bar(stacked=True,
                                           ax=left_panel,
                                           title='Analogy Accuracy')

# Accuracy ratio drawn on a secondary y-axis and labelled as percentages.
ax1 = results[['Average']].plot(ax=left_panel, secondary_y=True, lw=1, c='k', rot=35)
ax1.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y)))

# The ten candidates with the highest similarity, smallest first so the best match is on top.
# NOTE(review): xlim is hard-coded; bars outside 0.30-0.37 are clipped if another run
# produces different similarity scores.
top_matches = (pd.DataFrame(most_sim, columns=['token', 'similarity'])
               .set_index('token')
               .similarity
               .sort_values()
               .tail(10))
top_matches.plot.barh(xlim=(.3, .37),
                      ax=right_panel,
                      title='Closest matches for Woman + King - Man')

fig.tight_layout();
plt.show()

In [29]:
# Count token occurrences across the corpus.
# The file is streamed line by line with an explicit UTF-8 encoding rather than read
# into one string with the platform default encoding; splitting per line yields the
# same whitespace-separated tokens as splitting the whole text.
counter = Counter()
with sentence_path.open(encoding='utf-8') as corpus_file:
    for corpus_line in corpus_file:
        counter.update(corpus_line.split())

In [30]:
# Token frequency table, most frequent token first.
most_common = pd.DataFrame(counter.most_common(), columns=['token', 'count'])

# Keep tokens above the frequency cut-off and attach sampling weights proportional to
# log-frequency (normalised to sum to 1). .assign() returns a new frame, so the column
# is not written into a filtered slice (the original triggered SettingWithCopyWarning,
# hidden only by the global warnings filter).
most_common = (most_common[most_common['count'] > MIN_FREQ]
               .assign(p=lambda df: np.log(df['count']) / np.log(df['count']).sum()))

In [31]:
similars = pd.DataFrame()

for token in np.random.choice(most_common.token, size=10, p=most_common.p):
    similars[token] = [s[0] for s in best_model.wv.most_similar(token)]
similars.T

In [32]:
# Export the first five sampled tokens with their five closest neighbours.
# Create the output folder first: to_csv does not create missing directories, so
# the write failed on a fresh checkout without a 'figures' folder.
figure_path = Path('figures')
figure_path.mkdir(parents=True, exist_ok=True)
similars.T.iloc[:5, :5].to_csv(figure_path / 'most_similar.csv')