# How to Evaluate Embeddings: Using Linear Algebra & Analogies

### Loading Libraries

In [1]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Warnings
import warnings

# Time, Collection & Path
from time import time
from collections import Counter
from pathlib import Path

In [2]:
# Seed NumPy's global random number generator.
np.random.seed(42)

# Apply seaborn's 'white' style to subsequent plots.
sns.set_style('white')

# Display floats with thousands separators and two decimals.
# The fully qualified option name is used on purpose: abbreviated keys such as
# 'float_format' are resolved by pattern matching and raise an OptionError as
# soon as more than one registered option matches the pattern.
pd.set_option('display.float_format', '{:,.2f}'.format)

In [3]:
# Location of the analogy file, relative to the working directory.
analogy_path = Path('data') / 'analogies-en.txt'

In [4]:
def format_time(t):
    """Format a duration given in seconds as a zero-padded ``HH:MM:SS`` string.

    The duration is rounded to the nearest whole second *before* it is split
    into hours, minutes and seconds. Rounding only while formatting would let a
    fractional value such as 59.6 render as '00:00:60' instead of carrying
    over into the minutes field.

    Args:
        t: Elapsed time in seconds (int or float).

    Returns:
        str: The duration as 'HH:MM:SS'. The hours field is padded to at
        least two digits and simply grows wider for longer durations.
    """
    # Round first so that the carry propagates through minutes and hours.
    total_seconds = int(round(t))
    m, s = divmod(total_seconds, 60)
    h, m = divmod(m, 60)
    return f'{h:02d}:{m:02d}:{s:02d}'

### Evaluation: `Analogies`

In [6]:
# Read the analogy file as a single text column named 'category'.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
# pandas 2.0, so the one-column frame is squeezed into a Series explicitly.
df = pd.read_csv(analogy_path, header=None, names=['category']).squeeze('columns')

# Lines that start with ':' are category headers; all other lines are analogies.
is_category = df.str.startswith(':')
categories = df[is_category]

# Split every analogy line on whitespace into separate columns, labelled a-d.
analogies = df[~is_category].str.split(expand=True)
analogies.columns = list('abcd')

In [7]:
# Align the category headers with the analogy rows on their shared index,
# carry each header forward over the rows that follow it, and keep only the
# rows that actually hold an analogy (column 'a' is populated).
df = (
    pd.concat([categories, analogies], axis=1)
    .assign(category=lambda frame: frame['category'].ffill())
    .loc[lambda frame: frame['a'].notna()]
)

df.head()

In [8]:
# Group once and reuse the grouping for both summaries.
by_category = df.groupby('category')

# Number of analogies per category, largest first, as a one-column frame 'n'.
analogy_cnt = by_category.size().sort_values(ascending=False).to_frame('n')

# First non-null value of every column within each category, used as a sample.
analogy_example = by_category.first()

In [9]:
# Per-category counts next to one sample analogy, in the order of the counts.
analogy_cnt.join(analogy_example, how='left')

In [10]:
# Horizontal bar chart of the number of analogies per category, sorted ascending.
fig, ax = plt.subplots(figsize=(14, 6))
analogy_cnt['n'].sort_values().plot.barh(title='# Analogies by Category', ax=ax)

sns.despine()
plt.tight_layout()
plt.show()