In [1]:
def read_analogy_file(file_path, include_transforms=True):
    analogies = []
    with open(file_path) as f:
        for line in f:
            analogy = line.split()
            analogies.append([a.split(sep=':') for a in analogy])
            if include_transforms:
                analogies.append([list(reversed(a.split(sep=':'))) for a in analogy])
                analogies.append([a.split(sep=':') for a in reversed(analogy)])
                analogies.append([list(reversed(a.split(sep=':'))) for a in reversed(analogy)])
    return analogies

In [2]:
analogies = read_analogy_file('style_analogies.txt')
analogies

[[['man', 'gentleman'], ['woman', 'lady']],
 [['gentleman', 'man'], ['lady', 'woman']],
 [['woman', 'lady'], ['man', 'gentleman']],
 [['lady', 'woman'], ['gentleman', 'man']],
 [['mom', 'mother'], ['dad', 'father']],
 [['mother', 'mom'], ['father', 'dad']],
 [['dad', 'father'], ['mom', 'mother']],
 [['father', 'dad'], ['mother', 'mom']],
 [['funny', 'comical'], ['happy', 'joyful']],
 [['comical', 'funny'], ['joyful', 'happy']],
 [['happy', 'joyful'], ['funny', 'comical']],
 [['joyful', 'happy'], ['comical', 'funny']],
 [['begin', 'commence'], ['before', 'prior']],
 [['commence', 'begin'], ['prior', 'before']],
 [['before', 'prior'], ['begin', 'commence']],
 [['prior', 'before'], ['commence', 'begin']],
 [['bug', 'insect'], ['snake', 'serpent']],
 [['insect', 'bug'], ['serpent', 'snake']],
 [['snake', 'serpent'], ['bug', 'insect']],
 [['serpent', 'snake'], ['insect', 'bug']],
 [['stop', 'finish'], ['start', 'commence']],
 [['finish', 'stop'], ['commence', 'start']],
 [['start', 'commenc

In [3]:
import numpy as np

def test_analogy(model, analogy, topn=10):
    results = model.most_similar(positive=[analogy[0][1], analogy[1][0]], negative=[analogy[0][0]], topn=topn)
    words = [w for w,_ in results]
    return (analogy[1][1] in words)

def get_accuracy(model, analogies, topn=10):
    results = np.zeros(len(analogies))
    not_found = 0
    for idx, analogy in enumerate(analogies):
        words = [w for l in analogy for w in l if w in model]
        if len(words) == 4:
            results[idx] = test_analogy(model, analogy, topn)
        else:
            not_found += 1
    return np.sum(results) / len(analogies), not_found

def print_stats_report(model, analogies):
    num_analogies = len(analogies)
    top10, _ = get_accuracy(model, analogies, 10)
    n_correct10 = int(top10 * num_analogies)
    top5, _ = get_accuracy(model, analogies, 5)
    n_correct5 = int(top5 * num_analogies)
    top1, not_found = get_accuracy(model, analogies, 5)
    n_correct = int(top1 * num_analogies)
    
    num_analogies -= not_found
    
    print(f"Number of Analogies: {num_analogies}\n ({not_found} not in model)")
    print(f"Correct answer is most similar: {top1:.3f}% ({n_correct} of {num_analogies})")
    print(f"Correct answer in top 5 most similar: {top5:.3f}% ({n_correct5} of {num_analogies})")
    print(f"Correct answer in top 10 most similar: {top10:.3f}% ({n_correct10} of {num_analogies})")



In [4]:
import gensim
import gensim.downloader as gdl

# download pretrained 100D glove vectors
pretrained_glove = gdl.load('glove-wiki-gigaword-100')

print_stats_report(pretrained_glove, analogies)

Number of Analogies: 48
 (4 not in model)
Correct answer is most similar: 0.308% (16 of 48)
Correct answer in top 5 most similar: 0.308% (16 of 48)
Correct answer in top 10 most similar: 0.365% (19 of 48)


In [5]:
print_stats_report(pretrained_glove, analogies)

Number of Analogies: 48
 (4 not in model)
Correct answer is most similar: 0.308% (16 of 48)
Correct answer in top 5 most similar: 0.308% (16 of 48)
Correct answer in top 10 most similar: 0.365% (19 of 48)


In [6]:
analogies = read_analogy_file('style_analogies.txt')
print_stats_report(pretrained_glove, analogies)

Number of Analogies: 48
 (4 not in model)
Correct answer is most similar: 0.308% (16 of 48)
Correct answer in top 5 most similar: 0.308% (16 of 48)
Correct answer in top 10 most similar: 0.365% (19 of 48)


In [7]:
analogies = read_analogy_file('germanic_latinate.txt')
print_stats_report(pretrained_glove, analogies)

Number of Analogies: 36
 (0 not in model)
Correct answer is most similar: 0.111% (4 of 36)
Correct answer in top 5 most similar: 0.111% (4 of 36)
Correct answer in top 10 most similar: 0.167% (6 of 36)
