In [1]:
import os
import sys
import time
import math
import argparse

import numpy as np
from scipy import stats
from gensim.models import KeyedVectors
from matplotlib import pyplot as plt

from gmm_tree import gmm_clustering

In [2]:
# Training corpus: Nov. 2013 dump of Wikipedia, only articles with at least 20 pageviews,
# leaving 460k documents.
train_file = '/home/matthias/data/wikipedia.txt'
# Path to the fastText binary. Check out and build
# https://github.com/mleimeister/fastText/tree/hs_precomputed_tree
fasttext = '/home/matthias/fastText/fasttext'
# Directory to store the trained vectors and result logs. It is concatenated directly
# with file names in the cells below, so it must end with a path separator.
save_path = '/home/matthias/tmp/'
# Questions file for the word analogy task
questions_file = '/home/matthias/data/questions-words.txt'
# Word-pair file for the word similarity task
dataPath = '/home/matthias/data/rw.txt'

In [3]:
# Embedding dimensions to evaluate; each one gets its own training and evaluation run.
dims = [50, 100, 400]

In [4]:
def print_formated_time(elapsed, outfile=None):
    """Print an elapsed duration as HH:MM:SS.ss and optionally log it.

    Args:
        elapsed: Duration in seconds (int or float).
        outfile: Optional path. When given, the printed line is also
            appended to this file.
    """
    hours, rem = divmod(elapsed, 3600)
    minutes, seconds = divmod(rem, 60)
    s = 'Elapsed time: {:0>2}:{:0>2}:{:05.2f}'.format(int(hours), int(minutes), seconds)
    print(s)
    if outfile is not None:
        with open(outfile, 'a') as f:
            # `s` already starts with 'Elapsed time: ', so write it as-is
            # instead of prefixing it a second time.
            f.write(s + '\n')

In [5]:
# Train cbow fastText embeddings using negative sampling (-loss ns -neg 10),
# one run per embedding dimension. The command line, the exit status (on failure)
# and the elapsed time of every run are logged to a per-dimension results file.
for dim in dims:
    out_vec_file = save_path + 'vecs_neg_' + str(dim)
    results_file = save_path + 'results_neg_' + str(dim) + '.txt'

    exec_str = fasttext + ' cbow -input ' + train_file + ' -output ' + out_vec_file + \
              ' -minCount 25 -minn 0 -maxn 0 -t 0.00001 -lr 0.05 -dim ' + str(dim) + ' -ws 10 ' + \
              '-epoch 3 -loss ns -neg 10 -thread 48'

    print('Running fastText in dimension {}'.format(dim))
    start = time.time()
    # os.system returns the exit status of the command; non-zero means the
    # training run did not complete successfully.
    status = os.system(exec_str)
    elapsed = time.time() - start
    with open(results_file, 'w') as f:
        f.write(exec_str + '\n')
        if status != 0:
            f.write('fastText exited with status {}\n'.format(status))
    if status != 0:
        # Make the failure visible instead of silently reporting a timing.
        print('WARNING: fastText exited with status {} in dimension {}'.format(status, dim))
    print_formated_time(elapsed, results_file)

Running fastText in dimension 50
Elapsed time: 00:02:14.27
Running fastText in dimension 100
Elapsed time: 00:02:43.26
Running fastText in dimension 400
Elapsed time: 00:07:13.09


In [6]:
# Evaluation using the word analogy task.
def print_accuracy(acc):
    """Print and return semantic, syntactic and overall analogy accuracy.

    Args:
        acc: Indexable sequence of per-section results, each a mapping with
            'correct' and 'incorrect' lists. Sections 0-4 are counted as
            semantic, sections 5 to len(acc)-2 as syntactic. The last entry
            is skipped (presumably the aggregate section appended by gensim;
            verify against the installed version).

    Returns:
        Tuple (sem_acc, syn_acc, total_acc) of fractions. A group without any
        evaluated question yields 0.0 instead of raising ZeroDivisionError.
    """
    def tally(indices):
        # Count correct answers and all answered questions over the sections.
        correct = sum(len(acc[i]['correct']) for i in indices)
        total = correct + sum(len(acc[i]['incorrect']) for i in indices)
        return correct, total

    def ratio(correct, total):
        # Guard against empty groups (e.g. no question covered by the vocabulary).
        return float(correct) / total if total else 0.0

    sem_correct, sem_total = tally(range(5))
    sem_acc = ratio(sem_correct, sem_total)
    print('Semantic: {:d}/{:d}, Accuracy: {:.2f}%'.format(sem_correct, sem_total, 100*sem_acc))

    syn_correct, syn_total = tally(range(5, len(acc)-1))
    syn_acc = ratio(syn_correct, syn_total)
    print('Syntactic: {:d}/{:d}, Accuracy: {:.2f}%'.format(syn_correct, syn_total, 100*syn_acc))

    total_correct = sem_correct + syn_correct
    total = sem_total + syn_total
    total_acc = ratio(total_correct, total)
    print('Total: {:d}/{:d}, Accuracy: {:.2f}%\n'.format(total_correct, total, 100*total_acc))
    return (sem_acc, syn_acc, total_acc)

def evaluate_accuracy(vecs_file):
    """Score the word vectors in `vecs_file` on the word analogy task.

    Loads the vectors with KeyedVectors.load_word2vec_format, evaluates them
    on the module-level `questions_file`, prints the accuracy summary and
    returns the overall accuracy.

    NOTE(review): KeyedVectors.accuracy looks to be deprecated in later
    gensim releases (evaluate_word_analogies) - confirm the installed version.
    """
    keyed_vectors = KeyedVectors.load_word2vec_format(vecs_file)
    sections = keyed_vectors.accuracy(questions_file)
    # print_accuracy returns (semantic, syntactic, total); only total is needed.
    return print_accuracy(sections)[2]

In [8]:
print('Evaluating word analogy task...')

# Overall analogy accuracy per embedding dimension, in the order of `dims`.
acc_neg = []

for dim in dims:
    print('Dimension: {}'.format(dim))
    print('Negative sampling:')
    vec_path = save_path + 'vecs_neg_' + str(dim) + '.vec'
    log_path = save_path + 'results_neg_' + str(dim) + '.txt'
    acc = evaluate_accuracy(vec_path)
    acc_neg.append(acc)
    # Append the score to the results file of this dimension.
    with open(log_path, 'a') as f:
        f.write('Analogy score: {}\n'.format(acc))

Evaluating word analogy task...
Dimension: 50
Negative sampling:
Semantic: 2819/4976, Accuracy: 56.65%
Syntactic: 4726/8429, Accuracy: 56.07%
Total: 7545/13405, Accuracy: 56.28%

Dimension: 100
Negative sampling:
Semantic: 3689/4976, Accuracy: 74.14%
Syntactic: 5699/8429, Accuracy: 67.61%
Total: 9388/13405, Accuracy: 70.03%

Dimension: 400
Negative sampling:
Semantic: 4224/4976, Accuracy: 84.89%
Syntactic: 6102/8429, Accuracy: 72.39%
Total: 10326/13405, Accuracy: 77.03%



In [9]:
# Evaluation functions for the Stanford Rare Words dataset.
# Copied from https://github.com/facebookresearch/fastText/blob/master/eval.py
def compat_splitting(line):
    """Decode a UTF-8 encoded byte string and split it on whitespace."""
    text = line.decode('utf8')
    return text.split()

def similarity(v1, v2):
    """Return the cosine similarity of the vectors `v1` and `v2`."""
    dot = np.dot(v1, v2)
    # Divide by one norm after the other (same evaluation order as dot / n1 / n2).
    return dot / np.linalg.norm(v1) / np.linalg.norm(v2)

def compute_sim_correlation(vecs_file, data_path=None):
    """Correlate embedding similarities with the scores of a word-pair dataset.

    Args:
        vecs_file: Path to a text file with one "<word> <v1> <v2> ..." entry
            per line. Lines that cannot be decoded or parsed as floats are
            skipped; for duplicate words the first entry wins. No special
            handling is applied to a leading header line, it is parsed like
            any other entry.
        data_path: Path to the dataset with "<word1> <word2> <score> ..."
            lines. Defaults to the module-level `dataPath`.

    Returns:
        The Spearman correlation (first element of scipy.stats.spearmanr)
        between the cosine similarities and the dataset scores, computed over
        the pairs whose two (lower-cased) words both have a vector.
    """
    if data_path is None:
        data_path = dataPath

    vectors = {}
    # `with` closes the file even if an unexpected error interrupts the loop.
    with open(vecs_file, 'rb') as fin:
        for line in fin:
            try:
                tab = compat_splitting(line)
                if not tab:
                    # Blank line: nothing to parse.
                    continue
                vec = np.array(tab[1:], dtype=float)
            except (ValueError, UnicodeDecodeError):
                continue
            word = tab[0]
            if word not in vectors:
                vectors[word] = vec

    mysim = []
    gold = []
    drop = 0
    nwords = 0

    with open(data_path, 'rb') as fin:
        for line in fin:
            tline = compat_splitting(line)
            if not tline:
                # Blank line: no word pair to evaluate.
                continue
            word1 = tline[0].lower()
            word2 = tline[1].lower()
            nwords += 1

            if (word1 in vectors) and (word2 in vectors):
                mysim.append(similarity(vectors[word1], vectors[word2]))
                gold.append(float(tline[2]))
            else:
                # Out-of-vocabulary pair, reported as OOV percentage below.
                drop += 1

    corr = stats.spearmanr(mysim, gold)
    dataset = os.path.basename(data_path)
    # An empty dataset has no OOV pairs; avoid dividing by zero.
    oov_percent = math.ceil(float(drop) / nwords * 100.0) if nwords else 0
    print("{}: {}  (OOV: {}%)"
          .format(dataset, corr[0] * 100, oov_percent))

    return corr[0]

In [10]:
print('Evaluating word similarity task...')

# Similarity correlation per embedding dimension, in the order of `dims`.
corr_neg = []

for dim in dims:
    print('Dimension: {}'.format(dim))
    print('Negative sampling:')
    c = compute_sim_correlation(save_path + 'vecs_neg_' + str(dim) + '.vec')
    corr_neg.append(c)
    with open(save_path + 'results_neg_' + str(dim) + '.txt', 'a') as f:
        # Log the correlation `c` of this run, not the analogy accuracy `acc`
        # that is still in scope from the analogy evaluation cell.
        f.write('Similarity correlation: {}\n'.format(c))

Evaluating word similarity task...
Dimension: 50
Negative sampling:
rw.txt: 43.397895191953054  (OOV: 24%)
Dimension: 100
Negative sampling:
rw.txt: 44.95494757620857  (OOV: 24%)
Dimension: 400
Negative sampling:
rw.txt: 47.1176675488831  (OOV: 24%)


In [11]:
def plot_results(huffman_score, gmm_score, figure_title, ylabel, ylimits,
                 dim_labels=None):
    """Show a grouped bar chart comparing Huffman-tree and GMM-tree scores.

    Args:
        huffman_score: Scores for the Huffman tree, one per x-axis group.
        gmm_score: Scores for the GMM tree, same length and order.
        figure_title: Title of the figure.
        ylabel: Label of the y axis.
        ylimits: Lower and upper limit of the y axis, e.g. [0, 0.8].
        dim_labels: Tick labels of the x axis (the embedding dimensions), one
            per group of bars. Defaults to [50, 100, 400].
    """
    if dim_labels is None:
        dim_labels = [50, 100, 400]

    # One group of two bars per embedding dimension.
    pos = list(range(len(dim_labels)))
    width = 0.25

    fig, ax = plt.subplots(figsize=(10, 5))

    # Shift the two bars of a group half a bar width left/right of the tick.
    ax.bar([p - 0.5*width for p in pos],
           huffman_score,
           width,
           alpha=0.5,
           color='r',
           label='Huffman tree')

    ax.bar([p + 0.5*width for p in pos],
           gmm_score,
           width,
           alpha=0.5,
           color='b',
           label='GMM tree')

    ax.set_ylabel(ylabel)
    ax.set_xlabel('Embedding dimension')
    ax.set_title(figure_title)
    ax.set_xticks(pos)
    ax.set_xticklabels(dim_labels)
    ax.set_xlim(min(pos)-2*width, max(pos)+width*2)
    ax.set_ylim(ylimits)

    ax.legend(loc='upper left')
    ax.grid()
    plt.show()

In [12]:
#plot_results(acc_huffman, acc_gmm, 'Word analogy task', 'Accuracy', [0, 0.8])

In [13]:
#plot_results(corr_huffman, corr_gmm, 'Rare words similarity task', 'Correlation', [0, 0.6])