### Initialize

In [42]:
from __future__ import print_function, division
%matplotlib inline
from matplotlib import pyplot as plt
import json
import random
import debiaswe as dwe
import debiaswe.we as we
from debiaswe.we import WordEmbedding
from debiaswe.debias import debias
import vector_math_jb as vm 

# Standard python helper libraries.
import os, sys, re, json, time
import itertools, collections
#from importlib import reload
from IPython.display import display

# NumPy and SciPy for matrix ops
import numpy as np
import scipy.sparse

# Visualize
import matplotlib.pyplot as plt

# NLTK for NLP utils
import nltk

# PCA 
from sklearn.decomposition import PCA

# Helper libraries
from w266_common import utils, vocabulary#, tf_embed_viz

## Load embeddings

In [2]:
# Load two embeddings from text files with WordEmbedding:
#   E_gn -- subset of word embedding trained on Google News text
#   E_jp -- read from article_embed.txt
#           NOTE(review): the training corpus for this file is not stated in this cell -- confirm.
E_gn = WordEmbedding("./embeddings/w2v_gnews_small.txt")
E_jp = WordEmbedding("./embeddings/article_embed.txt")

*** Reading data from ./embeddings/w2v_gnews_small.txt
(26423, 300)
26423 words of dimension 300 : in, for, that, is, ..., Jay, Leroy, Brad, Jermaine
*** Reading data from ./embeddings/article_embed.txt
(462643, 50)
462643 words of dimension 50 : the, to, of, and, ..., sadaoui, craythorn, calcraft, coventon
462643 words of dimension 50 : the, to, of, and, ..., sadaoui, craythorn, calcraft, coventon


### 1. Word Similarity Evaluation

In [74]:
# adapted from Faruqui and Dyer, Community Evaluation and Exchange of Word Vectors at wordvectors.org
# RG = Rubenstein and Goodenough, 1965; WS = Finkelstein et al., 2002 -- benchmarks used in Bolukbasi et al.
# The script receives two arguments: 'embeddings/' and 'evaluation/data/word-sim/'
# (presumably the directory of embeddings to score and the benchmark data directory -- confirm in the script).

%run 'evaluation/all_wordsim_jb.py' 'embeddings/' 'evaluation/data/word-sim/'

File #                     Embeddings      RG (53/65)    WS (318/353)
     1                  article_embed          0.5378          0.5702 

     2          RANE_300d_english_50k          0.6052          0.6159 

     3                w2v_gnews_small          0.7618          0.6857 

     4       w2v_gnews_small_debiased          0.7601          0.6826 



### 3. MSR Analogies Evaluation

In [24]:
# Parse the analogy benchmark JSON into `analogies`. The whole file is
# materialized in memory by json.load, and the author notes it is very large.
with open('./evaluation/data/analogies.json', "r") as f:
    analogies = json.load(f) # This is HUGE

In [137]:
def compare_analogies(analogy_list, e, print_freq=500):
    """Score an embedding on a list of a:b::c:d analogies.

    For every entry the first three words are handed to ``vm.show_analogy``
    and the first item of its result is compared with the entry's fourth
    word. Entries that cannot be evaluated (the lookup raises, the result is
    empty, or the entry has fewer than four items) are counted as skipped and
    left out of the accuracy denominator.

    Progress lines and a final summary are printed to stdout.

    Args:
        analogy_list: iterable of indexable 4-item entries ``[a, b, c, d]``.
        e: embedding object, passed through unchanged to ``vm.show_analogy``.
        print_freq: print a progress line every ``print_freq`` entries
            (by index, so index 0 always prints). ``0`` or ``None`` disables
            progress output. Defaults to 500.

    Returns:
        list: the entries of ``analogy_list`` that were evaluated, in input
        order (whether or not the prediction was correct).
    """
    num_analogies = 0
    correct = 0
    num_missing = 0
    keep_analogies = []
    t0 = time.time()

    for i, wds in enumerate(analogy_list):
        try:
            # Unpack inside the try so a malformed (short) entry is skipped
            # instead of being half-counted as evaluated.
            a, b, c, target = wds[0], wds[1], wds[2], wds[3]

            # generate "d" in a:b::c:d analogies, given a, b, and c.
            # The trailing 5 is kept from the original call, which notes that
            # this value "needs to be at least 2".
            d = vm.show_analogy(e, a, b, c, 5)[0]
        except Exception:
            # Typically a KeyError when a word is not in the embedding.
            # ``Exception`` (not a bare ``except``) keeps the best-effort
            # skipping while letting KeyboardInterrupt/SystemExit stop a run
            # that can take many minutes.
            num_missing += 1
        else:
            # keep track of how many analogies were computed
            num_analogies += 1
            keep_analogies.append(wds)
            if d == target:
                correct += 1

        if print_freq and i % print_freq == 0:
            print("Completed {:d} analogies in {:s}".format(i, utils.pretty_timedelta(since=t0)))

    # On a subset we may not be able to evaluate _any_ analogies; report 0
    # rather than dividing by zero.
    score = correct / num_analogies if num_analogies else 0

    print("Computed {:d}/{:d} analogies correctly in {:s}, accuracy: {:.2f}".format(correct,num_analogies,utils.pretty_timedelta(since=t0),score))
    if num_missing:
        print("Skipped {:d} analogies that could not be evaluated".format(num_missing))

    return keep_analogies

In [138]:
# Evaluate the Google News embedding on the full analogy list. The return value
# (the list of analogies that could be evaluated) is not assigned, so the
# notebook displays it as this cell's output. In the recorded run the progress
# log reached about 42 minutes by index 13000.
compare_analogies(analogies,E_gn) 

Completed 0 analogies in 0:00:00
Completed 500 analogies in 0:00:00
Completed 1000 analogies in 0:00:00
Completed 1500 analogies in 0:00:00
Completed 2000 analogies in 0:00:00
Completed 2500 analogies in 0:00:00
Completed 3000 analogies in 0:00:00
Completed 3500 analogies in 0:00:00
Completed 4000 analogies in 0:00:00
Completed 4500 analogies in 0:00:00
Completed 5000 analogies in 0:00:00
Completed 5500 analogies in 0:00:00
Completed 6000 analogies in 0:00:00
Completed 6500 analogies in 0:00:00
Completed 7000 analogies in 0:00:00
Completed 7500 analogies in 0:00:00
Completed 8000 analogies in 0:00:00
Completed 8500 analogies in 0:01:16
Completed 9000 analogies in 0:05:19
Completed 9500 analogies in 0:10:06
Completed 10000 analogies in 0:15:25
Completed 10500 analogies in 0:19:37
Completed 11000 analogies in 0:23:42
Completed 11500 analogies in 0:28:49
Completed 12000 analogies in 0:33:19
Completed 12500 analogies in 0:37:50
Completed 13000 analogies in 0:41:54
Completed 13500 analogies

[[u'boy', u'girl', u'brother', u'sister'],
 [u'boy', u'girl', u'brothers', u'sisters'],
 [u'boy', u'girl', u'dad', u'mom'],
 [u'boy', u'girl', u'father', u'mother'],
 [u'boy', u'girl', u'grandfather', u'grandmother'],
 [u'boy', u'girl', u'grandpa', u'grandma'],
 [u'boy', u'girl', u'grandson', u'granddaughter'],
 [u'boy', u'girl', u'groom', u'bride'],
 [u'boy', u'girl', u'he', u'she'],
 [u'boy', u'girl', u'his', u'her'],
 [u'boy', u'girl', u'husband', u'wife'],
 [u'boy', u'girl', u'king', u'queen'],
 [u'boy', u'girl', u'man', u'woman'],
 [u'boy', u'girl', u'nephew', u'niece'],
 [u'boy', u'girl', u'policeman', u'policewoman'],
 [u'boy', u'girl', u'prince', u'princess'],
 [u'boy', u'girl', u'son', u'daughter'],
 [u'boy', u'girl', u'sons', u'daughters'],
 [u'boy', u'girl', u'stepfather', u'stepmother'],
 [u'boy', u'girl', u'stepson', u'stepdaughter'],
 [u'boy', u'girl', u'uncle', u'aunt'],
 [u'brother', u'sister', u'brothers', u'sisters'],
 [u'brother', u'sister', u'dad', u'mom'],
 [u'brot

In [128]:
def intersect(l1, l2):
    """Return the items of ``l1`` that also occur in ``l2``.

    The order of ``l1`` is preserved and repeated items of ``l1`` are kept.
    Membership is decided with the ``in`` operator against ``l2`` as given.
    """
    shared = []
    for candidate in l1:
        if candidate in l2:
            shared.append(candidate)
    return shared

In [136]:
# NOTE(review): `keep` and `test` are not assigned in any cell visible in this
# notebook export -- presumably they come from earlier/deleted cells; confirm
# this cell still runs on a fresh kernel.
intersect(keep,test)

[[u'write', u'writes', u'talk', u'talks'],
 [u'write', u'writes', u'think', u'thinks'],
 [u'write', u'writes', u'vanish', u'vanishes'],
 [u'write', u'writes', u'walk', u'walks'],
 [u'write', u'writes', u'work', u'works']]

In [139]:
# Show at most the first 10 entries of `keep`.
# NOTE(review): `keep` is not assigned in any cell visible in this notebook
# export -- confirm where it is defined.
keep[:10]

[[u'write', u'writes', u'talk', u'talks'],
 [u'write', u'writes', u'think', u'thinks'],
 [u'write', u'writes', u'vanish', u'vanishes'],
 [u'write', u'writes', u'walk', u'walks'],
 [u'write', u'writes', u'work', u'works']]