### Initialize

In [7]:
# Install a few python packages using pip
#from w266_common import utils
#utils.require_package("wget")      # for fetching dataset

# Standard python helper libraries.
import os, sys, re, json, time
import itertools, collections
from importlib import reload
from IPython.display import display

# NumPy and SciPy for matrix ops
import numpy as np
import scipy.sparse

# NLTK for NLP utils
import nltk

# Helper libraries
from w266_common import utils, vocabulary, tf_embed_viz

  from ._conv import register_converters as _register_converters


### Import GloVe dataset
Pre-trained GloVe word vectors, trained on 6B tokens from Wikipedia 2014 + Gigaword 5.

(will take a while to run)

In [8]:
# Re-import glove_helper so that edits to the module are picked up without
# restarting the kernel.
import glove_helper; reload(glove_helper)

# Load the 100-dimensional GloVe vectors. `hands` is the embedding object
# passed to show_nns / show_analogy in the cells below.
hands = glove_helper.Hands(ndim=100)  # 50, 100, 200, 300 dim are available

Downloading GloVe vectors to data/glove
Loading vectors from data/glove/glove.6B.zip
Parsing file: data/glove/glove.6B.zip:glove.6B.100d.txt
Found 400,000 words.
Parsing vectors... Done! (W.shape = (400003, 100))


In [14]:
import vector_math; reload(vector_math)

def show_nns(hands, word, k=10):
    """Print the neighbors of `word` found by vector_math.find_nn_cos.

    Args:
        hands: embedding object exposing get_vector(word), the matrix W and
            vocab.id_to_word.
        word: query word; it is lowercased before the lookup.
        k: number of neighbors requested from find_nn_cos.

    Prints a header line, then one "<similarity> : '<word>'" line per
    neighbor in the order find_nn_cos returns them, then an empty line.
    """
    query = word.lower()
    print("Nearest neighbors for '{:s}'".format(query))
    neighbor_ids, similarities = vector_math.find_nn_cos(
        hands.get_vector(query), hands.W, k)
    for word_id, score in zip(neighbor_ids, similarities):
        print("{:.03f} : '{:s}'".format(score, hands.vocab.id_to_word[word_id]))
    print("")
    
def show_analogy(hands, a, b, c, k=5):
    """Compute and print the vector analogy "a is to b as c is to ___".

    Args:
        hands: embedding object exposing get_vector(word), the matrix W and
            vocab.id_to_word.
        a, b, c: the three analogy words; each is lowercased before lookup.
        k: number of candidates requested from vector_math.analogy.

    Prints the analogy header, then one "<similarity> : '<word>'" line per
    candidate in the order vector_math.analogy returns them, then an empty
    line. Returns None.
    """
    a, b, c = a.lower(), b.lower(), c.lower()
    va = hands.get_vector(a)
    vb = hands.get_vector(b)
    vc = hands.get_vector(c)
    # Pass the three words explicitly rather than via **locals(): the header
    # then depends only on the names it actually uses, and renaming or adding
    # a local can no longer change (or break) the formatted output.
    print("'{a:s}' is to '{b:s}' as '{c:s}' is to ___".format(a=a, b=b, c=c))
    for i, sim in zip(*vector_math.analogy(va, vb, vc, hands.W, k)):
        target_word = hands.vocab.id_to_word[i]
        print("{:.03f} : '{:s}'".format(sim, target_word))
    print("")


In [44]:
# Nearest neighbors for the singular form of each group term ...
for term in ("white", "black", "asian"):
    show_nns(hands, term, 5)

# ... and for the corresponding plural forms.
for term in ("whites", "blacks", "asians"):
    show_nns(hands, term, 5)

# Candidate terms per group for later queries:
#   "anglos", "whites", "caucasians", "caucasian", "anglo"
#   "latino", "latinos", "hispanics", "hispanic", "mexicans", "chicanos"
#   "blacks", "african-americans", "africans", "african-american", "colored"
#   "asians", "asian"
#   "amerindians", "native-american"

# TODO: racially-charged stereotypes we could cover:
#   - areas of academia

Nearest neighbors for 'white'
0.768 : 'red'
0.784 : 'gray'
0.818 : 'brown'
0.865 : 'black'
1.000 : 'white'

Nearest neighbors for 'black'
0.768 : 'red'
0.770 : 'gray'
0.807 : 'blue'
0.865 : 'white'
1.000 : 'black'

Nearest neighbors for 'asian'
0.678 : 'european'
0.702 : 'world'
0.708 : 'african'
0.803 : 'asia'
1.000 : 'asian'

Nearest neighbors for 'whites'
0.730 : 'hispanics'
0.733 : 'latinos'
0.750 : 'asians'
0.847 : 'blacks'
1.000 : 'whites'

Nearest neighbors for 'blacks'
0.751 : 'asians'
0.755 : 'hispanics'
0.755 : 'latinos'
0.847 : 'whites'
1.000 : 'blacks'

Nearest neighbors for 'asians'
0.751 : 'blacks'
0.784 : 'africans'
0.809 : 'hispanics'
0.828 : 'latinos'
1.000 : 'asians'



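Interesting note: the singular forms of some racial-group terms (e.g. "white", "black", "asian") do not seem to encode the concept of race in these vectors. Their nearest neighbors are dominated by color terms ("red", "gray", "brown") or geographic terms ("asia", "european"). The plural forms ("whites", "blacks", "asians"), however, consistently have other racial-group terms as their nearest neighbors.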

In [53]:
# "<a> is to <b> as 'science' is to ___" for each (a, b) pair below.
# The first pair has a == b and serves as the baseline for comparison.
science_pairs = [
    ("hispanics", "hispanics"),
    ("asians", "hispanics"),
    ("blacks", "hispanics"),
    ("native-american", "hispanic"),
    ("whites", "hispanics"),
]
for a_word, b_word in science_pairs:
    show_analogy(hands, a_word, b_word, "science")

'hispanics' is to 'hispanics' as 'science' is to ___
0.761 : 'mathematics'
0.766 : 'institute'
0.791 : 'physics'
0.807 : 'sciences'
1.000 : 'science'

'asians' is to 'hispanics' as 'science' is to ___
0.710 : 'institute'
0.713 : 'research'
0.733 : 'education'
0.753 : 'sciences'
0.876 : 'science'

'blacks' is to 'hispanics' as 'science' is to ___
0.675 : 'studies'
0.676 : 'mathematics'
0.684 : 'humanities'
0.728 : 'sciences'
0.831 : 'science'

'native-american' is to 'hispanic' as 'science' is to ___
0.700 : 'institute'
0.701 : 'education'
0.714 : 'studies'
0.734 : 'hispanic'
0.781 : 'science'

'whites' is to 'hispanics' as 'science' is to ___
0.683 : 'studies'
0.691 : 'biomedical'
0.693 : 'research'
0.706 : 'sciences'
0.834 : 'science'



In [46]:
# Same comparison against "hispanics", using the plural "amerindians" for
# the fourth query. The first entry (a == b) is the baseline.
for a_word in ("hispanics", "asians", "blacks", "amerindians", "whites"):
    show_analogy(hands, a_word, "hispanics", "science")

'hispanics' is to 'hispanics' as 'science' is to ___
0.761 : 'mathematics'
0.766 : 'institute'
0.791 : 'physics'
0.807 : 'sciences'
1.000 : 'science'

'asians' is to 'hispanics' as 'science' is to ___
0.710 : 'institute'
0.713 : 'research'
0.733 : 'education'
0.753 : 'sciences'
0.876 : 'science'

'blacks' is to 'hispanics' as 'science' is to ___
0.675 : 'studies'
0.676 : 'mathematics'
0.684 : 'humanities'
0.728 : 'sciences'
0.831 : 'science'

'amerindians' is to 'hispanics' as 'science' is to ___
0.701 : 'research'
0.715 : 'study'
0.721 : 'studies'
0.748 : 'education'
0.820 : 'science'

'whites' is to 'hispanics' as 'science' is to ___
0.683 : 'studies'
0.691 : 'biomedical'
0.693 : 'research'
0.706 : 'sciences'
0.834 : 'science'



In [47]:
# Singular group terms against "hispanic", completing the analogy for "math".
# The first entry (a == b) is the baseline.
for a_word in ("hispanic", "asian", "black", "native-american", "white"):
    show_analogy(hands, a_word, "hispanic", "math")

'hispanic' is to 'hispanic' as 'math' is to ___
0.646 : 'classroom'
0.648 : 'grade'
0.668 : 'maths'
0.701 : 'mathematics'
1.000 : 'math'

'asian' is to 'hispanic' as 'math' is to ___
0.518 : 'schoolers'
0.527 : 'hispanic'
0.543 : 'k-12'
0.584 : 'graders'
0.729 : 'math'

'black' is to 'hispanic' as 'math' is to ___
0.545 : 'maths'
0.550 : 'graders'
0.589 : 'k-12'
0.619 : 'literacy'
0.749 : 'math'

'native-american' is to 'hispanic' as 'math' is to ___
0.663 : 'school'
0.683 : 'students'
0.706 : 'latino'
0.749 : 'math'
0.762 : 'hispanic'

'white' is to 'hispanic' as 'math' is to ___
0.538 : 'learners'
0.542 : 'maths'
0.558 : 'k-12'
0.631 : 'literacy'
0.738 : 'math'



In [48]:
# Singular group terms against "black", completing the analogy for "science".
# The first entry (a == b) is the baseline.
for a_word in ("black", "asian", "white", "hispanic", "native-american"):
    show_analogy(hands, a_word, "black", "science")

'black' is to 'black' as 'science' is to ___
0.761 : 'mathematics'
0.766 : 'institute'
0.791 : 'physics'
0.807 : 'sciences'
1.000 : 'science'

'asian' is to 'black' as 'science' is to ___
0.566 : 'fiction'
0.567 : 'professor'
0.597 : 'black'
0.619 : 'journalism'
0.691 : 'science'

'white' is to 'black' as 'science' is to ___
0.709 : 'mathematics'
0.712 : 'psychology'
0.715 : 'physics'
0.760 : 'sciences'
0.897 : 'science'

'hispanic' is to 'black' as 'science' is to ___
0.574 : 'professor'
0.591 : 'university'
0.596 : 'master'
0.641 : 'physics'
0.747 : 'science'

'native-american' is to 'black' as 'science' is to ___
0.698 : 'university'
0.708 : '.'
0.725 : 'well'
0.781 : 'black'
0.787 : 'science'



In [49]:
# Group terms against "african-american", completing the analogy for
# "science". The first entry (a == b) is the baseline.
for a_word in (
    "african-american",
    "asian",
    "hispanic",
    "native-american",
    "caucasian",
):
    show_analogy(hands, a_word, "african-american", "science")

'african-american' is to 'african-american' as 'science' is to ___
0.761 : 'mathematics'
0.766 : 'institute'
0.791 : 'physics'
0.807 : 'sciences'
1.000 : 'science'

'asian' is to 'african-american' as 'science' is to ___
0.587 : 'journalism'
0.589 : 'psychology'
0.589 : 'african-american'
0.612 : 'sociology'
0.632 : 'science'

'hispanic' is to 'african-american' as 'science' is to ___
0.630 : 'professor'
0.632 : 'chemistry'
0.664 : 'mathematics'
0.697 : 'physics'
0.791 : 'science'

'native-american' is to 'african-american' as 'science' is to ___
0.725 : 'institute'
0.731 : 'arts'
0.731 : 'studies'
0.743 : 'university'
0.847 : 'science'

'caucasian' is to 'african-american' as 'science' is to ___
0.648 : 'psychology'
0.649 : 'university'
0.656 : 'professor'
0.660 : 'arts'
0.819 : 'science'



In [62]:

# "<a> is to <b> as 'strong' is to ___" for each (a, b) pair below.
# Rows tagged #!! are the ones flagged in the original cell as notable.
strong_pairs = [
    ("african-american", "african-american"),
    ("african-american", "asian"),
    ("african-american", "hispanic"),   #!!
    ("african-american", "caucasian"),  #!!
    ("blacks", "asians"),               #!!
    ("blacks", "hispanics"),
    ("blacks", "whites"),               #!!
]
for a_word, b_word in strong_pairs:
    show_analogy(hands, a_word, b_word, "strong")

'african-american' is to 'african-american' as 'strong' is to ___
0.744 : 'solid'
0.763 : 'weak'
0.767 : 'despite'
0.815 : 'stronger'
1.000 : 'strong'

'african-american' is to 'asian' as 'strong' is to ___
0.697 : 'markets'
0.714 : 'asia'
0.714 : 'stronger'
0.739 : 'asian'
0.754 : 'strong'

'african-american' is to 'hispanic' as 'strong' is to ___
0.664 : 'more'
0.665 : 'likely'
0.673 : 'weak'
0.717 : 'stronger'
0.824 : 'strong'

'african-american' is to 'caucasian' as 'strong' is to ___
0.560 : 'russia'
0.571 : 'clear'
0.578 : 'stronger'
0.599 : 'weak'
0.669 : 'strong'

'blacks' is to 'asians' as 'strong' is to ___
0.593 : 'weaker'
0.647 : 'robust'
0.648 : 'weak'
0.705 : 'stronger'
0.793 : 'strong'

'blacks' is to 'hispanics' as 'strong' is to ___
0.634 : 'weak'
0.650 : 'strongest'
0.668 : 'robust'
0.686 : 'stronger'
0.808 : 'strong'

'blacks' is to 'whites' as 'strong' is to ___
0.669 : 'solid'
0.678 : 'moderate'
0.700 : 'weak'
0.719 : 'stronger'
0.869 : 'strong'

