In [19]:
import urllib.request
import numpy as np

### Load further embeddings

In [20]:
def read_embedding(url, skip_first = False):
    """Function to read out an embedding
    Input: url: url to embedding
    
    Returns: vocab: list of words in the embedding
             w2id: dictionary mapping words to ids
             embedding: array storing the word vectors,
                           row corresponds to word id"""
    # Open url
    data = urllib.request.urlopen(url)
    vocab = []
    embedding = []
    
    # Each line contains one word and its embedding
    for i, line in enumerate(data):
        if skip_first:
            if i == 0:
                continue
        #if len(line) == 301:
        line = line.decode()
        # Split by spaces
        split = line.split()
        # First element(== the word) is added to vocabulary
        vocab.append(split[0])
        # All other elements(embedding vectors) are added to vectors
        embedding.append([float(elem) for elem in split[1:]])
    
    # Create a dictionary with word-id pairs based on the order
    w2id = {w: i for i, w in enumerate(vocab)}
    # Vectors are converted into an array
    embedding = np.array(embedding).astype(float)
    
    return vocab, w2id, embedding

In [21]:
embedding_gn_url = "http://www.cs.virginia.edu/~tw8cb/word_embeddings/vectors300.txt"
vocab_gn_original, w2id_gn_original, embedding_gn_original = read_embedding(embedding_gn_url)
vocab_gn, w2id_gn, embedding_gn = restrict_vocab(vocab_gn_original, w2id_gn_original, embedding_gn_original)

ValueError: setting an array element with a sequence.

In [7]:
def debias_gn(wv):
    for v in wv:
        assert(len(v) == 300)
    
    wv = wv[:,:-1]

    for v in wv:
        assert(len(v) == 299)
    return wv

vocab_gn_a = vocab_gn
w2id_gn_a = w2id_gn
embedding_gn_a = debias_gn(embedding_gn)

In [None]:
embedding_hd_url = "http://www.cs.virginia.edu/~tw8cb/word_embeddings/vectors_hd.txt"
vocab_hd_original, w2id_hd_original, embedding_hd_original = read_embedding(embedding_hd_url)
vocab_hd, w2id_hd, embedding_hd = restrict_vocab(vocab_hd_original, w2id_hd_original, embedding_hd_original)

In [None]:
embedding_hd_a_url = "http://www.cs.virginia.edu/~tw8cb/word_embeddings/vectors_hd_a.txt"
vocab_hd_a_original, w2id_hd_a_original, embedding_hd_a_original = read_embedding(embedding_hd_a_url)
vocab_hd_a, w2id_hd_a, embedding_hd_a = restrict_vocab(vocab_hd_a_original, w2id_hd_a_original, embedding_hd_a_original)

In [None]:
embedding_gp_url = "http://www.cs.virginia.edu/~tw8cb/word_embeddings/gp_glove.txt"
vocab_gp_original, w2id_gp_original, embedding_gp_original = read_embedding(embedding_gp_url, skip_first = True)
vocab_gp, w2id_gp, embedding_gp = restrict_vocab(vocab_gp_original, w2id_gp_original, embedding_gp_original)

In [38]:
embedding_gp_gn_url = "http://www.cs.virginia.edu/~tw8cb/word_embeddings/gp_gn_glove.txt"
vocab_gp_gn_original, w2id_gp_gn_original, embedding_gp_gn_original = read_embedding(embedding_gp_gn_url, skip_first = True)
vocab_gp_gn, w2id_gp_gn, embedding_gp_gn = restrict_vocab(vocab_gp_gn_original, w2id_gp_gn_original, embedding_gp_gn_original)

Original vocab size:  322636
Restricted vocab size:  314952


### Word Embedding Association Test
Implementation taken from "https://github.com/shivaomrani/HumanBiasInSemantics". 
With minor adjustments such as variable names for readability, see file "weat.py".

In [11]:
from utils import weat
# auf embedding zugreifen, wenn wir das als .py haben

In [6]:
# Career and family
# Change from Bill to Tom as in paper to avoid ambiguity
male_names = ["john", "paul", "mike", "kevin", "steve", "greg", "jeff", "tom"]
female_names = ["amy", "joan", "lisa", "sarah", "diana", "kate", "ann", "donna"]
career_attributes = ["executive", "management", "professional", "corporation", "salary", "office", "business", "career"]
family_attributes = ["home", "parents", "children", "family", "cousins", "marriage", "wedding", "relatives"]

In [7]:
# Math and arts
math_words = ["math", "algebra", "geometry", "calculus", "equations", "computation", "numbers", "addition"]
arts_words1 = ["poetry", "art", "dance", "literature", "novel", "symphony", "drama", "sculpture"]
male_attributes1 = ["male", "man", "boy", "brother", "he", "him", "his", "son"]
female_attributes1 = ["female", "woman", "girl", "sister", "she", "her", "hers", "daughter"]

In [8]:
# Science and arts
science_words = ["science", "technology", "pyhsics", "chemistry", "einstein", "nasa", "experiment", "astronomy"]
arts_words2 = ["poetry", "art", "shakespeare", "dance", "literature", "novel", "symphony", "drama"]
male_attributes2 = ["brother", "father", "uncle", "grandfather", "son", "he", "his", "him"]
female_attributes2 = ["sister", "mother", "aunt", "grandmother", "daughter", "she", "hers", "her"]

In [None]:
iterations = 100000
wea_test = weat(male_names, female_names, career_attributes, family_attributes, iterations, embedding, w2id)
pvalue, effect_size, sd = wea_test.getPValueAndEffect()
print("p-value: ", pvalue)
print("effect size: ", effect_size)
print("standard deviation: ", sd)

### Word Analogy
The word analogy task is to find a word *D* such that "*A* is to *B* as *C* is to *D*". Wang et al. (2020) have evaluated all non-debiased and debiased embeddings on the MSR word analogy task (Mikolov et al., 2013 Ling) as well as on a second Google word analogy dataset (Mikolov et al. 2013 Eff). The evaluation metric is the percentage of questions for which the correct answer is assigned the maximum score by the algorithm. The analogy task is used to show whether a debiasing method is capable of preserving desired distance relations between words.

In [23]:
from word_analogy import analogy_tasks

ModuleNotFoundError: No module named 'web'