In [1]:
import urllib.request
import numpy as np
import pandas as pd
import pickle
import utils
import further_embeddings as data # takes a while because it loads all embedding datasets

successfully loaded utils


In [2]:
# Load the pickled embedding matrices compared in this notebook.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that come from a trusted source.
# Each file is opened in a `with` block so the handle is closed even when
# unpickling raises.

# the two debiased embeddings obtained by our implementation of Double-Hard Debias
with open('debiased_1/debiased_1.p', 'rb') as file_1:
    debiased_equal = pickle.load(file_1)
with open('debiased_2/debiased_2.p', 'rb') as file_2:
    debiased_fem_male = pickle.load(file_2)
# the Double-Hard debiased embeddings obtained by Wang et al. (2020)
with open('glove_dhd.p', 'rb') as file_3:
    glove_dhd_wang = pickle.load(file_3)

In [None]:
# Maps a display name to an (embedding, vocab, w2id) triple.
# The original GloVe embedding, vocabulary and word index are read explicitly
# from the `data` module instead of from bare `embedding` / `vocab` / `w2id`
# globals, which are not defined on a fresh kernel and can be rebound by loops
# in other cells.
# All entries use data.vocab / data.w2id -- presumably the debiased embeddings
# share the original GloVe vocabulary; verify against how they were produced.
dict_embeddings = {
    'original_glove': (data.embedding, data.vocab, data.w2id),
    'glove_dhd_wang': (glove_dhd_wang, data.vocab, data.w2id),
    'debiased_equal': (debiased_equal, data.vocab, data.w2id),
    'debiased_fem_male': (debiased_fem_male, data.vocab, data.w2id),
}

## Word Embedding Association Test
Implementation taken from https://github.com/shivaomrani/HumanBiasInSemantics with minor adjustments such as variable names for readability. See file `weat.py`.

In [3]:
from weat import weat
# access the embedding once we have it available as a file

In [4]:
# Career and family: word lists for the first WEAT test
# (male/female first names vs. career/family terms).
# "bill" was changed to "tom", as in the paper, to avoid ambiguity.
male_names = [
    "john", "paul", "mike", "kevin",
    "steve", "greg", "jeff", "tom",
]
female_names = [
    "amy", "joan", "lisa", "sarah",
    "diana", "kate", "ann", "donna",
]
career_attributes = [
    "executive", "management", "professional", "corporation",
    "salary", "office", "business", "career",
]
family_attributes = [
    "home", "parents", "children", "family",
    "cousins", "marriage", "wedding", "relatives",
]

In [5]:
# Math and arts: word lists for the second WEAT test
# (math/arts terms vs. male/female terms).
math_words = [
    "math", "algebra", "geometry", "calculus",
    "equations", "computation", "numbers", "addition",
]
arts_words1 = [
    "poetry", "art", "dance", "literature",
    "novel", "symphony", "drama", "sculpture",
]
male_attributes1 = [
    "male", "man", "boy", "brother",
    "he", "him", "his", "son",
]
female_attributes1 = [
    "female", "woman", "girl", "sister",
    "she", "her", "hers", "daughter",
]

In [6]:
# Science and arts: word lists for the third WEAT test
# (science/arts terms vs. male/female terms).
science_words = [
    "science", "technology", "physics", "chemistry",
    "einstein", "nasa", "experiment", "astronomy",
]
arts_words2 = [
    "poetry", "art", "shakespeare", "dance",
    "literature", "novel", "symphony", "drama",
]
male_attributes2 = [
    "brother", "father", "uncle", "grandfather",
    "son", "he", "his", "him",
]
female_attributes2 = [
    "sister", "mother", "aunt", "grandmother",
    "daughter", "she", "hers", "her",
]

In [7]:
# Run the three WEAT tests (career/family, math/arts, science/arts) on every
# embedding and collect effect size (d) and p-value per test in one table.

# Passed unchanged to every weat(...) call -- presumably the number of
# permutations used for the p-value; see weat.py to confirm.
iterations = 100000

# Parallel lists: embeddings[k], w2ids[k] and row_names[k] describe the same embedding.
embeddings = [data.embedding, data.embedding_gn, data.embedding_gn_a, data.embedding_gp, data.embedding_gp_gn, data.embedding_hd, data.embedding_hd_a, debiased_equal, debiased_fem_male, glove_dhd_wang]
w2ids = [data.w2id, data.w2id_gn, data.w2id_gn_a, data.w2id_gp, data.w2id_gp_gn, data.w2id_hd, data.w2id_hd_a, data.w2id, data.w2id, data.w2id]
row_names = ["GloVe", "GN-GloVe", "GN-GloVe(a)", "GP-GloVe", "GP-GN-GloVe", "Hard-GloVe", "Strong-Hard-GloVe", "Double-Hard-GloVe(Equal)", "Double-Hard-GloVe(Fem-Male)", "Double-Hard-Glove(Wang)"]

# Parallel lists: position k holds the four word lists of the k-th WEAT test.
concept1 = [male_names, math_words, science_words]
concept2 = [female_names, arts_words1, arts_words2]
stereotype1 = [career_attributes, male_attributes1, male_attributes2]
stereotype2 = [family_attributes, female_attributes1, female_attributes2]
# Two columns (d, p) per test, in the same order as the tests above.
column_names = ["C & F: d", "C & F: p", "M & A: d", "M & A: p", "S & A: d", "S & A: p"]

# Fail fast if the parallel lists drift out of sync; zip() would otherwise
# silently truncate to the shortest one.
assert len(embeddings) == len(w2ids) == len(row_names)
assert len(concept1) == len(concept2) == len(stereotype1) == len(stereotype2)
assert len(column_names) == 2 * len(concept1)

test_results = []
# Loop names are deliberately distinct from `embedding` / `w2id` so this cell
# does not rebind notebook-level names of the same spelling.
for row_name, emb_matrix, emb_w2id in zip(row_names, embeddings, w2ids):
    print(row_name)
    result = []
    for targets_a, targets_b, attributes_a, attributes_b in zip(concept1, concept2, stereotype1, stereotype2):
        wea_test = weat(targets_a, targets_b, attributes_a, attributes_b, iterations, emb_matrix, emb_w2id)
        pvalue, effect_size, _ = wea_test.getPValueAndEffect()
        # Stored as (d, p) to match the column order in column_names.
        result.extend([effect_size, pvalue])
    test_results.append(result)

test_results = np.array(test_results).astype(float)
df = pd.DataFrame(data=test_results, index=row_names, columns=column_names)
df

GloVe


NameError: name 'w2ids' is not defined

## Word Analogy
The word analogy task is to find a word *D* such that "*A* is to *B* as *C* is to *D*". Wang et al. (2020) evaluated all non-debiased and debiased embeddings on the MSR word analogy task ([Mikolov et al., 2013a](https://www.aclweb.org/anthology/N13-1090/)) as well as on a second, Google word analogy dataset ([Mikolov et al., 2013b](https://arxiv.org/abs/1301.3781v3)). The evaluation metric is the percentage of questions for which the algorithm assigns the maximum score to the correct answer. The analogy task is used to show whether a debiasing method preserves the desired distance relations between words. The implementation was taken from Wang et al. (2020), with some adjustments to get it running; see `analogy_tasks.py`.

In [None]:
import analogy_tasks as ana

### MSR

In [19]:
# Run the MSR analogy evaluation once per entry of dict_embeddings,
# printing the entry's name before its result.
for key in dict_embeddings:
    embedding, vocab, w2id = dict_embeddings[key]
    print(key, ": ")
    ana.evaluate_analogy_msr(embedding, vocab, w2id)

4884
ACCURACY TOP1-MSR: 54.40% (2657/4884)


In [20]:
# ana.evaluate_analogy_msr(debiased_1, data.vocab, data.w2id)

In [21]:
# Evaluate the embedding stored in data.embedding on the Google analogy dataset,
# using the vocabulary and word index from `data`; the per-category and overall
# top-1 accuracies are printed as the cell output.
ana.evaluate_analogy_google(data.embedding, data.vocab, data.w2id)

capital-common-countries.txt:
ACCURACY TOP1: 98.95% (283/286)
capital-world.txt:
ACCURACY TOP1: 94.69% (1409/1488)
currency.txt:
ACCURACY TOP1: 7.63% (18/236)
city-in-state.txt:
ACCURACY TOP1: 77.49% (1855/2394)
family.txt:
ACCURACY TOP1: 71.67% (301/420)
gram1-adjective-to-adverb.txt:
ACCURACY TOP1: 9.25% (86/930)
gram2-opposite.txt:
ACCURACY TOP1: 29.22% (135/462)
gram3-comparative.txt:
ACCURACY TOP1: 78.68% (1048/1332)
gram4-superlative.txt:
ACCURACY TOP1: 46.55% (378/812)
gram5-present-participle.txt:
ACCURACY TOP1: 47.42% (441/930)
gram6-nationality-adjective.txt:
ACCURACY TOP1: 93.23% (1418/1521)
gram7-past-tense.txt:
ACCURACY TOP1: 34.87% (544/1560)
gram8-plural.txt:
ACCURACY TOP1: 75.46% (898/1190)
gram9-plural-verbs.txt:
ACCURACY TOP1: 45.57% (370/812)
Questions seen/total: 73.54% (14373/19544)
Semantic accuracy: 80.14%  (3866/4824)
Syntactic accuracy: 55.69%  (5318/9549)
Total accuracy: 63.90%  (9184/14373)


In [22]:
# ana.evaluate_analogy_google(debiased_2, data.vocab, data.w2id)