In [33]:
from pathlib import Path

from IPython.core.display import display, HTML

from experiments import read_word_list, read_gender_pairs
from experiments import load_word2vec_embedding, debias_bolukbasi_original, create_randomized_swapped_corpus

In [34]:
def _new_section():
    """Build a callable that renders auto-numbered section headings.

    The numbering lives in a closure, so every call to this factory yields
    an independent counter that starts at 1.

    Returns:
        A function taking a heading ``title``. It displays the title as an
        HTML ``<h3>`` element prefixed with the current section number and
        then advances the counter.
    """
    counters = dict(section=1)

    def _display_new_section(title):
        """Display ``title`` as the next numbered ``<h3>`` heading."""
        number = counters['section']
        display(HTML(f'<h3>{number}. {title}</h3>'))
        # Move on to the next section number and reset the subsection
        # counter to 1 (nothing in this block reads the subsection value).
        counters.update(section=number + 1, subsection=1)

    return _display_new_section


# Single heading helper for the notebook; all calls share one counter.
new_section = _new_section()

Load in some data first

In [39]:
# Corpus location handed to the embedding loader in later cells.
corpus_path = Path('corpora/wikipedia-1')

# Gender-specific seed word list.
gendered_words = read_word_list(
    Path('data/gendered-words/gender_specific_seed'),
)

# Gender pair lists: the definitional pairs (path kept in its own variable)
# and the equalize pairs.
gender_pairs_file = Path('data/gender-pairs/definitional')
gender_pairs = read_gender_pairs(gender_pairs_file)
equalize_pairs = read_gender_pairs(
    Path('data/gender-pairs/equalize'),
)

In [35]:
# Emit the next auto-numbered <h3> heading for this part of the notebook.
new_section('Gender is not one-dimensional')

Let's take a look at the baseline word2vec embedding built from the Wikipedia corpus.

In [None]:
# Build/load the baseline embedding for the corpus selected above.
# NOTE(review): this call passes `out_path=`, whereas validate_approach()
# passes `outpath=` to the same helper -- one of the two spellings is
# presumably wrong; confirm against the signature in `experiments`.
baseline_model = load_word2vec_embedding(
    corpus_path,
    out_path=Path('models/paper-validation-baseline.w2v')
)


In [36]:
# Emit the next auto-numbered <h3> heading for this part of the notebook.
new_section('Rejection degrades performance')

In [37]:
# Emit the next auto-numbered <h3> heading for this part of the notebook.
new_section('Approach is valid')

The first thing we want to do is to show that this swapping approach works. Unfortunately, the previous evaluation approach cannot be used to evaluate a swapped model, so we need a new analogical evaluation. The expected results look something like this:

```
METRIC          BASELINE      SWAPPED       BOLUKBASI
projection
    adjectives  0.0136640928  0.0311164421  0.0000898000
    occupation  0.0096600337  0.0103131604  0.0000000066
analogy
    adjectives  0.1079770610  0.0031673744  0.0000470000
    occupation  0.0759830767  0.0007209567  0.0000000016
```

For the purpose of this small experiment, we will use a small Wikipedia corpus, and the definitional gender pairs, gender-specific seed words, and equalize pairs stored under `data/`.

In [38]:
def validate_approach():
    """Build the three embeddings compared in the validation table.

    Reads the definitional gender pairs, the gender-specific seed words and
    the equalize pairs, then constructs:

    * a baseline embedding from the ``corpora/wikipedia-1`` corpus,
    * a debiased embedding derived from the baseline via
      ``debias_bolukbasi_original``, and
    * an embedding built from the corpus returned by
      ``create_randomized_swapped_corpus``.

    All inputs are loaded locally, so the function does not rely on
    variables defined in other notebook cells.

    Returns:
        None. The models are only bound to local names; presumably the
        helpers persist them at the given ``outpath`` locations -- TODO
        confirm against ``experiments``.
    """
    # settings
    corpus_path = Path('corpora/wikipedia-1')
    # load in the relevant data
    gender_pairs = read_gender_pairs(Path('data/gender-pairs/definitional'))
    # Bound as `gendered_words` (previously `gender_words`) so the debiasing
    # call below uses this local list instead of silently falling back to a
    # notebook-level global of that name.
    gendered_words = read_word_list(Path('data/gendered-words/gender_specific_seed'))
    equalize_pairs = read_gender_pairs(Path('data/gender-pairs/equalize'))
    # create the models
    # NOTE(review): the keyword is spelled `outpath` here but `out_path` in
    # the stand-alone baseline cell of this notebook -- confirm which one the
    # helpers in `experiments` actually accept.
    baseline_model = load_word2vec_embedding(
        corpus_path,
        outpath=Path('models/paper-validation-baseline.w2v')
    )
    bolukbasi_model = debias_bolukbasi_original(
        baseline_model,
        gender_pairs,
        gendered_words,
        equalize_pairs,
        outpath=Path('models/paper-validation-bolukbasi.w2v'),
    )
    swapped_model = load_word2vec_embedding(
        create_randomized_swapped_corpus(
            corpus_path,
            gender_pairs,
            outpath=Path('corpora/wikipedia-1-paper-swapped'),
        ),
        outpath=Path('models/paper-validation-swapped.w2v')
    )

# validate_approach()