## Code ([skip to narrative](#Outline))

In [None]:
from pathlib import Path

from bokeh.plotting import output_notebook, figure, ColumnDataSource, show
from bokeh.core.properties import value
from bokeh.models import Arrow, NormalHead
from bokeh.layouts import gridplot
from IPython.core.display import display, HTML

from util import read_word_list, read_word_groups
from training import create_fasttext_model

# Configure Bokeh so that subsequent show() calls render inline in the notebook.
output_notebook()

In [None]:
def _new_section():
    """Create a callable that displays consecutively numbered section headings.

    The returned function takes a title, displays it as an ``<h3>`` heading
    prefixed with the current section number, then advances the section
    counter and resets the subsection counter to 1. The counters live in the
    closure, so every call to this factory starts a fresh numbering at 1.
    """
    counters = dict(section=1)

    def _display_new_section(title):
        number = counters['section']
        heading = HTML(f'<h3>{number}. {title}</h3>')
        display(heading)
        # Only advance once the heading has actually been displayed.
        counters.update(section=number + 1, subsection=1)

    return _display_new_section


# Notebook-wide heading helper; numbering starts at 1.
new_section = _new_section()

def pca_diagram():
    """Display a three-panel Bokeh illustration of a 1D gender direction in 2D.

    Each hard-coded word is placed on the unit circle at a given fraction of a
    full turn. The panels, laid out left to right in a single grid row, are:

    1. the words on the unit circle, with an arrow from each pair's midpoint
       (marked with an "x") to each of the pair's two words;
    2. the same arrows translated so that every pair's midpoint sits at the
       origin, plus a red line through the origin along the averaged
       direction;
    3. panel 1 again, overlaid with the same red line.

    The red direction is not produced by a PCA routine: it is the mean of the
    recentred vectors of the first word of each pair.

    Returns:
        None. The grid of figures is displayed via ``show`` as a side effect.
    """
    import math

    pairs = [
        [['she', .9], ['he', .65]],
        [['woman', .95], ['man', .55]],
    ]

    def new_figure():
        # All panels share the same size and axis limits so they line up.
        return figure(width=300, height=300, x_range=[-1.1, 1.1], y_range=[-1.1, 1.1])

    def circle_point(fraction):
        # Point on the unit circle after `fraction` of a turn; 0 maps to (0, 1).
        angle = fraction * 2 * math.pi
        return math.sin(angle), math.cos(angle)

    def pair_midpoint(pair):
        # Mean position of the two words of a pair.
        points = [circle_point(fraction) for _, fraction in pair]
        return (
            sum(x for x, _ in points) / 2,
            sum(y for _, y in points) / 2,
        )

    def draw_cross(fig, point):
        # Mark a reference point (a pair midpoint or the origin) with an "x".
        fig.x(x=point[0], y=point[1], line_width=3, size=10, color='#000000')

    def draw_word_arrow(fig, start, end, word):
        # Black segment with an arrow head from `start` to `end`, labelled at `end`.
        fig.line(x=[start[0], end[0]], y=[start[1], end[1]], line_color='#000000')
        fig.add_layout(Arrow(
            x_start=start[0], y_start=start[1],
            x_end=end[0], y_end=end[1],
            end=NormalHead(size=10),
        ))
        fig.text(x=end[0], y=end[1], text=value(word))

    def embedding_panel():
        # Unit circle plus, for every pair, its midpoint and an arrow to each word.
        fig = new_figure()
        fig.line(
            x=[math.sin(i * 2 * math.pi / 200) for i in range(200)],
            y=[math.cos(i * 2 * math.pi / 200) for i in range(200)],
        )
        for pair in pairs:
            midpoint = pair_midpoint(pair)
            draw_cross(fig, midpoint)
            for word, fraction in pair:
                draw_word_arrow(fig, midpoint, circle_point(fraction), word)
        return fig

    embedding_fig = embedding_panel()

    # Middle panel: every word vector expressed relative to its pair's midpoint.
    origin = (0, 0)
    pca_fig = new_figure()
    draw_cross(pca_fig, origin)
    first_word_offsets = []
    for pair in pairs:
        mid_x, mid_y = pair_midpoint(pair)
        for index, (word, fraction) in enumerate(pair):
            x_pos, y_pos = circle_point(fraction)
            offset = (x_pos - mid_x, y_pos - mid_y)
            draw_word_arrow(pca_fig, origin, offset, word)
            if index == 0:
                first_word_offsets.append(offset)

    # FIXME: marker kept from the original, which did not say what needs fixing.
    # NOTE(review): possibly because this averages the first word's offset of
    # each pair instead of running a real PCA - confirm with the author.
    pca_vector = [
        sum(x for x, _ in first_word_offsets) / len(pairs),
        sum(y for _, y in first_word_offsets) / len(pairs),
    ]

    def draw_component_line(fig):
        # Red line through the origin along `pca_vector`; it extends far beyond
        # the visible axis range, so it spans the whole panel.
        fig.line(
            x=[i * pca_vector[0] for i in range(-200, 200)],
            y=[i * pca_vector[1] for i in range(-200, 200)],
            line_color='#C40000',
        )

    draw_component_line(pca_fig)

    # Right panel: the original embedding with the derived direction on top.
    gender_fig = embedding_panel()
    draw_component_line(gender_fig)

    show(gridplot([[embedding_fig, pca_fig, gender_fig]]))

## Outline

Load in some data first

In [None]:
# Corpus directory passed to create_fasttext_model for the baseline model.
corpus_path = Path('corpora/wikipedia-1')
# load in the relevant data
# Word groups read from the "definitional" gender-pairs file.
gender_pairs_file = Path('data/gender-pairs/definitional')
gender_pairs = read_word_groups(gender_pairs_file)
# Word list read from the "gender_specific_seed" file.
gendered_words = read_word_list(Path('data/gendered-words/gender_specific_seed'))
# Word groups read from the "equalize" gender-pairs file.
equalize_pairs = read_word_groups(Path('data/gender-pairs/equalize'))

In [None]:
# Display the next numbered <h3> section heading in the notebook output.
new_section('The Bolukbasi Model')

Let's take a look at gender in the un-debiased baseline model:

In [None]:
# Baseline (un-debiased) embedding; read later by gender_variance().
# NOTE(review): presumably trains a fastText model on the corpus and stores or
# reuses it at out_path - confirm in training.create_fasttext_model.
baseline_model = create_fasttext_model(
    corpus_path,
    out_path=Path('models/paper-baseline.w2v')
)

In their 2017 paper, Bolukbasi et al. use the following words to define the gender subspace:

In [None]:
# Bare expression on the last line of the cell, so the notebook displays its value.
gender_pairs

To calculate the gender subspace, they first make all the word vectors relative to the midpoint of each pair of words. The first principal component of those vectors is the gender subspace. The diagram below shows a simplified example, defining a 1D gender subspace in a 2D embedding, with the first principal component shown in red:

In [None]:
# Show the three side-by-side figures built by pca_diagram().
pca_diagram()

Since the gender pairs are not exactly aligned, the more gender pairs that are used, the more principal components there will be. Based on the definitional gender pairs used by Bolukbasi et al., PCA results in the following proportion of variance explained by each component:

In [None]:
def gender_variance():
    """Return the share of variance explained by each gender principal component.

    For every hard-coded (male, female) word pair that is present in the
    module-level ``baseline_model``, the two word vectors are passed through
    ``recenter`` and the resulting rows are stacked into one matrix. A PCA
    with default settings is fitted to that matrix.

    NOTE(review): ``recenter`` presumably subtracts the pair's mean from both
    vectors - confirm in ``linalg.recenter``.
    NOTE(review): only the three pairs listed below are used, not the
    module-level ``gender_pairs`` loaded from disk.

    Returns:
        The fitted PCA's ``explained_variance_ratio_`` array: one fraction per
        component, in component order.

    Raises:
        ValueError: if none of the word pairs is present in ``baseline_model``,
            so there is nothing to fit.
    """
    from linalg import recenter
    import numpy as np
    from sklearn.decomposition import PCA

    word_pairs = [['man', 'woman'], ['boy', 'girl'], ['he', 'she']]

    rows = []
    for male_word, female_word in word_pairs:
        # Skip pairs the model cannot represent in full.
        if male_word not in baseline_model or female_word not in baseline_model:
            continue
        rows.extend(recenter(
            np.array([baseline_model[male_word], baseline_model[female_word]])
        ))
    if not rows:
        # Fail with a clear message instead of an opaque error from PCA.fit.
        raise ValueError('none of the gender word pairs is present in baseline_model')

    pca = PCA()
    pca.fit(np.array(rows))
    # Each component's variance divided by the total variance.
    return pca.explained_variance_ratio_

# Print each component's fraction of the variance as a percentage with two
# decimals, numbering the components from 1.
for i, variance in enumerate(gender_variance(), start=1):
    print(f'Component {i}: {variance:.2%}')

In [None]:
# Display the next numbered <h3> section heading in the notebook output.
new_section('Gender is not one-dimensional')

In [None]:
# Display the next numbered <h3> section heading in the notebook output.
new_section('Rejection degrades performance')

In [None]:
# Display the next numbered <h3> section heading in the notebook output.
new_section('Approach is valid')

The first thing we want to do is to show that this swapping approach works. Unfortunately, the previous evaluation approach cannot be used to evaluate a swapped model, so we need a new analogical evaluation. The expected results look something like this:

```
METRIC          BASELINE      SWAPPED       BOLUKBASI
projection
    adjectives  0.0136640928  0.0311164421  0.0000898000
    occupation  0.0096600337  0.0103131604  0.0000000066
analogy
    adjectives  0.1079770610  0.0031673744  0.0000470000
    occupation  0.0759830767  0.0007209567  0.0000000016
```

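For the purpose of this small experiment, we will use a small Wikipedia corpus, and the definitional gender pairs, gender-specific seed words, and equalize pairs loaded in the cell below.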

In [None]:
def validate_approach():
    """Draft: build the baseline, Bolukbasi-debiased, and swapped-corpus models.

    The three models are bound to local names only; nothing is evaluated or
    returned yet, so the function returns None.

    NOTE(review): ``read_gender_pairs``, ``load_word2vec_embedding``,
    ``debias_bolukbasi_original`` and ``create_randomized_swapped_corpus`` are
    not imported in the visible part of this file, so calling this function
    raises NameError unless they are defined elsewhere. The top-level cells
    read the same two gender-pairs files with ``read_word_groups`` - confirm
    whether that is the intended replacement for ``read_gender_pairs``.
    NOTE(review): these calls pass ``outpath=`` while ``create_fasttext_model``
    is called with ``out_path=`` elsewhere in this file - confirm the keyword
    name each helper expects.
    """
    # settings
    corpus_path = Path('corpora/wikipedia-1')
    # load in the relevant data
    gender_pairs = read_gender_pairs(Path('data/gender-pairs/definitional'))
    # Named to match its use in the debias call below. The original bound this
    # to `gender_words`, so the call silently used the module-level
    # `gendered_words` instead of the list loaded here.
    gendered_words = read_word_list(Path('data/gendered-words/gender_specific_seed'))
    equalize_pairs = read_gender_pairs(Path('data/gender-pairs/equalize'))
    # create the models
    baseline_model = load_word2vec_embedding(
        corpus_path,
        outpath=Path('models/paper-validation-baseline.w2v')
    )
    bolukbasi_model = debias_bolukbasi_original(
        baseline_model,
        gender_pairs,
        gendered_words,
        equalize_pairs,
        outpath=Path('models/paper-validation-bolukbasi.w2v'),
    )
    # The swapped model is built from a corpus derived from the original one.
    swapped_model = load_word2vec_embedding(
        create_randomized_swapped_corpus(
            corpus_path,
            gender_pairs,
            outpath=Path('corpora/wikipedia-1-paper-swapped'),
        ),
        outpath=Path('models/paper-validation-swapped.w2v')
    )

# validate_approach()