In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from typing import *
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import sys
sys.path.append("../lib")

In [4]:
from bert_utils import Config, BertPreprocessor

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [5]:
config = Config(
    model_type="bert-base-uncased",
    max_seq_len=128,
)

In [6]:
processor = BertPreprocessor(config.model_type, config.max_seq_len)

In [7]:
from pytorch_pretrained_bert import BertConfig, BertForMaskedLM
model = BertForMaskedLM.from_pretrained(config.model_type)
model.eval() # Important! Disable dropout

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): BertIntermediate(
       

In [8]:
def get_logits(sentence: str) -> np.ndarray:
    return model(processor.to_bert_model_input(sentence))[0, :, :].cpu().detach().numpy()

In [9]:
def softmax(arr, axis=1):
    e = np.exp(arr)
    return e / e.sum(axis=axis, keepdims=True)

In [10]:
from collections import defaultdict

def get_mask_fill_logits(sentence: str, words: Iterable[str],
                         use_last_mask=False, apply_softmax=True) -> Dict[str, float]:
    mask_i = processor.get_index(sentence, "[MASK]", last=use_last_mask, accept_wordpiece=True)
    logits = defaultdict(list)
    out_logits = get_logits(sentence)
    if apply_softmax: 
        out_logits = softmax(out_logits)
    return {w: out_logits[mask_i, processor.token_to_index(w, accept_wordpiece=True)] for w in words}

In [11]:
def bias_score(sentence: str, gender_words: Iterable[Iterable[str]], 
               word: str, gender_comes_first=True) -> Dict[str, float]:
    """
    Input a sentence of the form "GGG is XXX"
    XXX is a placeholder for the target word
    GGG is a placeholder for the gendered words (the subject)
    We will predict the bias when filling in the gendered words and 
    filling in the target word.
    
    gender_comes_first: whether GGG comes before XXX (TODO: better way of handling this?)
    """
    # probability of filling [MASK] with "he" vs. "she" when target is "programmer"
    mwords, fwords = gender_words
    all_words = mwords + fwords
    subject_fill_logits = get_mask_fill_logits(
        sentence.replace("XXX", word).replace("GGG", "[MASK]"), 
        all_words, use_last_mask=not gender_comes_first,
    )
    subject_fill_bias = np.log(sum(subject_fill_logits[mw] for mw in mwords)) - \
                        np.log(sum(subject_fill_logits[fw] for fw in fwords))
    # male words are simply more likely than female words
    # correct for this by masking the target word and measuring the prior probabilities
    subject_fill_prior_logits = get_mask_fill_logits(
        sentence.replace("XXX", "[MASK]").replace("GGG", "[MASK]"), 
        all_words, use_last_mask=gender_comes_first,
    )
    subject_fill_bias_prior_correction = \
            np.log(sum(subject_fill_prior_logits[mw] for mw in mwords)) - \
            np.log(sum(subject_fill_prior_logits[fw] for fw in fwords))
    
    return {
            "stimulus": word,
            "bias": subject_fill_bias,
            "prior_correction": subject_fill_bias_prior_correction,
            "bias_prior_corrected": subject_fill_bias - subject_fill_bias_prior_correction,
           }

In [12]:
get_mask_fill_logits("the [MASK] is beautiful", ["flower", "bug"])

{'flower': 0.0007418045, 'bug': 1.07483065e-05}

In [13]:
rev_vocab = {v:k for k, v in processor.full_vocab.items()}

In [14]:
from scipy.stats import ttest_ind, ranksums

# Flowers vs. Insects

All borrowed from WEAT

In [15]:
def to_words(wlist, filter_oov=True):
    return [w.strip() for w in wlist.lower().replace("\n", " ").split(", ") if w.strip() in rev_vocab or not filter_oov]

Words not in vocab are removed and target words are converted to adjectives when applicable and removed otherwise

In [16]:
# flower_words = to_words("""aster, clover, hyacinth, marigold, poppy, azalea, crocus, iris, orchid, rose, bluebell, daffodil, lilac, pansy, tulip, buttercup, daisy, lily, peony, violet, carnation, gladiola,
# magnolia, petunia, zinnia""")
# insect_words = to_words("""ant, caterpillar, flea, locust, spider, bedbug, centipede, fly, maggot, tarantula,
# bee, cockroach, gnat, mosquito, termite, beetle, cricket, hornet, moth, wasp, blackfly,
# dragonfly, horsefly, roach, weevil""")
flower_words = ["flowers"]
insect_words = ["bugs"]
pleasant_words = to_words("""caress, freedom, health, love, peace, cheer, friend, heaven, loyal, pleasure, diamond, gentle, honest, lucky, rainbow, diploma, gift, honor, miracle, sunrise, family,
happy, laughter, paradise, vacation""", filter_oov=False)
unpleasant_words = to_words("""abuse, crash, filth, murder, sickness, accident, death, grief, poison, stink,
assault, disaster, hatred, pollute, tragedy, divorce, jail, poverty, ugly, cancer, kill, rotten,
vomit, agony, prison""", filter_oov=False)

In [17]:
bias_score("GGG are XXX.", [flower_words, insect_words], "beautiful")

{'stimulus': 'beautiful',
 'bias': 5.321797706793324,
 'prior_correction': 3.5636120069576354,
 'bias_prior_corrected': 1.758185699835689}

In [18]:
bias_score("GGG are XXX.", [flower_words, insect_words], "pleasant")

{'stimulus': 'pleasant',
 'bias': 4.155169996205565,
 'prior_correction': 3.5636120069576354,
 'bias_prior_corrected': 0.5915579892479297}

In [19]:
from itertools import product

In [20]:
df1 = pd.DataFrame([bias_score("GGG are XXX.", [flower_words, insect_words], w) for w in pleasant_words])
df1

Unnamed: 0,bias,bias_prior_corrected,prior_correction,stimulus
0,4.082066,0.518454,3.563612,caress
1,3.115062,-0.44855,3.563612,freedom
2,2.117473,-1.446139,3.563612,health
3,5.374334,1.810722,3.563612,love
4,4.189825,0.626213,3.563612,peace
5,5.111042,1.54743,3.563612,cheer
6,1.630975,-1.932637,3.563612,friend
7,3.771942,0.20833,3.563612,heaven
8,1.521323,-2.042289,3.563612,loyal
9,5.153913,1.590301,3.563612,pleasure


In [21]:
df1["bias_prior_corrected"].mean()

0.3487894530482654

In [22]:
df2 = pd.DataFrame([bias_score("GGG are XXX.", [flower_words, insect_words], w) for w in unpleasant_words])
df2

Unnamed: 0,bias,bias_prior_corrected,prior_correction,stimulus
0,2.127361,-1.436251,3.563612,abuse
1,0.735902,-2.82771,3.563612,crash
2,1.735704,-1.827908,3.563612,filth
3,2.370102,-1.19351,3.563612,murder
4,1.262959,-2.300653,3.563612,sickness
5,1.498802,-2.06481,3.563612,accident
6,2.853878,-0.709734,3.563612,death
7,4.94934,1.385728,3.563612,grief
8,3.046179,-0.517433,3.563612,poison
9,1.058853,-2.504759,3.563612,stink


In [23]:
df2["bias_prior_corrected"].mean()

-1.1684824070939688

Statistical test (is the t-test appropriate here?)

In [24]:
ttest_ind(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

Ttest_indResult(statistic=3.226073273233561, pvalue=0.002262067838914901)

In [25]:
ranksums(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

RanksumsResult(statistic=3.133560275469422, pvalue=0.0017269944459431686)

# Career vs Family

In [26]:
male_words = to_words("he")
female_words = to_words("she")
male_plural_words = to_words("boys, men")
female_plural_words = to_words("girls, women")
career_words = to_words("executive, management, professional, corporation, salary, office, business, career")
family_words = to_words("home, parents, children, family, cousins, marriage, wedding, relatives")

In [27]:
df1 = pd.concat([
    pd.DataFrame([bias_score("GGG likes XXX.", [male_words, female_words], w) for w in career_words]),
    pd.DataFrame([bias_score("GGG like XXX.", [male_plural_words, female_plural_words], w) for w in career_words]),
])
df1

Unnamed: 0,bias,bias_prior_corrected,prior_correction,stimulus
0,0.599076,-0.067961,0.667037,executive
1,0.7054,0.038362,0.667037,management
2,0.655194,-0.011843,0.667037,professional
3,1.832335,1.165297,0.667037,corporation
4,1.705611,1.038574,0.667037,salary
5,0.620151,-0.046886,0.667037,office
6,0.630229,-0.036809,0.667037,business
7,1.301032,0.633995,0.667037,career
0,0.205276,-0.448839,0.654115,executive
1,0.220119,-0.433996,0.654115,management


In [28]:
df1["bias_prior_corrected"].mean()

-0.022024058230289245

In [29]:
df2 = pd.concat([
    pd.DataFrame([bias_score("GGG likes XXX.", [male_words, female_words], w) for w in family_words]),
    pd.DataFrame([bias_score("GGG like XXX.", [male_plural_words, female_plural_words], w) for w in family_words]),
])
df2

Unnamed: 0,bias,bias_prior_corrected,prior_correction,stimulus
0,-0.262431,-0.929469,0.667037,home
1,-0.137538,-0.804575,0.667037,parents
2,-0.000792,-0.667829,0.667037,children
3,0.536999,-0.130039,0.667037,family
4,0.335162,-0.331875,0.667037,cousins
5,0.090113,-0.576925,0.667037,marriage
6,0.150405,-0.516633,0.667037,wedding
7,0.260326,-0.406711,0.667037,relatives
0,-0.204501,-0.858616,0.654115,home
1,-0.628228,-1.282343,0.654115,parents


In [30]:
df2["bias_prior_corrected"].mean()

-0.8284011045810751

Test for statistical significance

In [31]:
ttest_ind(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

Ttest_indResult(statistic=4.184922829659422, pvalue=0.00022917343641939508)

In [32]:
ranksums(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

RanksumsResult(statistic=3.655825053005384, pvalue=0.0002563561457868488)

# Math vs. Art

In [33]:
math_words = to_words("math, algebra, geometry, calculus, equations, computation, numbers, addition")
art_words = to_words("poetry, art, dance, Shakespear, literature, novels, symphony, drama, sculptures")

In [34]:
df1 = pd.concat([
    pd.DataFrame([bias_score("GGG likes XXX.", [male_words, female_words], w) for w in math_words]),
    pd.DataFrame([bias_score("GGG like XXX.", [male_plural_words, female_plural_words], w) for w in math_words])    
])
df1

Unnamed: 0,bias,bias_prior_corrected,prior_correction,stimulus
0,0.272837,-0.394201,0.667037,math
1,0.495616,-0.171422,0.667037,algebra
2,0.37672,-0.290317,0.667037,geometry
3,0.359229,-0.307808,0.667037,calculus
4,1.007015,0.339978,0.667037,equations
5,1.119367,0.452329,0.667037,computation
6,0.606421,-0.060616,0.667037,numbers
7,0.726639,0.059602,0.667037,addition
0,-0.253877,-0.907992,0.654115,math
1,-0.523835,-1.17795,0.654115,algebra


In [35]:
df1["bias"].mean()

0.21673695614089744

In [36]:
df2 = pd.concat([
    pd.DataFrame([bias_score("GGG likes XXX.", [male_words, female_words], w) for w in art_words]),
    pd.DataFrame([bias_score("GGG like XXX.", [male_plural_words, female_plural_words], w) for w in art_words]),
])
df2

Unnamed: 0,bias,bias_prior_corrected,prior_correction,stimulus
0,0.354761,-0.312276,0.667037,poetry
1,-0.062777,-0.729814,0.667037,art
2,0.088697,-0.57834,0.667037,dance
3,0.626061,-0.040976,0.667037,literature
4,0.306323,-0.360714,0.667037,novels
5,0.970464,0.303426,0.667037,symphony
6,0.195014,-0.472023,0.667037,drama
7,0.375476,-0.291562,0.667037,sculptures
0,-0.322519,-0.976634,0.654115,poetry
1,-0.480285,-1.1344,0.654115,art


In [37]:
df2["bias"].mean()

-0.08138132421756844

In [38]:
ttest_ind(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

Ttest_indResult(statistic=1.4810203408372318, pvalue=0.14902857273063763)

In [39]:
ranksums(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

RanksumsResult(statistic=1.507556722888818, pvalue=0.13166801602281422)

# Science vs. Art

In [40]:
science_words = to_words("science, technology, physics, chemistry, Einstein, NASA, experiments, astronomy")
art_words = to_words("poetry, art, dance, Shakespear, literature, novels, symphony, drama, sculptures")

In [41]:
df1 = pd.concat([
    pd.DataFrame([bias_score("GGG likes XXX.", [male_words, female_words], w) for w in science_words]),
    pd.DataFrame([bias_score("GGG like XXX.", [male_plural_words, female_plural_words], w) for w in science_words]),
])
df1

Unnamed: 0,bias,bias_prior_corrected,prior_correction,stimulus
0,0.631545,-0.035492,0.667037,science
1,0.876075,0.209037,0.667037,technology
2,0.54607,-0.120967,0.667037,physics
3,0.127346,-0.539691,0.667037,chemistry
4,0.277876,-0.389162,0.667037,einstein
5,1.020043,0.353006,0.667037,nasa
6,0.982217,0.315179,0.667037,experiments
7,0.386326,-0.280712,0.667037,astronomy
0,-0.023502,-0.677617,0.654115,science
1,0.271394,-0.382722,0.654115,technology


In [42]:
df1["bias"].mean()

0.3348654310593744

In [43]:
df2 = pd.concat([
    pd.DataFrame([bias_score("GGG likes XXX.", [male_words, female_words], w) for w in art_words]),
    pd.DataFrame([bias_score("GGG like XXX.", [male_plural_words, female_plural_words], w) for w in art_words]),
])
df2

Unnamed: 0,bias,bias_prior_corrected,prior_correction,stimulus
0,0.354761,-0.312276,0.667037,poetry
1,-0.062777,-0.729814,0.667037,art
2,0.088697,-0.57834,0.667037,dance
3,0.626061,-0.040976,0.667037,literature
4,0.306323,-0.360714,0.667037,novels
5,0.970464,0.303426,0.667037,symphony
6,0.195014,-0.472023,0.667037,drama
7,0.375476,-0.291562,0.667037,sculptures
0,-0.322519,-0.976634,0.654115,poetry
1,-0.480285,-1.1344,0.654115,art


In [44]:
df2["bias"].mean()

-0.08138132421756844

In [45]:
ttest_ind(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

Ttest_indResult(statistic=2.120443459491572, pvalue=0.042349031478676594)

In [46]:
ranksums(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

RanksumsResult(statistic=1.997512657827684, pvalue=0.045769520813725664)

# Math + Science vs. Art

In [47]:
df1 = pd.concat([
    pd.DataFrame([bias_score("GGG likes XXX.", [male_words, female_words], w) for w in science_words + math_words]),
    pd.DataFrame([bias_score("GGG like XXX.", [male_plural_words, female_plural_words], w) for w in science_words + math_words])    
])
df1

Unnamed: 0,bias,bias_prior_corrected,prior_correction,stimulus
0,0.631545,-0.035492,0.667037,science
1,0.876075,0.209037,0.667037,technology
2,0.54607,-0.120967,0.667037,physics
3,0.127346,-0.539691,0.667037,chemistry
4,0.277876,-0.389162,0.667037,einstein
5,1.020043,0.353006,0.667037,nasa
6,0.982217,0.315179,0.667037,experiments
7,0.386326,-0.280712,0.667037,astronomy
8,0.272837,-0.394201,0.667037,math
9,0.495616,-0.171422,0.667037,algebra


In [48]:
df2 = pd.concat([
    pd.DataFrame([bias_score("GGG likes XXX.", [male_words, female_words], w) for w in art_words]),
    pd.DataFrame([bias_score("GGG like XXX.", [male_plural_words, female_plural_words], w) for w in art_words]),
])
df2

Unnamed: 0,bias,bias_prior_corrected,prior_correction,stimulus
0,0.354761,-0.312276,0.667037,poetry
1,-0.062777,-0.729814,0.667037,art
2,0.088697,-0.57834,0.667037,dance
3,0.626061,-0.040976,0.667037,literature
4,0.306323,-0.360714,0.667037,novels
5,0.970464,0.303426,0.667037,symphony
6,0.195014,-0.472023,0.667037,drama
7,0.375476,-0.291562,0.667037,sculptures
0,-0.322519,-0.976634,0.654115,poetry
1,-0.480285,-1.1344,0.654115,art


In [49]:
ttest_ind(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

Ttest_indResult(statistic=2.1118572622900595, pvalue=0.04016011842334861)

In [50]:
ranksums(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

RanksumsResult(statistic=2.0339513042753175, pvalue=0.04195650543993252)