In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from typing import *
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import sys
sys.path.append("../lib")

In [4]:
from bert_utils import Config, BertPreprocessor

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [5]:
config = Config(
    model_type="bert-base-uncased",
    max_seq_len=128,
    group=True,
)

In [6]:
processor = BertPreprocessor(config.model_type, config.max_seq_len)

In [7]:
from pytorch_pretrained_bert import BertConfig, BertForMaskedLM
model = BertForMaskedLM.from_pretrained(config.model_type)
model.eval() # Important! Disable dropout

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): BertIntermediate(
       

In [8]:
def get_logits(sentence: str) -> np.ndarray:
    return model(processor.to_bert_model_input(sentence))[0, :, :].cpu().detach().numpy()

In [9]:
def softmax(arr, axis=1):
    e = np.exp(arr)
    return e / e.sum(axis=axis, keepdims=True)

In [10]:
from collections import defaultdict

def get_mask_fill_logits(sentence: str, words: Iterable[str],
                         use_last_mask=False, apply_softmax=True) -> Dict[str, float]:
    mask_i = processor.get_index(sentence, "[MASK]", last=use_last_mask, accept_wordpiece=True)
    logits = defaultdict(list)
    out_logits = get_logits(sentence)
    if apply_softmax: 
        out_logits = softmax(out_logits)
    return {w: out_logits[mask_i, processor.token_to_index(w, accept_wordpiece=True)] for w in words}

In [11]:
def likelihood_score(
    sentence: str, target: str, word: str, gender_comes_first=True) -> Dict[str, float]:
    """
    Input a sentence of the form "GGG is XXX"
    XXX is a placeholder for the target word
    GGG is a placeholder for the gendered words (the subject)
    We will predict the bias when filling in the gendered words and 
    filling in the target word.
    
    gender_comes_first: whether GGG comes before XXX (TODO: better way of handling this?)
    """
    # probability of filling [MASK] with "he" vs. "she" when target is "programmer"
    subject_fill_logits = get_mask_fill_logits(
        sentence.replace("XXX", word).replace("GGG", "[MASK]"), 
        [target], use_last_mask=not gender_comes_first,
    )
    subject_fill_bias = np.log(subject_fill_logits[target])
    # male words are simply more likely than female words
    # correct for this by masking the target word and measuring the prior probabilities
    subject_fill_prior_logits = get_mask_fill_logits(
        sentence.replace("XXX", "[MASK]").replace("GGG", "[MASK]"), 
        [target], use_last_mask=gender_comes_first,
    )
    subject_fill_bias_prior_correction = np.log(subject_fill_prior_logits[target])
    
    return {
            "target": target,
            "stimulus": word,
            "bias": subject_fill_bias,
            "prior_correction": subject_fill_bias_prior_correction,
            "bias_prior_corrected": subject_fill_bias - subject_fill_bias_prior_correction,
           }

In [12]:
get_mask_fill_logits("the [MASK] is beautiful", ["flower", "bug"])

{'flower': 0.0007418045, 'bug': 1.07483065e-05}

In [13]:
def get_word_vector(sentence: str, word: str):
    idx = processor.get_index(sentence, word, accept_wordpiece=True)
    outputs = None
    with torch.no_grad():
        sequence_output, _ = model.bert(processor.to_bert_model_input(sentence),
                                        output_all_encoded_layers=False)
        sequence_output.squeeze_(0)
    return sequence_output.detach().cpu().numpy()[idx]

In [14]:
glove_vectors = {}
with open("../data/glove.840B.300d.txt") as f:
    for line in f.readlines():
        try:
            word, *nums = line.split(" ")
            glove_vectors[word] = np.array([float(x) for x in nums])
        except ValueError: pass

In [15]:
def get_glove_vector(word):
    return glove_vectors[word]

In [16]:
def cosine_similarity(x, y):
    return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

In [17]:
def exact_mc_perm_test(xs, ys, nmc=100000):
    n, k = len(xs), 0
    diff = np.abs(np.mean(xs) - np.mean(ys))
    zs = np.concatenate([xs, ys])
    for j in range(nmc):
        np.random.shuffle(zs)
        k += diff < np.abs(np.mean(zs[:n]) - np.mean(zs[n:]))
    return k / nmc

In [18]:
def get_effect_size(df1, df2, k="bias_prior_corrected"):
    diff = (df1[k].mean() - df2[k].mean())
    std_ = pd.concat([df1, df2], axis=0)[k].std() + 1e-8
    return diff / std_

In [19]:
get_word_vector("the flower is beautiful", "flower")

array([ 1.06368981e-01,  2.22109944e-01, -7.18900710e-02,  1.70865208e-01,
        3.62271130e-01, -1.20856173e-01, -6.16569072e-02,  7.87993014e-01,
        3.73470277e-01, -2.52068788e-02,  2.59380415e-02, -1.49484515e+00,
       -1.80714339e-01,  1.11428046e+00, -1.03053045e+00,  8.63718629e-01,
        1.34956554e-01,  1.07128906e+00,  1.33132488e-01,  8.86740983e-01,
       -1.61312252e-01, -5.93275666e-01,  7.23842531e-02, -5.57698727e-01,
        1.10267997e+00, -5.62288702e-01, -2.21851468e-01,  8.34756434e-01,
        4.88106251e-01, -9.58737545e-03,  2.16243431e-01,  1.58447415e-01,
        5.18487573e-01,  2.14277059e-01,  1.41341776e-01, -2.22598836e-01,
       -1.90466464e-01,  3.14576089e-01, -4.24953640e-01, -3.25795412e-01,
       -7.41146207e-01, -1.04409921e+00,  6.84069693e-01,  4.49090391e-01,
        2.56366640e-01, -7.32405841e-01, -9.85546291e-01,  3.55273962e-01,
        9.63769078e-01,  3.46497685e-01,  5.14260270e-02,  5.61741829e-01,
        2.53276557e-01, -

In [20]:
rev_vocab = {v:k for k, v in processor.full_vocab.items()}

In [21]:
from scipy.stats import ttest_ind, ranksums

In [22]:
from mlxtend.evaluate import permutation_test

# Flowers vs. Insects

All borrowed from WEAT

In [23]:
def to_words(wlist, filter_oov=True):
    return [w.strip() for w in wlist.replace("\n", " ").split(", ") if w.strip() in rev_vocab or not filter_oov]

Words not in vocab are removed and target words are converted to adjectives when applicable and removed otherwise

In [24]:
flower_words = to_words("""aster, clover, hyacinth, marigold, poppy, azalea, crocus, iris, orchid, rose, bluebell, daffodil, lilac, pansy, tulip, buttercup, daisy, lily, peony, violet, carnation, gladiola,
magnolia, petunia, zinnia""")
insect_words = to_words("""ant, caterpillar, flea, locust, spider, bedbug, centipede, fly, maggot, tarantula,
bee, cockroach, gnat, mosquito, termite, beetle, cricket, hornet, moth, wasp, blackfly,
dragonfly, horsefly, roach, weevil""")[:len(flower_words)]
# flower_words = ["flowers"]
# insect_words = ["bugs"]
pleasant_words = to_words("""caress, freedom, health, love, peace, cheer, friend, heaven, loyal, pleasure, diamond, gentle, honest, lucky, rainbow, diploma, gift, honor, miracle, sunrise, family,
happy, laughter, paradise, vacation""", filter_oov=False)
unpleasant_words = to_words("""abuse, crash, filth, murder, sickness, accident, death, grief, poison, stink,
assault, disaster, hatred, pollute, tragedy, divorce, jail, poverty, ugly, cancer, kill, rotten,
vomit, agony, prison""", filter_oov=False)

In [25]:
flower_words

['clover',
 'poppy',
 'iris',
 'orchid',
 'rose',
 'daisy',
 'lily',
 'violet',
 'magnolia']

In [26]:
likelihood_score("GGG are XXX.", flower_words[0], "beautiful")

{'target': 'clover',
 'stimulus': 'beautiful',
 'bias': -18.59528,
 'prior_correction': -21.155012,
 'bias_prior_corrected': 2.5597324}

In [27]:
likelihood_score("GGG are XXX.", insect_words[0], "beautiful")

{'target': 'ant',
 'stimulus': 'beautiful',
 'bias': -18.65312,
 'prior_correction': -17.560097,
 'bias_prior_corrected': -1.0930233}

In [28]:
from itertools import product

In [29]:
def get_bias_scores(targets, A, B, sentences, group=config.group):
    df1 = pd.concat([
        pd.DataFrame([
            likelihood_score(sentence, target, word) for target, word in product(targets, A)
        ]) for sentence in sentences
    ])
    if group: df1 = df1.groupby("target").mean()["bias_prior_corrected"].reset_index()
    
    df2 = pd.concat([
        pd.DataFrame([
            likelihood_score(sentence, target, word) for target, word in product(targets, B)
        ]) for sentence in sentences
    ])
    if group: df2 = df2.groupby("target").mean()["bias_prior_corrected"].reset_index()
    
    df = df1.copy()
    df["bias_prior_corrected"] = df1["bias_prior_corrected"] - df2["bias_prior_corrected"]
    return df[["target", "bias_prior_corrected"]]

In [30]:
df1 = get_bias_scores(flower_words, pleasant_words, unpleasant_words, ["the GGG is XXX",
                                                                       "GGG are XXX"])

In [31]:
df1

Unnamed: 0,target,bias_prior_corrected
0,clover,-0.336581
1,daisy,0.024941
2,iris,-0.134799
3,lily,0.31831
4,magnolia,-0.291729
5,orchid,0.067988
6,poppy,-0.281059
7,rose,0.156728
8,violet,-0.266575


In [32]:
df2 = get_bias_scores(insect_words, pleasant_words, unpleasant_words, ["the GGG is XXX",
                                                                       "GGG are XXX"])

In [33]:
df2

Unnamed: 0,target,bias_prior_corrected
0,ant,-0.497984
1,bee,0.361684
2,beetle,-1.212803
3,cricket,-0.190748
4,flea,-1.278719
5,fly,-0.718072
6,hornet,-0.78207
7,mosquito,-1.24104
8,spider,-0.746276


Statistical test (is the t-test appropriate here?)

In [34]:
get_effect_size(df1, df2)

1.2037209881253004

In [35]:
ttest_ind(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

Ttest_indResult(statistic=3.155125309928693, pvalue=0.006128925343537042)

In [36]:
ranksums(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

RanksumsResult(statistic=2.4283093212859135, pvalue=0.015169399414045479)

In [37]:
exact_mc_perm_test(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

0.00749

### WEAT

In [38]:
def get_word_bias_scores(targets, A, B, sentences, group=config.group):
    wvs_targets = [
        (t, get_word_vector(sentence.replace("GGG", t).replace("XXX", "[MASK]"), t) )
        for sentence in sentences
        for t in targets
    ]
    wvs_A = [
        get_word_vector(sentence.replace("GGG", "[MASK]").replace("XXX", a), a) 
        for sentence in sentences
        for a in A
    ]
    wvs_B = [
        get_word_vector(sentence.replace("GGG", "[MASK]").replace("XXX", b), b) 
        for sentence in sentences
        for b in B
    ]
    df1 = pd.DataFrame([
        {"target": t, "score": cosine_similarity(wv, wva)}
        for wva in wvs_A
        for t, wv in wvs_targets
    ])
    if group: df1 = df1.groupby("target").mean()["score"].reset_index()
    df2 = pd.DataFrame([
        {"target": t, "score": cosine_similarity(wv, wvb)}
        for wvb in wvs_B
        for t, wv in wvs_targets
    ])
    if group: df2 = df2.groupby("target").mean()["score"].reset_index()
    df = df1.copy()
    df["bias_prior_corrected"] = df1["score"] - df2["score"]
    return df[["target", "bias_prior_corrected"]]

In [39]:
def get_glove_bias_scores(targets, A, B, sentences, group=config.group):
    wvs_targets = [
        (t, get_glove_vector(t))
        for t in targets
    ]
    wvs_A = [
        get_glove_vector(a) 
        for a in A
    ]
    wvs_B = [
        get_glove_vector(b) 
        for b in B
    ]
    df1 = pd.DataFrame([
        {"target": t, "score": cosine_similarity(wv, wva)}
        for wva in wvs_A
        for t, wv in wvs_targets
    ])
    if group: df1 = df1.groupby("target").mean()["score"].reset_index()
    df2 = pd.DataFrame([
        {"target": t, "score": cosine_similarity(wv, wvb)}
        for wvb in wvs_B
        for t, wv in wvs_targets
    ])
    if group: df2 = df2.groupby("target").mean()["score"].reset_index()
    df = df1.copy()
    df["bias_prior_corrected"] = df1["score"] - df2["score"]
    return df[["target", "bias_prior_corrected"]]

In [40]:
df1 = get_word_bias_scores(flower_words, pleasant_words, 
                           unpleasant_words, ["GGG are XXX", "the GGG is XXX"], group=config.group)

In [41]:
df2 = get_word_bias_scores(insect_words, pleasant_words, 
                           unpleasant_words, ["GGG are XXX", "the GGG is XXX"], group=config.group)

Statistical Tests

In [42]:
get_effect_size(df1, df2)

0.6928185128092582

In [43]:
ttest_ind(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

Ttest_indResult(statistic=1.52605067193457, pvalue=0.14652092608269146)

In [44]:
ranksums(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

RanksumsResult(statistic=1.5452877499092177, pvalue=0.12227667721471389)

In [45]:
exact_mc_perm_test(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

0.14386

### GloVe WEAT

In [46]:
df1 = get_glove_bias_scores(flower_words, pleasant_words, 
                           unpleasant_words, ["GGG are XXX", "the GGG is XXX"], group=config.group)

In [47]:
df2 = get_glove_bias_scores(insect_words, pleasant_words, 
                           unpleasant_words, ["GGG are XXX", "the GGG is XXX"], group=config.group)

Statistical Tests

In [48]:
get_effect_size(df1, df2)

1.5433197464264068

In [49]:
ttest_ind(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

Ttest_indResult(statistic=5.224953822681006, pvalue=8.338096426571345e-05)

In [50]:
ranksums(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

RanksumsResult(statistic=3.487935206937949, pvalue=0.0004867660221606486)

In [51]:
exact_mc_perm_test(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

0.0

# Career vs Family

In [52]:
male_words = to_words("John, Paul, Mike, Kevin, Steve, Greg, Jeff, Bill".lower())
female_words = to_words("Amy, Joan, Lisa, Sarah, Diana, Kate, Ann, Donna".lower())
career_words = to_words("executive, management, professional, corporation, salary, office, business, career")
family_words = to_words("home, parents, children, family, cousins, marriage, wedding, relatives")

In [53]:
len(male_words) == len(female_words)

True

In [54]:
df1 = get_bias_scores(male_words, career_words, family_words, 
                      ["GGG likes XXX", "GGG is interested in XXX"])

In [55]:
df2 = get_bias_scores(female_words, career_words, family_words, 
                      ["GGG likes XXX", "GGG is interested in XXX"])

Test for statistical significance

In [56]:
get_effect_size(df1, df2)

1.3207784092314527

In [57]:
ttest_ind(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

Ttest_indResult(statistic=3.4896177530474892, pvalue=0.003608911862548736)

In [58]:
ranksums(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

RanksumsResult(statistic=2.100420126042015, pvalue=0.03569190011680441)

In [59]:
exact_mc_perm_test(df1["bias_prior_corrected"], df2["bias_prior_corrected"], )

0.0064

### WEAT

In [60]:
df1 = get_word_bias_scores(male_words, career_words, family_words, 
                      ["GGG likes XXX", "GGG like XXX", "GGG is interested in XXX"])

df2 = get_word_bias_scores(female_words, career_words, family_words, 
                      ["GGG likes XXX", "GGG like XXX", "GGG is interested in XXX"])

In [61]:
get_effect_size(df1, df2)

0.5047117823050309

In [62]:
ttest_ind(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

Ttest_indResult(statistic=1.0101080392589703, pvalue=0.32958370951776483)

In [63]:
ranksums(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

RanksumsResult(statistic=1.4702940882294102, pvalue=0.14148212148279338)

In [64]:
exact_mc_perm_test(df1["bias_prior_corrected"], df2["bias_prior_corrected"], )

0.32593

### Glove WEAT

In [65]:
male_words = [w for w in to_words("John, Paul, Mike, Kevin, Steve, Greg, Jeff, Bill", filter_oov=False) if w.lower() in male_words]
female_words = [w for w in to_words("Amy, Joan, Lisa, Sarah, Diana, Kate, Ann, Donna", filter_oov=False) if w.lower() in female_words]

In [66]:
df1 = get_glove_bias_scores(male_words, career_words, family_words, 
                      ["GGG likes XXX", "GGG like XXX", "GGG is interested in XXX"])

df2 = get_glove_bias_scores(female_words, career_words, family_words, 
                      ["GGG likes XXX", "GGG like XXX", "GGG is interested in XXX"])

In [67]:
get_effect_size(df1, df2)

1.8139144159248477

In [68]:
ttest_ind(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

Ttest_indResult(statistic=10.010070098265702, pvalue=9.22564299365985e-08)

In [69]:
ranksums(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

RanksumsResult(statistic=3.3606722016672235, pvalue=0.0007775304469403846)

In [70]:
exact_mc_perm_test(df1["bias_prior_corrected"], df2["bias_prior_corrected"], )

0.0

# Math vs. Art

In [71]:
male_words = to_words("male, man, boy, brother, son, he, his, him")
female_words = to_words("female, woman, girl, sister, daughter, she, her, hers")

In [72]:
len(male_words) == len(female_words)

True

In [73]:
math_words = to_words("math, algebra, geometry, calculus, equations, computation, numbers, addition")
art_words = to_words("poetry, art, dance, Shakespear, literature, novels, symphony, drama, sculptures".lower())

In [74]:
len(math_words) == len(art_words)

True

In [75]:
# sentences = ["GGG likes XXX", 
#              "GGG like XXX",
#              "GGG is interested in XXX"]

In [76]:
sentences = ["XXX likes GGG", 
             "XXX like GGG",
             "XXX is interested in GGG"]

In [77]:
df1 = pd.concat([get_bias_scores(math_words, male_words, female_words, 
                sentences),
#                  get_bias_scores(["he"], math_words, art_words, 
#                       ["GGG likes XXX", "GGG is interested in XXX"]),
#                  get_bias_scores(["his"], math_words, art_words, 
#                       ["GGG interest is in XXX"]),
                ]
               )

df2 = pd.concat([get_bias_scores(art_words, male_words, female_words, 
                 sentences),
#                  get_bias_scores(["she"], math_words, art_words, 
#                       ["GGG likes XXX", "GGG is interested in XXX"]),
#                  get_bias_scores(["her"], math_words, art_words, 
#                       ["GGG interest is in XXX"]),
                ]
               )

In [78]:
df1

Unnamed: 0,target,bias_prior_corrected
0,addition,-0.209766
1,algebra,-0.000297
2,calculus,0.012278
3,computation,-0.467537
4,equations,-0.259801
5,geometry,-0.169219
6,math,-0.257134
7,numbers,-0.21784


In [79]:
get_effect_size(df1, df2)

-0.048069718002426015

In [80]:
ttest_ind(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

Ttest_indResult(statistic=-0.09290815463621517, pvalue=0.9272929875235238)

In [81]:
ranksums(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

RanksumsResult(statistic=0.42008402520840293, pvalue=0.6744240722352938)

In [82]:
exact_mc_perm_test(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

0.93612

### WEAT

In [83]:
df1 = pd.concat([get_word_bias_scores(math_words, male_words, female_words, 
                sentences),
#                  get_bias_scores(["he"], math_words, art_words, 
#                       ["GGG likes XXX", "GGG is interested in XXX"]),
#                  get_bias_scores(["his"], math_words, art_words, 
#                       ["GGG interest is in XXX"]),
                ]
               )

df2 = pd.concat([get_word_bias_scores(art_words, male_words, female_words, 
                 sentences),
#                  get_bias_scores(["she"], math_words, art_words, 
#                       ["GGG likes XXX", "GGG is interested in XXX"]),
#                  get_bias_scores(["her"], math_words, art_words, 
#                       ["GGG interest is in XXX"]),
                ]
               )

In [84]:
get_effect_size(df1, df2)

0.6755147300797202

In [85]:
ttest_ind(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

Ttest_indResult(statistic=1.392704398087732, pvalue=0.185433131612376)

In [86]:
ranksums(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

RanksumsResult(statistic=0.8401680504168059, pvalue=0.40081416938293446)

In [87]:
exact_mc_perm_test(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

0.19006

### GloVe WEAT

In [88]:
df1 = get_glove_bias_scores(math_words, male_words, female_words, sentences)
df2 = get_glove_bias_scores(art_words, male_words, female_words, sentences)

In [89]:
df1

Unnamed: 0,target,bias_prior_corrected
0,addition,-0.010817
1,algebra,0.003242
2,calculus,0.031652
3,computation,0.016247
4,equations,0.003074
5,geometry,0.001272
6,math,0.003159
7,numbers,0.035001


In [90]:
df2

Unnamed: 0,target,bias_prior_corrected
0,art,0.005488
1,dance,-0.052323
2,drama,-0.016766
3,literature,-0.011785
4,novels,-0.033697
5,poetry,-0.026572
6,sculptures,-0.001336
7,symphony,0.022459


In [91]:
get_effect_size(df1, df2)

1.060843415494596

In [92]:
ttest_ind(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

Ttest_indResult(statistic=2.4500958627234066, pvalue=0.02803865682121175)

In [93]:
ranksums(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

RanksumsResult(statistic=2.100420126042015, pvalue=0.03569190011680441)

In [94]:
exact_mc_perm_test(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

0.02928

# Science vs. Art

In [180]:
male_words = to_words('brother, father, uncle, grandfather, son, he, his, him')
female_words = to_words('sister, mother, aunt, grandmother, daughter, she, hers, her')

In [181]:
science_words = to_words("science, technology, physics, chemistry, Einstein, NASA, experiments, astronomy".lower())
art_words = to_words("poetry, art, Shakespeare, dance, literature, novel, symphony, drama".lower())

In [182]:
len(science_words) == len(art_words)

True

In [183]:
df1 = get_bias_scores(science_words, male_words, female_words, sentences)
df2 = get_bias_scores(art_words, male_words, female_words, sentences)

In [184]:
get_effect_size(df1, df2)

0.7684757992531862

In [185]:
ttest_ind(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

Ttest_indResult(statistic=1.6176660054033176, pvalue=0.12803619837668326)

In [186]:
ranksums(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

RanksumsResult(statistic=1.575315094531511, pvalue=0.11518373151505122)

In [187]:
exact_mc_perm_test(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

0.12731

### WEAT

In [149]:
df1 = get_word_bias_scores(science_words, male_words, female_words, sentences)
df2 = get_word_bias_scores(art_words, male_words, female_words, sentences)

In [150]:
get_effect_size(df1, df2)

0.572545976546155

In [151]:
ttest_ind(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

Ttest_indResult(statistic=1.1580382899060395, pvalue=0.26622289226962775)

In [152]:
ranksums(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

RanksumsResult(statistic=1.0502100630210074, pvalue=0.2936215439289058)

In [153]:
exact_mc_perm_test(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

0.2913

### GloVe WEAT

In [188]:
science_words = [w for w in to_words("science, technology, physics, chemistry, Einstein, NASA, experiments, astronomy", filter_oov=False)
                if w.lower() in science_words]
art_words = [w for w in to_words("poetry, art, Shakespeare, dance, literature, novel, symphony, drama", filter_oov=False)
            if w.lower() in art_words]

In [197]:
male_words

['brother', 'father', 'uncle', 'grandfather', 'son', 'he', 'his', 'him']

In [189]:
df1 = get_glove_bias_scores(science_words, male_words, female_words, sentences)
df2 = get_glove_bias_scores(art_words, male_words, female_words, sentences)

In [190]:
get_effect_size(df1, df2)

1.2464122558303927

In [191]:
ttest_ind(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

Ttest_indResult(statistic=3.2255688345728486, pvalue=0.006102183398522537)

In [192]:
ranksums(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

RanksumsResult(statistic=2.520504151250418, pvalue=0.011718685599768608)

In [193]:
exact_mc_perm_test(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

0.00746

# African American and Pleasantness

In [114]:
aa_words = to_words("""Darnell, Hakim, Jermaine, Kareem, Jamal, Leroy, Rasheed,
Tremayne, Tyrone, Aisha, Ebony, Keisha, Kenya, Latonya, Lakisha, Latoya, Tamika,
Tanisha""".lower())
eu_words = to_words("""Brad, Brendan, Geoffrey, Greg, Brett, Jay, Matthew, Neil, Todd, Allison, Anne, Carrie, 
Emily, Jill, Laurie, Kristen, Meredith, Sarah""".lower())[:len(aa_words)]

In [115]:
df1 = get_bias_scores(aa_words, pleasant_words, unpleasant_words, ["GGG is XXX.", "GGG are XXX."])
df2 = get_bias_scores(eu_words, pleasant_words, unpleasant_words, ["GGG is XXX", "GGG are XXX."])

In [116]:
df1

Unnamed: 0,target,bias_prior_corrected
0,ebony,-0.520532
1,jamal,-0.229488
2,kenya,0.335659
3,leroy,-0.218528
4,tyrone,-0.271441


In [117]:
df2

Unnamed: 0,target,bias_prior_corrected
0,brad,-0.125455
1,brendan,0.597332
2,brett,-0.126301
3,geoffrey,0.437044
4,greg,0.168149


In [118]:
get_effect_size(df1, df2)

-1.0311045387669684

In [119]:
exact_mc_perm_test(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

0.10399

### WEAT

In [120]:
df1 = get_word_bias_scores(aa_words, pleasant_words, unpleasant_words, ["GGG is XXX.", "GGG are XXX."])
df2 = get_word_bias_scores(eu_words, pleasant_words, unpleasant_words, ["GGG is XXX.", "GGG are XXX."])

In [121]:
get_effect_size(df1, df2)

-1.012015296165567

In [122]:
exact_mc_perm_test(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

0.14422

### GloVe WEAT

In [123]:
aa_words = [w for w in to_words("""Darnell, Hakim, Jermaine, Kareem, Jamal, Leroy, Rasheed,
Tremayne, Tyrone, Aisha, Ebony, Keisha, Kenya, Latonya, Lakisha, Latoya, Tamika,
Tanisha""", filter_oov=False) if w.lower() in aa_words]
eu_words = [w for w in to_words("""Brad, Brendan, Geoffrey, Greg, Brett, Jay, Matthew, Neil, Todd, Allison, Anne, Carrie, 
Emily, Jill, Laurie, Kristen, Meredith, Sarah""", filter_oov=False) if w.lower() in eu_words]

In [124]:
df1 = get_glove_bias_scores(aa_words, pleasant_words, unpleasant_words, ["GGG is XXX.", "GGG are XXX."])
df2 = get_glove_bias_scores(eu_words, pleasant_words, unpleasant_words, ["GGG is XXX.", "GGG are XXX."])

In [125]:
get_effect_size(df1, df2)

-1.0030052864455994

In [126]:
exact_mc_perm_test(df1["bias_prior_corrected"], df2["bias_prior_corrected"])

0.12804