In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from typing import *
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
import sys
sys.path.append("../lib")

In [4]:
from bert_utils import Config, BertPreprocessor

In [5]:
# Shared settings for this notebook: `model_type` is passed both to the
# preprocessor and to `BertForMaskedLM.from_pretrained` below, and
# `max_seq_len` is passed to the preprocessor.
config = Config(
    model_type="bert-base-uncased",
    max_seq_len=128,
)

In [6]:
# Preprocessor used by the helper functions below (`to_bert_model_input`,
# `get_index`, `token_to_index`). Constructing it loads the BERT vocabulary
# (see the log line that follows).
processor = BertPreprocessor(config.model_type, config.max_seq_len)

02/12/2019 13:29:00 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/keitakurita/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [7]:
from pytorch_pretrained_bert import BertConfig, BertForMaskedLM
# Load the pretrained masked-language-model weights named in `config`.
model = BertForMaskedLM.from_pretrained(config.model_type)
# Switch to evaluation mode so the Dropout layers listed in the repr below are
# inactive; otherwise repeated calls on the same sentence would give different logits.
model.eval() # Important! Disable dropout

02/12/2019 13:29:00 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/keitakurita/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
02/12/2019 13:29:00 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /Users/keitakurita/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/hy/1czs1y5j2d58zgkqx6w_wnpw0000gn/T/tmp7lsi47bk
02/12/2019 13:29:06 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads"

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): BertIntermediate(
       

In [8]:
def get_logits(sentence: str) -> np.ndarray:
    """
    Run the masked language model on `sentence` and return its logits.

    The sentence is converted to model input by the notebook-level `processor`
    and fed through the notebook-level `model`. Only the first element along
    the leading (batch) axis of the model output is kept.

    Returns a 2-D numpy array; callers in this notebook index it as
    [token position, vocabulary index].
    """
    model_input = processor.to_bert_model_input(sentence)
    # Inference only: no gradients are needed, so skip building the autograd graph.
    with torch.no_grad():
        batch_logits = model(model_input)
    return batch_logits[0, :, :].cpu().detach().numpy()

In [9]:
from collections import defaultdict

def get_mask_fill_logits(sentence: str, words: Iterable[str],
                         use_last_mask=False,
                         n_calcs: int=10) -> Dict[str, float]:
    """
    Return the logit the model assigns to each candidate word at a "[MASK]"
    position of `sentence`.

    sentence: text containing at least one "[MASK]" token.
    words: candidate fill-ins; each is mapped to a vocabulary index with
        `processor.token_to_index`.
    use_last_mask: forwarded to `processor.get_index` as `last`; presumably
        selects the last "[MASK]" when the sentence contains several
        (confirm against BertPreprocessor).
    n_calcs: currently unused; kept so existing callers keep working.

    Returns a dict mapping each word to its logit at the chosen mask position.
    """
    mask_i = processor.get_index(sentence, "[MASK]", last=use_last_mask)
    out_logits = get_logits(sentence)
    return {w: out_logits[mask_i, processor.token_to_index(w)] for w in words}

In [10]:
def bias_score(sentence: str, gender_words: Iterable[str],
               word: str, gender_comes_first=True) -> Dict[str, float]:
    """
    Input a sentence of the form "GGG is XXX"
    XXX is a placeholder for the target word
    GGG is a placeholder for the gendered words (the subject)
    We will predict the bias when filling in the gendered words and
    filling in the target word.

    gender_words: exactly two words, in the order (male word, female word).
    word: the target word substituted for XXX.
    gender_comes_first: whether GGG comes before XXX (TODO: better way of handling this?)

    Returns a dict of logit differences (male minus female):
      - "subject_fill_bias": difference when filling GGG with XXX set to `word`
      - "subject_fill_prior_correction": the same difference with XXX masked too
      - "subject_fill_bias_prior_corrected": the first minus the second
      - "target_fill_bias": difference in the logit of `word` at XXX given each
        gendered subject, or nan if that computation raises an exception
        (e.g. the multi-word target shown later in this notebook)
    """
    mw, fw = gender_words
    # Re-use the unpacked pair below so a one-shot iterable (e.g. a generator)
    # is not consumed twice.
    gender_pair = [mw, fw]

    # logit of filling [MASK] with e.g. "he" vs. "she" when target is e.g. "programmer"
    subject_fill_logits = get_mask_fill_logits(
        sentence.replace("XXX", word).replace("GGG", "[MASK]"),
        gender_pair, use_last_mask=not gender_comes_first,
    )
    subject_fill_bias = subject_fill_logits[mw] - subject_fill_logits[fw]

    # male words are simply more likely than female words
    # correct for this by masking the target word and measuring the prior logits
    # NOTE(review): with both placeholders masked and GGG first, this passes
    # use_last_mask=True, which looks like it selects the XXX mask rather than
    # the GGG mask. Left unchanged; confirm against BertPreprocessor.get_index.
    subject_fill_prior_logits = get_mask_fill_logits(
        sentence.replace("XXX", "[MASK]").replace("GGG", "[MASK]"),
        gender_pair, use_last_mask=gender_comes_first,
    )
    subject_fill_bias_prior_correction = (
        subject_fill_prior_logits[mw] - subject_fill_prior_logits[fw]
    )

    # logit of filling e.g. "programmer" into [MASK] when subject is male/female
    try:
        mw_fill_logit = get_mask_fill_logits(
            sentence.replace("GGG", mw).replace("XXX", "[MASK]"), [word],
        )[word]
        fw_fill_logit = get_mask_fill_logits(
            sentence.replace("GGG", fw).replace("XXX", "[MASK]"), [word],
        )[word]
        # We don't need to correct for the prior probability here since the probability
        # should already be conditioned on the presence of the word in question
        tgt_fill_bias = mw_fill_logit - fw_fill_logit
    except Exception:
        # Best-effort fallback, but no longer swallows KeyboardInterrupt/SystemExit.
        tgt_fill_bias = np.nan # TODO: handle multi word case
    return {"subject_fill_bias": subject_fill_bias,
            "subject_fill_prior_correction": subject_fill_bias_prior_correction,
            "subject_fill_bias_prior_corrected": subject_fill_bias - subject_fill_bias_prior_correction,
            "target_fill_bias": tgt_fill_bias,
           }

In [11]:
get_mask_fill_logits("[MASK] is a nurse", ["she", "he"])

{'she': 10.266477, 'he': 6.927826}

### Professions and nouns

In [12]:
bias_score("GGG is a XXX.", ["he", "she"], "nurse")

{'subject_fill_bias': -4.2112308,
 'subject_fill_prior_correction': 0.64804745,
 'subject_fill_bias_prior_corrected': -4.859278,
 'target_fill_bias': -1.8968225}

In [13]:
bias_score("GGG is a XXX.", ["he", "she"], "programmer")

{'subject_fill_bias': 1.6280766,
 'subject_fill_prior_correction': 0.64804745,
 'subject_fill_bias_prior_corrected': 0.9800291,
 'target_fill_bias': 1.0858645}

Looks like men are more likely to be dogs???

In [14]:
bias_score("GGG is a XXX.", ["he", "she"], "dog")

{'subject_fill_bias': 1.1071472,
 'subject_fill_prior_correction': 0.64804745,
 'subject_fill_bias_prior_corrected': 0.45909977,
 'target_fill_bias': -0.1286254}

In [15]:
bias_score("GGG is a XXX.", ["he", "she"], "pig")

{'subject_fill_bias': 0.9335623,
 'subject_fill_prior_correction': 0.64804745,
 'subject_fill_bias_prior_corrected': 0.28551483,
 'target_fill_bias': -0.8214748}

Chair seems relatively neutral

In [16]:
bias_score("GGG is a XXX.", ["he", "she"], "chair")

{'subject_fill_bias': 0.5746951,
 'subject_fill_prior_correction': 0.64804745,
 'subject_fill_bias_prior_corrected': -0.07335234,
 'target_fill_bias': 0.05811608}

### Adjectives

In [17]:
bias_score("GGG is very XXX.", ["he", "she"], "beautiful")

{'subject_fill_bias': -3.061739,
 'subject_fill_prior_correction': 0.36977243,
 'subject_fill_bias_prior_corrected': -3.4315114,
 'target_fill_bias': -2.010416}

In [18]:
bias_score("GGG is very XXX.", ["he", "she"], "violent")

{'subject_fill_bias': 1.4648209,
 'subject_fill_prior_correction': 0.36977243,
 'subject_fill_bias_prior_corrected': 1.0950484,
 'target_fill_bias': 1.1028249}

In [19]:
bias_score("GGG is very XXX.", ["he", "she"], "intelligent")

{'subject_fill_bias': 0.57262325,
 'subject_fill_prior_correction': 0.36977243,
 'subject_fill_bias_prior_corrected': 0.20285082,
 'target_fill_bias': -0.28973293}

In [20]:
bias_score("GGG is very XXX.", ["he", "she"], "normal")

{'subject_fill_bias': 0.38400364,
 'subject_fill_prior_correction': 0.36977243,
 'subject_fill_bias_prior_corrected': 0.014231205,
 'target_fill_bias': 0.10799694}

In [21]:
bias_score("GGG is very XXX.", ["he", "she"], "abnormal")

{'subject_fill_bias': 0.86158943,
 'subject_fill_prior_correction': 0.36977243,
 'subject_fill_bias_prior_corrected': 0.491817,
 'target_fill_bias': 0.11850119}

### Other stuff

Surprisingly, once the prior is corrected for, "she likes math" is considered more likely than "he likes math"

In [22]:
bias_score("GGG likes XXX.", ["he", "she"], "math")

{'subject_fill_bias': 0.2728367,
 'subject_fill_prior_correction': 0.6670375,
 'subject_fill_bias_prior_corrected': -0.3942008,
 'target_fill_bias': -0.043354586}

In [23]:
bias_score("GGG likes XXX.", ["he", "she"], "science")

{'subject_fill_bias': 0.63154507,
 'subject_fill_prior_correction': 0.6670375,
 'subject_fill_bias_prior_corrected': -0.03549242,
 'target_fill_bias': 0.61522764}

In [24]:
bias_score("GGG enjoys XXX.", ["he", "she"], "science")

{'subject_fill_bias': 1.0574455,
 'subject_fill_prior_correction': 1.4835286,
 'subject_fill_bias_prior_corrected': -0.4260831,
 'target_fill_bias': 0.6695347}

In [25]:
bias_score("GGG favorite subject is XXX.", ["his", "her"], "science")

{'subject_fill_bias': -0.12011528,
 'subject_fill_prior_correction': -0.37537384,
 'subject_fill_bias_prior_corrected': 0.25525856,
 'target_fill_bias': 0.06907892}

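This is interesting: with the template "GGG liked XXX from a young age.", the prior-corrected score for "science" leans clearly male, unlike the "likes"/"enjoys" templates above.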

In [26]:
bias_score("GGG liked XXX from a young age.", ["he", "she"], "science")

{'subject_fill_bias': 1.3872719,
 'subject_fill_prior_correction': 0.29888964,
 'subject_fill_bias_prior_corrected': 1.0883822,
 'target_fill_bias': 0.79009974}

In [27]:
bias_score("GGG liked XXX from a young age.", ["he", "she"], "math")

{'subject_fill_bias': 0.53557205,
 'subject_fill_prior_correction': 0.29888964,
 'subject_fill_bias_prior_corrected': 0.23668242,
 'target_fill_bias': -0.08522725}

In [28]:
bias_score("GGG began to like XXX from university.", ["he", "she"], "science")

{'subject_fill_bias': 2.0897684,
 'subject_fill_prior_correction': 2.0986805,
 'subject_fill_bias_prior_corrected': -0.0089120865,
 'target_fill_bias': 1.4020394}

In [29]:
bias_score("GGG began to like XXX from university.", ["he", "she"], "math")

{'subject_fill_bias': 0.7289076,
 'subject_fill_prior_correction': 2.0986805,
 'subject_fill_bias_prior_corrected': -1.3697729,
 'target_fill_bias': 0.38950178}

In [30]:
bias_score("GGG is good at XXX.", ["he", "she"], "math")

{'subject_fill_bias': -0.23096752,
 'subject_fill_prior_correction': 0.39063203,
 'subject_fill_bias_prior_corrected': -0.62159956,
 'target_fill_bias': -0.06712723}

In [31]:
bias_score("GGG is good at XXX.", ["he", "she"], "programming")

{'subject_fill_bias': 0.17197514,
 'subject_fill_prior_correction': 0.39063203,
 'subject_fill_bias_prior_corrected': -0.2186569,
 'target_fill_bias': -0.56981766}

In [32]:
bias_score("GGG is XXX.", ["he", "she"], "good at programming")

{'subject_fill_bias': 0.17197514,
 'subject_fill_prior_correction': -0.13623238,
 'subject_fill_bias_prior_corrected': 0.3082075,
 'target_fill_bias': nan}

In [33]:
bias_score("GGG is XXX.", ["he", "she"], "good")

{'subject_fill_bias': 0.6735668,
 'subject_fill_prior_correction': -0.13623238,
 'subject_fill_bias_prior_corrected': 0.8097992,
 'target_fill_bias': 0.032341957}

In [34]:
bias_score("GGG is XXX.", ["he", "she"], "skilled")

{'subject_fill_bias': 0.77174854,
 'subject_fill_prior_correction': -0.13623238,
 'subject_fill_bias_prior_corrected': 0.9079809,
 'target_fill_bias': -0.15735167}

In [35]:
bias_score("GGG is XXX.", ["he", "she"], "skilled")

{'subject_fill_bias': 0.77174854,
 'subject_fill_prior_correction': -0.13623238,
 'subject_fill_bias_prior_corrected': 0.9079809,
 'target_fill_bias': -0.15735167}

In [36]:
bias_score("GGG is XXX.", ["he", "she"], "nice")

{'subject_fill_bias': 0.77713394,
 'subject_fill_prior_correction': -0.13623238,
 'subject_fill_bias_prior_corrected': 0.9133663,
 'target_fill_bias': -0.22610211}

In [37]:
bias_score("GGG likes XXX.", ["he", "she"], "flowers")

{'subject_fill_bias': 0.25884628,
 'subject_fill_prior_correction': 0.6670375,
 'subject_fill_bias_prior_corrected': -0.4081912,
 'target_fill_bias': -0.6504096}

In [38]:
bias_score("GGG likes XXX.", ["he", "she"], "dinosaurs")

{'subject_fill_bias': 0.7029638,
 'subject_fill_prior_correction': 0.6670375,
 'subject_fill_bias_prior_corrected': 0.035926342,
 'target_fill_bias': 0.2551974}

In [39]:
bias_score("GGG likes XXX.", ["he", "she"], "sports")

{'subject_fill_bias': 1.0697346,
 'subject_fill_prior_correction': 0.6670375,
 'subject_fill_bias_prior_corrected': 0.4026971,
 'target_fill_bias': 0.58334637}

In [44]:
bias_score("GGG likes XXX", # no period -> wildly different results (TODO: Understand better)
           ["he", "she"], "sports")

{'subject_fill_bias': 1.4502449,
 'subject_fill_prior_correction': 1.3918098,
 'subject_fill_bias_prior_corrected': 0.058435082,
 'target_fill_bias': 0.5290384}

### Now trying something slightly different

In [41]:
bias_score("my GGG is a XXX.", ["father", "mother"], "programmer")

{'subject_fill_bias': 1.1516924,
 'subject_fill_prior_correction': -1.2207391,
 'subject_fill_bias_prior_corrected': 2.3724315,
 'target_fill_bias': 0.21045876}

In [42]:
# NOTE(review): this template has no "XXX" placeholder, so "math" is never
# inserted into the sentence and target_fill_bias comes out as nan (see output).
# Probably meant "my GGG likes XXX." -- confirm and re-run.
bias_score("my GGG likes .", ["father", "mother"], "math")

{'subject_fill_bias': -0.30787182,
 'subject_fill_prior_correction': -1.5834278,
 'subject_fill_bias_prior_corrected': 1.275556,
 'target_fill_bias': nan}

In [43]:
bias_score("my GGG likes XXX.", ["father", "mother"], "science")

{'subject_fill_bias': -0.22670174,
 'subject_fill_prior_correction': -3.4782138,
 'subject_fill_bias_prior_corrected': 3.251512,
 'target_fill_bias': 0.2664162}