# Fairness evaluation of `bert-base-uncased`
This notebook will call all fairness metrics in the Biased Rulers package. First we start with some preliminary imports...

In [1]:
import os
os.chdir("../")
from biased_rulers.metrics import seat, lpbs, disco
import numpy as np
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


## Define and download model

In [2]:

model_type = "bert-base-multilingual-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type)
print(f"Loaded {model_type}")

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loaded bert-base-multilingual-uncased


## WEAT-based tests
In this section, we run our experiments for the WEAT-based metrics. Our Biased Rulers package supports SEAT (May et al., 2019) and two variants by Lauscher et al. (2021) and Tan et al. (2019).

In [5]:

attribute_template = "This is the _."
target_template = "This is the _."

results = seat.seat_test(attribute_template, target_template, tokenizer, model)
score = np.fromiter(results.values(), dtype=float).mean()
print(score)

0.43653131324269195


In [6]:
results = seat.lauscher_et_al_test(attribute_template, target_template, tokenizer, model)
score = np.fromiter(results.values(), dtype=float).mean()
print(score)

0.4501284455471183


In [45]:
results = seat.tan_et_al_test(attribute_template, target_template, tokenizer, model)
score = np.fromiter(results.values(), dtype=float).mean()
print(score)

0.8573153343423693


## Log probability bias score
In this section, we run the LPBS experiments. This requires a slightly different model setup, so we just initialize it again.

In [3]:
model = AutoModelForMaskedLM.from_pretrained(model_type) # SEAT and other methods expect a different model

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
results = lpbs.lpbs_test("","", tokenizer, model)
print(results)

(0.4847451760247023, 0.809558354722996)


## CrowS-Pairs test
Finally, we test an extrinsic measure.

In [9]:
from biased_rulers.metrics import crowspairs

In [10]:
crows_score = crowspairs.evaluate(tokenizer, model)

100%|██████████| 1508/1508 [08:26<00:00,  2.98it/s]

Total examples: 1508
Metric score: 55.31
Stereotype score: 55.83
Anti-stereotype score: 53.21
Num. neutral: 4 0.27






In [204]:
from biased_rulers.data.nouns import load_data
from transformers import AutoTokenizer, BertTokenizer, BertModel, pipeline
import torch
from collections import Counter
from scipy.stats import chi2_contingency, chisquare

def disco_test(tokenizer: BertTokenizer, model: BertModel):
    """
    DisCo test.

    https://arxiv.org/pdf/2010.06032.pdf
    """

    nouns = load_data()
    templates = ["PERSON is BLANK."]

    pipe = pipeline("fill-mask", model=model, tokenizer=tokenizer)

    # TODO: figure out if the double nouns matter
    # TODO: find out if extra data matters
    for template in templates:
        template = template.replace("BLANK", tokenizer.mask_token)
        for noun in nouns.iterrows():
            x_tokens, y_tokens = [], []
            for x in pipe(template.replace("PERSON", noun[1][0]), top_k=3):
                x_tokens.append(x['token_str'])
            for x in pipe(template.replace("PERSON", noun[1][1]), top_k=3):
                y_tokens.append(x['token_str'])
        #print( Counter(x_tokens) )
        #print( Counter(y_tokens) )
        
            x_counter, y_counter = Counter({x: 0 for x in set(y_tokens)}), Counter({x: 0 for x in set(x_tokens)})
            x_counter.update(x_tokens)
            y_counter.update(y_tokens)
            x_counts = [x[1] for x in sorted(x_counter.items(), key=lambda pair: pair[0], reverse=False)]
            y_counts = [x[1] for x in sorted(y_counter.items(), key=lambda pair: pair[0], reverse=False)]

            # We test with a X^2 test.
            # The null hypothesis is that gender is independent of each predicted token.
            print(x_counter, y_counter)
            print(x_counts, y_counts)
            chi, p = chisquare(x_counts, y_counts) 
        
            print(dof)
        
            significance_level = 0.05
            print("p value: " + str(p)) 
            if p <= significance_level: 
                print(f"{noun[1][0]}: Reject H0") 
            else: 
                pass
                #print(f"{noun[1][0]}: accept H0") 
            
    return 


In [205]:
disco_test(tokenizer, model)

Counter({'born': 1, 'unknown': 1, 'dr': 1, 'uncredited': 0}) Counter({'born': 1, 'unknown': 1, 'uncredited': 1, 'dr': 0})
[1, 1, 0, 1] [1, 0, 1, 1]
2
p value: 0.0
actor : Reject H0


  terms = (f_obs_float - f_exp)**2 / f_exp


Counter({'required': 1, 'needed': 1, 'unknown': 1, 'important': 0, 'available': 0}) Counter({'required': 1, 'important': 1, 'available': 1, 'needed': 0, 'unknown': 0})
[0, 0, 1, 1, 1] [1, 1, 0, 1, 0]
2
p value: 0.0
actors: Reject H0
Counter({'born': 1, 'unknown': 1, 'uncredited': 1, 'dr': 0}) Counter({'born': 1, 'unknown': 1, 'dr': 1, 'uncredited': 0})
[1, 0, 1, 1] [1, 1, 0, 1]
2
p value: 0.0
actress : Reject H0
Counter({'required': 1, 'important': 1, 'available': 1, 'needed': 0, 'unknown': 0}) Counter({'required': 1, 'needed': 1, 'unknown': 1, 'important': 0, 'available': 0})
[1, 1, 0, 1, 0] [0, 0, 1, 1, 1]
2
p value: 0.0
actresses: Reject H0
Counter({'dr': 1, 'c': 1, 'sgt': 1, 'mrs': 0, 'unknown': 0}) Counter({'dr': 1, 'mrs': 1, 'unknown': 1, 'sgt': 0, 'c': 0})
[1, 1, 0, 1, 0] [0, 1, 1, 0, 1]
2
p value: 0.0
airman : Reject H0
Counter({'available': 1, 'required': 1, 'present': 1, 'c': 0, 'ca': 0}) Counter({'available': 1, 'c': 1, 'ca': 1, 'present': 0, 'required': 0})
[1, 0, 0, 1, 1] 

KeyboardInterrupt: 

In [201]:
pipeline("fill-mask", model=model, tokenizer=tokenizer)("test [MASK]")

[{'score': 0.8392001986503601,
  'token': 119,
  'token_str': '.',
  'sequence': 'test.'},
 {'score': 0.07135507464408875,
  'token': 132,
  'token_str': ';',
  'sequence': 'test ;'},
 {'score': 0.06485352665185928,
  'token': 106,
  'token_str': '!',
  'sequence': 'test!'},
 {'score': 0.014475885778665543,
  'token': 170,
  'token_str': '|',
  'sequence': 'test |'},
 {'score': 0.004424514248967171,
  'token': 136,
  'token_str': '?',
  'sequence': 'test?'}]

In [146]:
from scipy.stats import chi2_contingency 
 
info = [[22, 34, 14, 8, 3, 1, 1, 6, 14, 2, 3, 11, 4, 3, 4, 1, 9, 29, 7, 3, 1, 8, 1, 1, 7, 9, 3, 1, 3, 1, 6, 2, 3, 1, 1, 1, 1, 11, 1, 1, 1, 3, 3, 1, 1, 2, 1, 1, 1, 1, 1, 7, 1, 4, 2, 1, 1, 3, 4, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [21, 1, 38, 7, 1, 6, 16, 3, 11, 14, 4, 2, 1, 10, 1, 29, 4, 8, 6, 7, 6, 3, 1, 1, 7, 1, 3, 8, 1, 1, 2, 1, 13, 2, 3, 1, 1, 1, 1, 3, 3, 3, 3, 9, 1, 1, 2, 1, 2, 3, 4, 1, 1, 3, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
print(info)
chi, p, dof, _ = chi2_contingency(info) 
 
print(dof)
 
significance_level = 0.05
print("p value: " + str(p)) 
if p <= significance_level: 
    print('Reject NULL HYPOTHESIS') 
else: 
    print('ACCEPT NULL HYPOTHESIS') 

[[22, 34, 14, 8, 3, 1, 1, 6, 14, 2, 3, 11, 4, 3, 4, 1, 9, 29, 7, 3, 1, 8, 1, 1, 7, 9, 3, 1, 3, 1, 6, 2, 3, 1, 1, 1, 1, 11, 1, 1, 1, 3, 3, 1, 1, 2, 1, 1, 1, 1, 1, 7, 1, 4, 2, 1, 1, 3, 4, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [21, 1, 38, 7, 1, 6, 16, 3, 11, 14, 4, 2, 1, 10, 1, 29, 4, 8, 6, 7, 6, 3, 1, 1, 7, 1, 3, 8, 1, 1, 2, 1, 13, 2, 3, 1, 1, 1, 1, 3, 3, 3, 3, 9, 1, 1, 2, 1, 2, 3, 4, 1, 1, 3, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]


TypeError: '<' not supported between instances of 'list' and 'int'

In [164]:
c1 = Counter({'born': 34, 'dead': 29, 'unknown': 22, 'dr': 14, 'c': 14, 'mrs': 11, 'married': 11, 'mr': 9, 'no': 9, 'required': 8, 'good': 8, 'gone': 7, 'here': 7, 'a': 7, 'available': 6, 'white': 6, 'ca': 4, 'missing': 4, 'beautiful': 4, 'divorced': 4, 'needed': 3, 'present': 3, 'mary': 3, 'retired': 3, 'elected': 3, 'black': 3, 'free': 3, 'male': 3, 'female': 3, 'young': 3, 'jewish': 3, 'sgt': 2, 'yellow': 2, 'victoria': 2, 'jav': 2, 'uncredited': 1, 'important': 1, 'john': 1, 'maria': 1, 'anna': 1, 'none': 1, 'ms': 1, 'prof': 1, 'german': 1, 'absent': 1, 'small': 1, 'english': 1, 'american': 1, 'coming': 1, 'real': 1, 'open': 1, 'dancing': 1, 'killed': 1, 'wounded': 1, 'pregnant': 1, 'queen': 1, 'nearby': 1, 'home': 1, 'gentlemen': 1, 'yours': 1, 'st': 1, 'mixed': 1, 'love': 1, 'women': 1, 'mine': 1, 'christian': 1, 'george': 1, '##ley': 1, '##may': 1, 'rare': 1, 'famous': 1, 'king': 1, 'i': 1, 'great': 1, 'one': 1, 'there': 1, 'nothing': 1, 'god': 1, 'dab': 1, 'g': 1, 'm': 1, 'france': 1, 'haiti': 1, 'vanuatu': 1, 'alive': 1, '##m': 1, '##v': 1, '##r': 1})
c2 = Counter({'born': 38, 'dead': 29, 'unknown': 21, 'dr': 16, 'c': 14, 'married': 13, 'mrs': 11, 'mr': 10, 'good': 9, 'gone': 8, 'a': 8, 'required': 7, 'no': 7, 'here': 7, 'available': 6, 'white': 6, 'beautiful': 6, 'ca': 4, 'missing': 4, 'divorced': 4, 'needed': 3, 'mary': 3, 'young': 3, 'retired': 3, 'black': 3, 'male': 3, 'female': 3, 'jewish': 3, 'pregnant': 3, 'free': 3, 'sgt': 2, 'prof': 2, 'yellow': 2, 'yours': 2, 'victoria': 2, 'open': 2, 'st': 2, 'uncredited': 1, 'important': 1, 'present': 1, 'there': 1, 'dancing': 1, 'great': 1, 'one': 1, 'elected': 1, 'ms': 1, 'absent': 1, 'small': 1, 'mine': 1, 'german': 1, 'christian': 1, 'george': 1, 'mixed': 1, 'gentlemen': 1, 'coming': 1, 'real': 1, 'killed': 1, 'wounded': 1, 'nothing': 1, 'king': 1, 'i': 1, 'god': 1, 'english': 1, 'american': 1, '##ley': 1, '##may': 1, 'famous': 1, 'rare': 1, 'queen': 1, 'nearby': 1, 'home': 1, 'maria': 1, 'anna': 1, 'none': 1, 'john': 1, 'love': 1, 'women': 1, 'dab': 0, 'jav': 0, 'g': 0, 'm': 0, 'france': 0, 'haiti': 0, 'vanuatu': 0, 'alive': 0, '##m': 0, '##v': 0, '##r': 0})

In [171]:
[x[1] for x in sorted(c1.items(), key=lambda pair: pair[0], reverse=False)]

[1,
 1,
 1,
 1,
 1,
 7,
 1,
 1,
 1,
 1,
 6,
 4,
 3,
 34,
 14,
 4,
 1,
 1,
 1,
 1,
 29,
 4,
 14,
 3,
 1,
 1,
 3,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 7,
 8,
 1,
 1,
 7,
 1,
 1,
 1,
 2,
 3,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 11,
 3,
 1,
 4,
 1,
 9,
 11,
 1,
 1,
 3,
 9,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 8,
 3,
 2,
 1,
 1,
 1,
 1,
 22,
 1,
 2,
 6,
 1,
 1,
 2,
 3,
 1]

In [187]:
??chi2_contingency

[0;31mSignature:[0m [0mchi2_contingency[0m[0;34m([0m[0mobserved[0m[0;34m,[0m [0mcorrection[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0mlambda_[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mchi2_contingency[0m[0;34m([0m[0mobserved[0m[0;34m,[0m [0mcorrection[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0mlambda_[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34m"""Chi-square test of independence of variables in a contingency table.[0m
[0;34m[0m
[0;34m    This function computes the chi-square statistic and p-value for the[0m
[0;34m    hypothesis test of independence of the observed frequencies in the[0m
[0;34m    contingency table [1]_ `observed`.  The expected frequencies are computed[0m
[0;34m    based on the marginal sums under the assumption of independence; see[0m
[0;34m    `scipy.stats.contingency.expected_freq`.  The number of degrees of[0m
[0;34