In [1]:
import os
import csv
import json
import math
import torch
import argparse
import difflib
import logging
import numpy as np
import pandas as pd

from transformers import BertTokenizer, BertForMaskedLM
from transformers import AlbertTokenizer, AlbertForMaskedLM
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
from transformers import DistilBertTokenizer, DistilBertForMaskedLM
from transformers import AutoModel, AutoTokenizer
# 
from collections import defaultdict
from tqdm import tqdm

from crows_pairs_methods import *

In [2]:
BERT_models = [
    'bert-base-cased',
    'bert-base-uncased',
    'bert-large-uncased',
    'bert-large-cased',
    'bert-base-multilingual-uncased',
    'bert-base-multilingual-cased',
    'allenai/scibert_scivocab_uncased',
    'emilyalsentzer/Bio_ClinicalBERT',
    'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract',
    'ProsusAI/finbert',
    'nlpaueb/legal-bert-base-uncased',
    'GroNLP/hateBERT',
    'anferico/bert-for-patents',
    'jackaduma/SecBERT'
]

ALBERT_models = [
    'albert-base-v1',
    'albert-base-v2'
]

ROBERTA_models = [
    'roberta-base',
    'distilroberta-base',
    'roberta-large',
    'huggingface/CodeBERTa-small-v1',
    'climatebert/distilroberta-base-climate-f'
]

all_models = BERT_models + ALBERT_models + ROBERTA_models + ['xlm-roberta-base', 'distilbert-base-multilingual-cased']

In [3]:
dataframe_dictionary = {'models' : [],
'metric_scores' : [],
'stereotype_scores' : [],
'antistereotype_scores' : []}

In [4]:
for model_name in all_models:
    logging.basicConfig(level=logging.INFO)

    # load data into panda DataFrame
    df_data = read_data("fixed_data.csv")

    # Filtering to Disability Data
    df_data = df_data[df_data['bias_type']=='gender']
    
    # supported masked language models (using bert)
    if model_name in BERT_models:
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertForMaskedLM.from_pretrained(model_name)
    elif model_name in ALBERT_models:
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        model = AlbertForMaskedLM.from_pretrained(model_name)
    elif model_name in ROBERTA_models:
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
        model = RobertaForMaskedLM.from_pretrained(model_name)
    elif model_name == 'xlm-roberta-base':
        tokenizer = XMLRobertaTokenizer.from_pretrained(model_name)
        model = XMLRobertaForMaskedLM.from_pretrained(model_name)
    elif model_name == 'distilbert-base-multilingual-cased':
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        model = DistilBertForMaskedLM.from_pretrained(model_name)
        
    mask_token = tokenizer.mask_token
    log_softmax = torch.nn.LogSoftmax(dim=0)
    vocab = tokenizer.get_vocab()
    with open("bert" + ".vocab", "w") as f:
        f.write(json.dumps(vocab))

    lm = {"model": model,
          "tokenizer": tokenizer,
          "mask_token": mask_token,
          "log_softmax": log_softmax,
          "uncased": True
    }

    # score each sentence. 
    # each row in the dataframe has the sentid and score for pro and anti stereo.
    df_score = pd.DataFrame(columns=['sent_more', 'sent_less', 
                                     'sent_more_score', 'sent_less_score',
                                     'score', 'stereo_antistereo', 'bias_type'], dtype=object)
    
    total_stereo, total_antistereo = 0, 0
    stereo_score, antistereo_score = 0, 0

    N = 0
    neutral = 0
    total = len(df_data.index)
    with tqdm(total=total) as pbar:
        for index, data in df_data.iterrows():
            direction = data['direction']
            bias = data['bias_type']
            score = mask_unigram(data, lm)

            for stype in score.keys():
                score[stype] = round(score[stype], 3)

            N += 1
            pair_score = 0
            pbar.update(1)
            if score['sent1_score'] == score['sent2_score']:
                neutral += 1
            else:
                if direction == 'stereo':
                    total_stereo += 1
                    if score['sent1_score'] > score['sent2_score']:
                        stereo_score += 1
                        pair_score = 1
                elif direction == 'antistereo':
                    total_antistereo += 1
                    if score['sent2_score'] > score['sent1_score']:
                        antistereo_score += 1
                        pair_score = 1

            sent_more, sent_less = '', ''
            if direction == 'stereo':
                sent_more = data['sent1']
                sent_less = data['sent2']
                sent_more_score = score['sent1_score']
                sent_less_score = score['sent2_score']
            else:
                sent_more = data['sent2']
                sent_less = data['sent1']
                sent_more_score = score['sent2_score']
                sent_less_score = score['sent1_score']

            df_score = df_score.append({'sent_more': sent_more,
                                        'sent_less': sent_less,
                                        'sent_more_score': sent_more_score,
                                        'sent_less_score': sent_less_score,
                                        'score': pair_score,
                                        'stereo_antistereo': direction,
                                        'bias_type': bias
                                      }, ignore_index=True)
    
    dataframe_dictionary['models'].append(model_name)
    dataframe_dictionary['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
    dataframe_dictionary['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
    dataframe_dictionary['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))
    
    print(pd.DataFrame(dataframe_dictionary))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [11:34<00:00,  2.65s/it]


            models  metric_scores  stereotype_scores  antistereotype_scores
0  bert-base-cased          55.73              57.86                  52.43


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [11:24<00:00,  2.61s/it]


              models  metric_scores  stereotype_scores  antistereotype_scores
0    bert-base-cased          55.73              57.86                  52.43
1  bert-base-uncased          58.02              55.35                  62.14


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [37:22<00:00,  8.56s/it]


               models  metric_scores  stereotype_scores  antistereotype_scores
0     bert-base-cased          55.73              57.86                  52.43
1   bert-base-uncased          58.02              55.35                  62.14
2  bert-large-uncased          55.34              54.72                  56.31


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [38:27<00:00,  8.81s/it]


               models  metric_scores  stereotype_scores  antistereotype_scores
0     bert-base-cased          55.73              57.86                  52.43
1   bert-base-uncased          58.02              55.35                  62.14
2  bert-large-uncased          55.34              54.72                  56.31
3    bert-large-cased          52.29              55.35                  47.57


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [17:52<00:00,  4.09s/it]


                           models  metric_scores  stereotype_scores  \
0                 bert-base-cased          55.73              57.86   
1               bert-base-uncased          58.02              55.35   
2              bert-large-uncased          55.34              54.72   
3                bert-large-cased          52.29              55.35   
4  bert-base-multilingual-uncased          53.05              52.83   

   antistereotype_scores  
0                  52.43  
1                  62.14  
2                  56.31  
3                  47.57  
4                  53.40  


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [19:33<00:00,  4.48s/it]


                           models  metric_scores  stereotype_scores  \
0                 bert-base-cased          55.73              57.86   
1               bert-base-uncased          58.02              55.35   
2              bert-large-uncased          55.34              54.72   
3                bert-large-cased          52.29              55.35   
4  bert-base-multilingual-uncased          53.05              52.83   
5    bert-base-multilingual-cased          45.04              46.54   

   antistereotype_scores  
0                  52.43  
1                  62.14  
2                  56.31  
3                  47.57  
4                  53.40  
5                  42.72  


Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [12:42<00:00,  2.91s/it]


                             models  metric_scores  stereotype_scores  \
0                   bert-base-cased          55.73              57.86   
1                 bert-base-uncased          58.02              55.35   
2                bert-large-uncased          55.34              54.72   
3                  bert-large-cased          52.29              55.35   
4    bert-base-multilingual-uncased          53.05              52.83   
5      bert-base-multilingual-cased          45.04              46.54   
6  allenai/scibert_scivocab_uncased          44.27              35.85   

   antistereotype_scores  
0                  52.43  
1                  62.14  
2                  56.31  
3                  47.57  
4                  53.40  
5                  42.72  
6                  57.28  


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [12:25<00:00,  2.85s/it]


                             models  metric_scores  stereotype_scores  \
0                   bert-base-cased          55.73              57.86   
1                 bert-base-uncased          58.02              55.35   
2                bert-large-uncased          55.34              54.72   
3                  bert-large-cased          52.29              55.35   
4    bert-base-multilingual-uncased          53.05              52.83   
5      bert-base-multilingual-cased          45.04              46.54   
6  allenai/scibert_scivocab_uncased          44.27              35.85   
7   emilyalsentzer/Bio_ClinicalBERT          50.00              48.43   

   antistereotype_scores  
0                  52.43  
1                  62.14  
2                  56.31  
3                  47.57  
4                  53.40  
5                  42.72  
6                  57.28  
7                  52.43  


Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [14:11<00:00,  3.25s/it]


                                              models  metric_scores  \
0                                    bert-base-cased          55.73   
1                                  bert-base-uncased          58.02   
2                                 bert-large-uncased          55.34   
3                                   bert-large-cased          52.29   
4                     bert-base-multilingual-uncased          53.05   
5                       bert-base-multilingual-cased          45.04   
6                   allenai/scibert_scivocab_uncased          44.27   
7                    emilyalsentzer/Bio_ClinicalBERT          50.00   
8  microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...          51.91   

   stereotype_scores  antistereotype_scores  
0              57.86                  52.43  
1              55.35                  62.14  
2              54.72                  56.31  
3              55.35                  47.57  
4              52.83                  53.40  
5            

Some weights of the model checkpoint at ProsusAI/finbert were not used when initializing BertForMaskedLM: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
You should probably TRAIN this 

                                              models  metric_scores  \
0                                    bert-base-cased          55.73   
1                                  bert-base-uncased          58.02   
2                                 bert-large-uncased          55.34   
3                                   bert-large-cased          52.29   
4                     bert-base-multilingual-uncased          53.05   
5                       bert-base-multilingual-cased          45.04   
6                   allenai/scibert_scivocab_uncased          44.27   
7                    emilyalsentzer/Bio_ClinicalBERT          50.00   
8  microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...          51.91   
9                                   ProsusAI/finbert          49.24   

   stereotype_scores  antistereotype_scores  
0              57.86                  52.43  
1              55.35                  62.14  
2              54.72                  56.31  
3              55.35              

Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [12:41<00:00,  2.91s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          55.73   
1                                   bert-base-uncased          58.02   
2                                  bert-large-uncased          55.34   
3                                    bert-large-cased          52.29   
4                      bert-base-multilingual-uncased          53.05   
5                        bert-base-multilingual-cased          45.04   
6                    allenai/scibert_scivocab_uncased          44.27   
7                     emilyalsentzer/Bio_ClinicalBERT          50.00   
8   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...          51.91   
9                                    ProsusAI/finbert          49.24   
10                    nlpaueb/legal-bert-base-uncased          51.53   

    stereotype_scores  antistereotype_scores  
0               57.86                  52.43  
1               55.35                  62

100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [11:18<00:00,  2.59s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          55.73   
1                                   bert-base-uncased          58.02   
2                                  bert-large-uncased          55.34   
3                                    bert-large-cased          52.29   
4                      bert-base-multilingual-uncased          53.05   
5                        bert-base-multilingual-cased          45.04   
6                    allenai/scibert_scivocab_uncased          44.27   
7                     emilyalsentzer/Bio_ClinicalBERT          50.00   
8   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...          51.91   
9                                    ProsusAI/finbert          49.24   
10                    nlpaueb/legal-bert-base-uncased          51.53   
11                                    GroNLP/hateBERT          52.67   

    stereotype_scores  antistereotype_scores  
0               

Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [37:41<00:00,  8.63s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          55.73   
1                                   bert-base-uncased          58.02   
2                                  bert-large-uncased          55.34   
3                                    bert-large-cased          52.29   
4                      bert-base-multilingual-uncased          53.05   
5                        bert-base-multilingual-cased          45.04   
6                    allenai/scibert_scivocab_uncased          44.27   
7                     emilyalsentzer/Bio_ClinicalBERT          50.00   
8   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...          51.91   
9                                    ProsusAI/finbert          49.24   
10                    nlpaueb/legal-bert-base-uncased          51.53   
11                                    GroNLP/hateBERT          52.67   
12                          anferico/bert-for-patents          4

100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [09:09<00:00,  2.10s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          55.73   
1                                   bert-base-uncased          58.02   
2                                  bert-large-uncased          55.34   
3                                    bert-large-cased          52.29   
4                      bert-base-multilingual-uncased          53.05   
5                        bert-base-multilingual-cased          45.04   
6                    allenai/scibert_scivocab_uncased          44.27   
7                     emilyalsentzer/Bio_ClinicalBERT          50.00   
8   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...          51.91   
9                                    ProsusAI/finbert          49.24   
10                    nlpaueb/legal-bert-base-uncased          51.53   
11                                    GroNLP/hateBERT          52.67   
12                          anferico/bert-for-patents          4

100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [09:41<00:00,  2.22s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          55.73   
1                                   bert-base-uncased          58.02   
2                                  bert-large-uncased          55.34   
3                                    bert-large-cased          52.29   
4                      bert-base-multilingual-uncased          53.05   
5                        bert-base-multilingual-cased          45.04   
6                    allenai/scibert_scivocab_uncased          44.27   
7                     emilyalsentzer/Bio_ClinicalBERT          50.00   
8   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...          51.91   
9                                    ProsusAI/finbert          49.24   
10                    nlpaueb/legal-bert-base-uncased          51.53   
11                                    GroNLP/hateBERT          52.67   
12                          anferico/bert-for-patents          4

100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [10:27<00:00,  2.39s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          55.73   
1                                   bert-base-uncased          58.02   
2                                  bert-large-uncased          55.34   
3                                    bert-large-cased          52.29   
4                      bert-base-multilingual-uncased          53.05   
5                        bert-base-multilingual-cased          45.04   
6                    allenai/scibert_scivocab_uncased          44.27   
7                     emilyalsentzer/Bio_ClinicalBERT          50.00   
8   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...          51.91   
9                                    ProsusAI/finbert          49.24   
10                    nlpaueb/legal-bert-base-uncased          51.53   
11                                    GroNLP/hateBERT          52.67   
12                          anferico/bert-for-patents          4

100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [13:03<00:00,  2.99s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          55.73   
1                                   bert-base-uncased          58.02   
2                                  bert-large-uncased          55.34   
3                                    bert-large-cased          52.29   
4                      bert-base-multilingual-uncased          53.05   
5                        bert-base-multilingual-cased          45.04   
6                    allenai/scibert_scivocab_uncased          44.27   
7                     emilyalsentzer/Bio_ClinicalBERT          50.00   
8   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...          51.91   
9                                    ProsusAI/finbert          49.24   
10                    nlpaueb/legal-bert-base-uncased          51.53   
11                                    GroNLP/hateBERT          52.67   
12                          anferico/bert-for-patents          4

100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [08:14<00:00,  1.89s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          55.73   
1                                   bert-base-uncased          58.02   
2                                  bert-large-uncased          55.34   
3                                    bert-large-cased          52.29   
4                      bert-base-multilingual-uncased          53.05   
5                        bert-base-multilingual-cased          45.04   
6                    allenai/scibert_scivocab_uncased          44.27   
7                     emilyalsentzer/Bio_ClinicalBERT          50.00   
8   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...          51.91   
9                                    ProsusAI/finbert          49.24   
10                    nlpaueb/legal-bert-base-uncased          51.53   
11                                    GroNLP/hateBERT          52.67   
12                          anferico/bert-for-patents          4

100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [38:27<00:00,  8.81s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          55.73   
1                                   bert-base-uncased          58.02   
2                                  bert-large-uncased          55.34   
3                                    bert-large-cased          52.29   
4                      bert-base-multilingual-uncased          53.05   
5                        bert-base-multilingual-cased          45.04   
6                    allenai/scibert_scivocab_uncased          44.27   
7                     emilyalsentzer/Bio_ClinicalBERT          50.00   
8   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...          51.91   
9                                    ProsusAI/finbert          49.24   
10                    nlpaueb/legal-bert-base-uncased          51.53   
11                                    GroNLP/hateBERT          52.67   
12                          anferico/bert-for-patents          4

100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [09:40<00:00,  2.22s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          55.73   
1                                   bert-base-uncased          58.02   
2                                  bert-large-uncased          55.34   
3                                    bert-large-cased          52.29   
4                      bert-base-multilingual-uncased          53.05   
5                        bert-base-multilingual-cased          45.04   
6                    allenai/scibert_scivocab_uncased          44.27   
7                     emilyalsentzer/Bio_ClinicalBERT          50.00   
8   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...          51.91   
9                                    ProsusAI/finbert          49.24   
10                    nlpaueb/legal-bert-base-uncased          51.53   
11                                    GroNLP/hateBERT          52.67   
12                          anferico/bert-for-patents          4

100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [08:12<00:00,  1.88s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          55.73   
1                                   bert-base-uncased          58.02   
2                                  bert-large-uncased          55.34   
3                                    bert-large-cased          52.29   
4                      bert-base-multilingual-uncased          53.05   
5                        bert-base-multilingual-cased          45.04   
6                    allenai/scibert_scivocab_uncased          44.27   
7                     emilyalsentzer/Bio_ClinicalBERT          50.00   
8   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...          51.91   
9                                    ProsusAI/finbert          49.24   
10                    nlpaueb/legal-bert-base-uncased          51.53   
11                                    GroNLP/hateBERT          52.67   
12                          anferico/bert-for-patents          4

NameError: name 'XMLRobertaTokenizer' is not defined

In [None]:
gender_dataframe = {
    'models' : all_models,
    'metric_scores' : [55.73, 58.02, 55.34, 
                       52.29, 53.05, 45.04, 
                       57.25, 44.27, 50, 
                       51.91, 45.8, 51.53, 
                       52.67, 45.42, 46.56, 
                       53.44, 54.2, 54.96, 
                       53.82, 51.91, 50.38,
                       55.73, 51.15],
    'stereotype_scores' : [57.86, 55.35, 54.72, 
                           55.35, 52.83, 46.54, 
                           65.41, 35.85, 48.43, 
                           50.94, 44.65, 50.31, 
                           49.69, 45.28, 40.25, 
                           52.83, 47.17, 59.12, 
                           60.38, 55.97, 50.94, 
                           50.94, 50.31],
    'antistereotype_scores' : [52.43, 62.14, 56.31, 
                               47.57, 53.4, 42.72, 
                               44.66, 57.28, 52.43, 
                               53.4, 47.57, 53.4, 
                               57.28, 45.63, 56.31, 
                               54.37, 65.05, 48.54, 
                               43.69, 45.63, 49.51, 
                               63.11, 52.43]
}

In [5]:
for model_name in ['xlm-roberta-base', 'distilbert-base-multilingual-cased']:
    logging.basicConfig(level=logging.INFO)

    # load data into panda DataFrame
    df_data = read_data("fixed_data.csv")

    # Filtering to Disability Data
    df_data = df_data[df_data['bias_type']=='gender']
    
    # supported masked language models (using bert)
    if model_name in BERT_models:
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertForMaskedLM.from_pretrained(model_name)
    elif model_name in ALBERT_models:
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        model = AlbertForMaskedLM.from_pretrained(model_name)
    elif model_name in ROBERTA_models:
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
        model = RobertaForMaskedLM.from_pretrained(model_name)
    elif model_name == 'xlm-roberta-base':
        tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
        model = XLMRobertaForMaskedLM.from_pretrained(model_name)
    elif model_name == 'distilbert-base-multilingual-cased':
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        model = DistilBertForMaskedLM.from_pretrained(model_name)
        
    mask_token = tokenizer.mask_token
    log_softmax = torch.nn.LogSoftmax(dim=0)
    vocab = tokenizer.get_vocab()
    with open("bert" + ".vocab", "w") as f:
        f.write(json.dumps(vocab))

    lm = {"model": model,
          "tokenizer": tokenizer,
          "mask_token": mask_token,
          "log_softmax": log_softmax,
          "uncased": True
    }

    # score each sentence. 
    # each row in the dataframe has the sentid and score for pro and anti stereo.
    df_score = pd.DataFrame(columns=['sent_more', 'sent_less', 
                                     'sent_more_score', 'sent_less_score',
                                     'score', 'stereo_antistereo', 'bias_type'], dtype=object)
    
    total_stereo, total_antistereo = 0, 0
    stereo_score, antistereo_score = 0, 0

    N = 0
    neutral = 0
    total = len(df_data.index)
    with tqdm(total=total) as pbar:
        for index, data in df_data.iterrows():
            direction = data['direction']
            bias = data['bias_type']
            score = mask_unigram(data, lm)

            for stype in score.keys():
                score[stype] = round(score[stype], 3)

            N += 1
            pair_score = 0
            pbar.update(1)
            if score['sent1_score'] == score['sent2_score']:
                neutral += 1
            else:
                if direction == 'stereo':
                    total_stereo += 1
                    if score['sent1_score'] > score['sent2_score']:
                        stereo_score += 1
                        pair_score = 1
                elif direction == 'antistereo':
                    total_antistereo += 1
                    if score['sent2_score'] > score['sent1_score']:
                        antistereo_score += 1
                        pair_score = 1

            sent_more, sent_less = '', ''
            if direction == 'stereo':
                sent_more = data['sent1']
                sent_less = data['sent2']
                sent_more_score = score['sent1_score']
                sent_less_score = score['sent2_score']
            else:
                sent_more = data['sent2']
                sent_less = data['sent1']
                sent_more_score = score['sent2_score']
                sent_less_score = score['sent1_score']

            df_score = df_score.append({'sent_more': sent_more,
                                        'sent_less': sent_less,
                                        'sent_more_score': sent_more_score,
                                        'sent_less_score': sent_less_score,
                                        'score': pair_score,
                                        'stereo_antistereo': direction,
                                        'bias_type': bias
                                      }, ignore_index=True)
    
    dataframe_dictionary['models'].append(model_name)
    dataframe_dictionary['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
    dataframe_dictionary['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
    dataframe_dictionary['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))
    
    print(pd.DataFrame(dataframe_dictionary))

100%|██████████████████████████████████████████████████████████████████████████████| 262/262 [1:14:50<00:00, 17.14s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          55.73   
1                                   bert-base-uncased          58.02   
2                                  bert-large-uncased          55.34   
3                                    bert-large-cased          52.29   
4                      bert-base-multilingual-uncased          53.05   
5                        bert-base-multilingual-cased          45.04   
6                    allenai/scibert_scivocab_uncased          44.27   
7                     emilyalsentzer/Bio_ClinicalBERT          50.00   
8   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...          51.91   
9                                    ProsusAI/finbert          49.24   
10                    nlpaueb/legal-bert-base-uncased          51.53   
11                                    GroNLP/hateBERT          52.67   
12                          anferico/bert-for-patents          4

100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [21:11<00:00,  4.85s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          55.73   
1                                   bert-base-uncased          58.02   
2                                  bert-large-uncased          55.34   
3                                    bert-large-cased          52.29   
4                      bert-base-multilingual-uncased          53.05   
5                        bert-base-multilingual-cased          45.04   
6                    allenai/scibert_scivocab_uncased          44.27   
7                     emilyalsentzer/Bio_ClinicalBERT          50.00   
8   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...          51.91   
9                                    ProsusAI/finbert          49.24   
10                    nlpaueb/legal-bert-base-uncased          51.53   
11                                    GroNLP/hateBERT          52.67   
12                          anferico/bert-for-patents          4

In [7]:
gender_dataframe = {
    'models' : all_models,
    'metric_scores' : [55.73, 58.02, 55.34, 
                       52.29, 53.05, 45.04, 
                       44.27, 50, 51.91, 
                       49.24, 51.53, 52.67, 
                       45.42, 46.56, 53.44, 
                       54.2, 54.96, 53.82, 
                       51.91, 55.73, 51.15, 
                       50.38, 46.56],
    'stereotype_scores' : [57.86, 55.35, 54.72, 
                           55.35, 52.83, 46.54, 
                           35.85, 48.43, 50.94, 
                           61.64, 50.31, 49.69, 
                           45.28, 40.25, 52.83, 
                           47.17, 59.12, 60.38, 
                           55.97, 50.94, 50.31, 
                           50.94, 43.40],
    'antistereotype_scores' : [52.43, 62.14, 56.31, 
                               47.57, 53.4, 42.72, 
                               57.28, 52.43, 53.4, 
                               30.10, 53.4, 57.28, 
                               45.63, 56.31, 54.37, 
                               65.05, 48.54, 43.69, 
                               45.63, 63.11, 52.43, 
                               49.51, 51.46]
}

In [8]:
pd.DataFrame(gender_dataframe)

Unnamed: 0,models,metric_scores,stereotype_scores,antistereotype_scores
0,bert-base-cased,55.73,57.86,52.43
1,bert-base-uncased,58.02,55.35,62.14
2,bert-large-uncased,55.34,54.72,56.31
3,bert-large-cased,52.29,55.35,47.57
4,bert-base-multilingual-uncased,53.05,52.83,53.4
5,bert-base-multilingual-cased,45.04,46.54,42.72
6,allenai/scibert_scivocab_uncased,44.27,35.85,57.28
7,emilyalsentzer/Bio_ClinicalBERT,50.0,48.43,52.43
8,microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...,51.91,50.94,53.4
9,ProsusAI/finbert,49.24,61.64,30.1


Almost every model's scores have remain unchanged, except for <i>ProsusAI/finbert</i> and <i>distilbert-base-multilingual-cased</i>. These two models had changes in all three scores (metric, stereotype and anti-stereotype). These changes are shown below:

<b><u>ProsusAI/finbert</u></b>
* Metric Score:
    * 45.80 (+7.5%) -> 49.24
* Stereotype Score:
    * 44.65 (+38%) -> 61.64
* Anti-stereotype Score:
    * 47.57 (-36.7%) -> 30.10
    
<b><u>distilbert-base-multilingual-cased</u></b>
* Metric Score
    * 57.25 (-18.7%) -> 46.56
* Stereotype Score
    * 65.41 (-33.6%) -> 43.40
* Antistereotype Score
    * 44.66 (+15.2%) -> 51.46
    
You can see from the above figures that these models have been heavily affected by the changes made to the CrowS-Pairs dataset.