In [8]:
import os
import csv
import json
import math
import torch
import argparse
import difflib
import logging
import numpy as np
import pandas as pd

from transformers import BertTokenizer, BertForMaskedLM
from transformers import AlbertTokenizer, AlbertForMaskedLM
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
from transformers import DistilBertTokenizer, DistilBertForMaskedLM
from transformers import AutoModel, AutoTokenizer
# 
from collections import defaultdict
from tqdm import tqdm

In [9]:
from crows_pairs_methods import *

The list of models and their tokenizers that are running on the below tests are:

* <b>'bert-large-uncased'</b>: Using BertTokenizer and BertForMaskedLM
* <b>'albert-base-v2'</b>: Using AlbertTokenizer and AlbertForMaskedLM 
* <b>'distilroberta-base'</b>: Using RobertaTokenizer and RobertaForMaskedLM
* <b>'allenai/scibert_scivocab_uncased'</b>: Using BertTokenizer and BertForMaskedLM
* <b>'bert-base-multilingual-uncased'</b>: Using BertTokenizer and BertForMaskedLM
* <b>'bert-base-multilingual-cased'</b>: Using BertTokenizer and BertForMaskedLM
* <b>'roberta-base'</b>: Using RobertaTokenizer and RobertaForMaskedLM
* <b>'bert-base-cased'</b>: Using BertTokenizer and BertForMaskedLM
* <b>'xlm-roberta-base'</b>: Using XLMRobertaTokenizer, XLMRobertaForMaskedLM # multilingual roberta
* <b>'roberta-large'</b>: Using RobertaTokenizer and RobertaForMaskedLM
* <b>'distilbert-base-multilingual-cased'</b>: Using BertTokenizer and BertForMaskedLM
* <b>'emilyalsentzer/Bio_ClinicalBERT'</b>: Using BertTokenizer and BertForMaskedLM
* <b>'huggingface/CodeBERTa-small-v1'</b>: Using RobertaTokenizer and RobertaForMaskedLM
* <b>'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract'</b>: Using BertTokenizer and BertForMaskedLM
* <b>'ProsusAI/finbert'</b>: Using BertTokenizer and BertForMaskedLM
* <b>'nlpaueb/legal-bert-base-uncased'</b>: Using BertTokenizer and BertForMaskedLM
* <b>'vinai/bertweet-base'</b>: # doesnt work
* <b>'GroNLP/hateBERT'</b>: Using BertTokenizer and BertForMaskedLM #### very interesting possibly
* <b>'anferico/bert-for-patents'</b>: Using BertTokenizer and BertForMaskedLM ## (seems large)
* <b>'climatebert/distilroberta-base-climate-f'</b>: Using RobertaTokenizer and RobertaForMaskedLM
* <b>'jackaduma/SecBERT'</b>: Using BertTokenizer and BertForMaskedLM

Not tested are:
* <b>'xlm-roberta-large'</b>
* <b>'bert-large-cased'</b>
* <b>'facebook/muppet-roberta-large'</b>

In [20]:
BERT_models = [
    'bert-base-cased',
    'bert-base-uncased',
    'bert-large-uncased',
    'bert-large-cased',
    'bert-base-multilingual-uncased',
    'bert-base-multilingual-cased',
    'allenai/scibert_scivocab_uncased',
    'emilyalsentzer/Bio_ClinicalBERT',
    'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract',
    'ProsusAI/finbert',
    'nlpaueb/legal-bert-base-uncased',
    'GroNLP/hateBERT',
    'anferico/bert-for-patents',
    'jackaduma/SecBERT'
]

ALBERT_models = [
    'albert-base-v1',
    'albert-base-v2'
]

ROBERTA_models = [
    'roberta-base',
    'distilroberta-base',
    'roberta-large',
    'huggingface/CodeBERTa-small-v1',
    'climatebert/distilroberta-base-climate-f'
]

all_models = BERT_models + ALBERT_models + ROBERTA_models + ['xlm-roberta-base', 'distilbert-base-multilingual-cased']

In [90]:
dataframe_dictionary = {'models' : [],
'metric_scores' : [],
'stereotype_scores' : [],
'antistereotype_scores' : []}

In [91]:
for model_name in all_models:
    logging.basicConfig(level=logging.INFO)

    # load data into panda DataFrame
    df_data = read_data("crows_pairs_anonymized.csv")

    # Filtering to Disability Data
    df_data = df_data[df_data['bias_type']=='disability'][:5]
    
    # supported masked language models (using bert)
    if model_name in BERT_models:
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertForMaskedLM.from_pretrained(model_name)
    elif model_name in ALBERT_models:
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        model = AlbertForMaskedLM.from_pretrained(model_name)
    elif model_name in ROBERTA_models:
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
        model = RobertaForMaskedLM.from_pretrained(model_name)
    elif model_name == 'xlm-roberta-base':
        tokenizer = XMLRobertaTokenizer.from_pretrained(model_name)
        model = XMLRobertaForMaskedLM.from_pretrained(model_name)
    elif model_name == 'distilbert-base-multilingual-cased':
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        model = DistilBertForMaskedLM.from_pretrained(model_name)
        
    mask_token = tokenizer.mask_token
    log_softmax = torch.nn.LogSoftmax(dim=0)
    vocab = tokenizer.get_vocab()
    with open("bert" + ".vocab", "w") as f:
        f.write(json.dumps(vocab))

    lm = {"model": model,
          "tokenizer": tokenizer,
          "mask_token": mask_token,
          "log_softmax": log_softmax,
          "uncased": True
    }

    # score each sentence. 
    # each row in the dataframe has the sentid and score for pro and anti stereo.
    df_score = pd.DataFrame(columns=['sent_more', 'sent_less', 
                                     'sent_more_score', 'sent_less_score',
                                     'score', 'stereo_antistereo', 'bias_type'], dtype=object)
    
    total_stereo, total_antistereo = 0, 0
    stereo_score, antistereo_score = 0, 0

    N = 0
    neutral = 0
    total = len(df_data.index)
    with tqdm(total=total) as pbar:
        for index, data in df_data.iterrows():
            direction = data['direction']
            bias = data['bias_type']
            score = mask_unigram(data, lm)

            for stype in score.keys():
                score[stype] = round(score[stype], 3)

            N += 1
            pair_score = 0
            pbar.update(1)
            if score['sent1_score'] == score['sent2_score']:
                neutral += 1
            else:
                if direction == 'stereo':
                    total_stereo += 1
                    if score['sent1_score'] > score['sent2_score']:
                        stereo_score += 1
                        pair_score = 1
                elif direction == 'antistereo':
                    total_antistereo += 1
                    if score['sent2_score'] > score['sent1_score']:
                        antistereo_score += 1
                        pair_score = 1

            sent_more, sent_less = '', ''
            if direction == 'stereo':
                sent_more = data['sent1']
                sent_less = data['sent2']
                sent_more_score = score['sent1_score']
                sent_less_score = score['sent2_score']
            else:
                sent_more = data['sent2']
                sent_less = data['sent1']
                sent_more_score = score['sent2_score']
                sent_less_score = score['sent1_score']

            df_score = df_score.append({'sent_more': sent_more,
                                        'sent_less': sent_less,
                                        'sent_more_score': sent_more_score,
                                        'sent_less_score': sent_less_score,
                                        'score': pair_score,
                                        'stereo_antistereo': direction,
                                        'bias_type': bias
                                      }, ignore_index=True)
    
    dataframe_dictionary['models'].append(model_name)
    dataframe_dictionary['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
    dataframe_dictionary['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
    dataframe_dictionary['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))
    
    print(pd.DataFrame(dataframe_dictionary))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:13<00:00,  2.80s/it]


            models  metric_scores  stereotype_scores  antistereotype_scores
0  bert-base-cased          100.0              100.0                  100.0


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:13<00:00,  2.80s/it]


              models  metric_scores  stereotype_scores  antistereotype_scores
0    bert-base-cased          100.0              100.0                  100.0
1  bert-base-uncased           60.0               75.0                    0.0


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:39<00:00,  8.00s/it]


               models  metric_scores  stereotype_scores  antistereotype_scores
0     bert-base-cased          100.0              100.0                  100.0
1   bert-base-uncased           60.0               75.0                    0.0
2  bert-large-uncased           80.0               75.0                  100.0


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:38<00:00,  7.70s/it]


               models  metric_scores  stereotype_scores  antistereotype_scores
0     bert-base-cased          100.0              100.0                  100.0
1   bert-base-uncased           60.0               75.0                    0.0
2  bert-large-uncased           80.0               75.0                  100.0
3    bert-large-cased           80.0              100.0                    0.0


Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:17<00:00,  3.47s/it]


                           models  metric_scores  stereotype_scores  \
0                 bert-base-cased          100.0              100.0   
1               bert-base-uncased           60.0               75.0   
2              bert-large-uncased           80.0               75.0   
3                bert-large-cased           80.0              100.0   
4  bert-base-multilingual-uncased           80.0               75.0   

   antistereotype_scores  
0                  100.0  
1                    0.0  
2                  100.0  
3                    0.0  
4                  100.0  


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:19<00:00,  3.82s/it]


                           models  metric_scores  stereotype_scores  \
0                 bert-base-cased          100.0              100.0   
1               bert-base-uncased           60.0               75.0   
2              bert-large-uncased           80.0               75.0   
3                bert-large-cased           80.0              100.0   
4  bert-base-multilingual-uncased           80.0               75.0   
5    bert-base-multilingual-cased           60.0               75.0   

   antistereotype_scores  
0                  100.0  
1                    0.0  
2                  100.0  
3                    0.0  
4                  100.0  
5                    0.0  


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['distilbert.transformer.layer.1.attention.out_lin.weight', 'distilbert.transformer.layer.3.output_layer_norm.bias', 'distilbert.transformer.layer.2.attention.out_lin.weight', 'distilbert.transformer.layer.3.attention.q_lin.bias', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.transformer.layer.1.output_layer_norm.weight', 'distilbert.transformer.layer.5.attention.q_lin.weight', 'distilbert.transformer.layer

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:19<00:00,  3.82s/it]


                               models  metric_scores  stereotype_scores  \
0                     bert-base-cased          100.0              100.0   
1                   bert-base-uncased           60.0               75.0   
2                  bert-large-uncased           80.0               75.0   
3                    bert-large-cased           80.0              100.0   
4      bert-base-multilingual-uncased           80.0               75.0   
5        bert-base-multilingual-cased           60.0               75.0   
6  distilbert-base-multilingual-cased           40.0               25.0   

   antistereotype_scores  
0                  100.0  
1                    0.0  
2                  100.0  
3                    0.0  
4                  100.0  
5                    0.0  
6                  100.0  


Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:11<00:00,  2.31s/it]


                               models  metric_scores  stereotype_scores  \
0                     bert-base-cased          100.0              100.0   
1                   bert-base-uncased           60.0               75.0   
2                  bert-large-uncased           80.0               75.0   
3                    bert-large-cased           80.0              100.0   
4      bert-base-multilingual-uncased           80.0               75.0   
5        bert-base-multilingual-cased           60.0               75.0   
6  distilbert-base-multilingual-cased           40.0               25.0   
7    allenai/scibert_scivocab_uncased           40.0               50.0   

   antistereotype_scores  
0                  100.0  
1                    0.0  
2                  100.0  
3                    0.0  
4                  100.0  
5                    0.0  
6                  100.0  
7                    0.0  


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:11<00:00,  2.37s/it]


                               models  metric_scores  stereotype_scores  \
0                     bert-base-cased          100.0              100.0   
1                   bert-base-uncased           60.0               75.0   
2                  bert-large-uncased           80.0               75.0   
3                    bert-large-cased           80.0              100.0   
4      bert-base-multilingual-uncased           80.0               75.0   
5        bert-base-multilingual-cased           60.0               75.0   
6  distilbert-base-multilingual-cased           40.0               25.0   
7    allenai/scibert_scivocab_uncased           40.0               50.0   
8     emilyalsentzer/Bio_ClinicalBERT           40.0               25.0   

   antistereotype_scores  
0                  100.0  
1                    0.0  
2                  100.0  
3                    0.0  
4                  100.0  
5                    0.0  
6                  100.0  
7                    0.0  
8     

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:11<00:00,  2.30s/it]


                                              models  metric_scores  \
0                                    bert-base-cased          100.0   
1                                  bert-base-uncased           60.0   
2                                 bert-large-uncased           80.0   
3                                   bert-large-cased           80.0   
4                     bert-base-multilingual-uncased           80.0   
5                       bert-base-multilingual-cased           60.0   
6                 distilbert-base-multilingual-cased           40.0   
7                   allenai/scibert_scivocab_uncased           40.0   
8                    emilyalsentzer/Bio_ClinicalBERT           40.0   
9  microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...           60.0   

   stereotype_scores  antistereotype_scores  
0              100.0                  100.0  
1               75.0                    0.0  
2               75.0                  100.0  
3              100.0              

Some weights of the model checkpoint at ProsusAI/finbert were not used when initializing BertForMaskedLM: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this 

                                               models  metric_scores  \
0                                     bert-base-cased          100.0   
1                                   bert-base-uncased           60.0   
2                                  bert-large-uncased           80.0   
3                                    bert-large-cased           80.0   
4                      bert-base-multilingual-uncased           80.0   
5                        bert-base-multilingual-cased           60.0   
6                  distilbert-base-multilingual-cased           40.0   
7                    allenai/scibert_scivocab_uncased           40.0   
8                     emilyalsentzer/Bio_ClinicalBERT           40.0   
9   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...           60.0   
10                                   ProsusAI/finbert           60.0   

    stereotype_scores  antistereotype_scores  
0               100.0                  100.0  
1                75.0                    

Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:12<00:00,  2.47s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          100.0   
1                                   bert-base-uncased           60.0   
2                                  bert-large-uncased           80.0   
3                                    bert-large-cased           80.0   
4                      bert-base-multilingual-uncased           80.0   
5                        bert-base-multilingual-cased           60.0   
6                  distilbert-base-multilingual-cased           40.0   
7                    allenai/scibert_scivocab_uncased           40.0   
8                     emilyalsentzer/Bio_ClinicalBERT           40.0   
9   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...           60.0   
10                                   ProsusAI/finbert           60.0   
11                    nlpaueb/legal-bert-base-uncased          100.0   

    stereotype_scores  antistereotype_scores  
0               

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:12<00:00,  2.42s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          100.0   
1                                   bert-base-uncased           60.0   
2                                  bert-large-uncased           80.0   
3                                    bert-large-cased           80.0   
4                      bert-base-multilingual-uncased           80.0   
5                        bert-base-multilingual-cased           60.0   
6                  distilbert-base-multilingual-cased           40.0   
7                    allenai/scibert_scivocab_uncased           40.0   
8                     emilyalsentzer/Bio_ClinicalBERT           40.0   
9   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...           60.0   
10                                   ProsusAI/finbert           60.0   
11                    nlpaueb/legal-bert-base-uncased          100.0   
12                                    GroNLP/hateBERT           

Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:41<00:00,  8.40s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          100.0   
1                                   bert-base-uncased           60.0   
2                                  bert-large-uncased           80.0   
3                                    bert-large-cased           80.0   
4                      bert-base-multilingual-uncased           80.0   
5                        bert-base-multilingual-cased           60.0   
6                  distilbert-base-multilingual-cased           40.0   
7                    allenai/scibert_scivocab_uncased           40.0   
8                     emilyalsentzer/Bio_ClinicalBERT           40.0   
9   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...           60.0   
10                                   ProsusAI/finbert           60.0   
11                    nlpaueb/legal-bert-base-uncased          100.0   
12                                    GroNLP/hateBERT           

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:07<00:00,  1.58s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          100.0   
1                                   bert-base-uncased           60.0   
2                                  bert-large-uncased           80.0   
3                                    bert-large-cased           80.0   
4                      bert-base-multilingual-uncased           80.0   
5                        bert-base-multilingual-cased           60.0   
6                  distilbert-base-multilingual-cased           40.0   
7                    allenai/scibert_scivocab_uncased           40.0   
8                     emilyalsentzer/Bio_ClinicalBERT           40.0   
9   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...           60.0   
10                                   ProsusAI/finbert           60.0   
11                    nlpaueb/legal-bert-base-uncased          100.0   
12                                    GroNLP/hateBERT           

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:09<00:00,  1.88s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          100.0   
1                                   bert-base-uncased           60.0   
2                                  bert-large-uncased           80.0   
3                                    bert-large-cased           80.0   
4                      bert-base-multilingual-uncased           80.0   
5                        bert-base-multilingual-cased           60.0   
6                  distilbert-base-multilingual-cased           40.0   
7                    allenai/scibert_scivocab_uncased           40.0   
8                     emilyalsentzer/Bio_ClinicalBERT           40.0   
9   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...           60.0   
10                                   ProsusAI/finbert           60.0   
11                    nlpaueb/legal-bert-base-uncased          100.0   
12                                    GroNLP/hateBERT           

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:09<00:00,  1.98s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          100.0   
1                                   bert-base-uncased           60.0   
2                                  bert-large-uncased           80.0   
3                                    bert-large-cased           80.0   
4                      bert-base-multilingual-uncased           80.0   
5                        bert-base-multilingual-cased           60.0   
6                  distilbert-base-multilingual-cased           40.0   
7                    allenai/scibert_scivocab_uncased           40.0   
8                     emilyalsentzer/Bio_ClinicalBERT           40.0   
9   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...           60.0   
10                                   ProsusAI/finbert           60.0   
11                    nlpaueb/legal-bert-base-uncased          100.0   
12                                    GroNLP/hateBERT           

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:15<00:00,  3.07s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          100.0   
1                                   bert-base-uncased           60.0   
2                                  bert-large-uncased           80.0   
3                                    bert-large-cased           80.0   
4                      bert-base-multilingual-uncased           80.0   
5                        bert-base-multilingual-cased           60.0   
6                  distilbert-base-multilingual-cased           40.0   
7                    allenai/scibert_scivocab_uncased           40.0   
8                     emilyalsentzer/Bio_ClinicalBERT           40.0   
9   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...           60.0   
10                                   ProsusAI/finbert           60.0   
11                    nlpaueb/legal-bert-base-uncased          100.0   
12                                    GroNLP/hateBERT           

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:09<00:00,  1.83s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          100.0   
1                                   bert-base-uncased           60.0   
2                                  bert-large-uncased           80.0   
3                                    bert-large-cased           80.0   
4                      bert-base-multilingual-uncased           80.0   
5                        bert-base-multilingual-cased           60.0   
6                  distilbert-base-multilingual-cased           40.0   
7                    allenai/scibert_scivocab_uncased           40.0   
8                     emilyalsentzer/Bio_ClinicalBERT           40.0   
9   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...           60.0   
10                                   ProsusAI/finbert           60.0   
11                    nlpaueb/legal-bert-base-uncased          100.0   
12                                    GroNLP/hateBERT           

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:42<00:00,  8.52s/it]


                                               models  metric_scores  \
0                                     bert-base-cased          100.0   
1                                   bert-base-uncased           60.0   
2                                  bert-large-uncased           80.0   
3                                    bert-large-cased           80.0   
4                      bert-base-multilingual-uncased           80.0   
5                        bert-base-multilingual-cased           60.0   
6                  distilbert-base-multilingual-cased           40.0   
7                    allenai/scibert_scivocab_uncased           40.0   
8                     emilyalsentzer/Bio_ClinicalBERT           40.0   
9   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...           60.0   
10                                   ProsusAI/finbert           60.0   
11                    nlpaueb/legal-bert-base-uncased          100.0   
12                                    GroNLP/hateBERT           

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLMRobertaTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.


TypeError: expected str, bytes or os.PathLike object, not NoneType

20 language models were tested (cancelled on XMLRoberta, so it's 20 out of 24)

Timings for 5 sentences (and estimated duration of running entire set) on each model:
* 42s (3.5 hours) - roberta large
* 9s - (45 mins) distil roberta base
* 15s - (1.25 hours) roberta base
* 9s - (45 mins) albert base v2
* 9s - (45 mins) albert base v1
* 7s - (35 mins) security bert
* 41s - (3.4 hours) patents
* 12s - (1 hour) hate bert
* 12s - (1 hour) legal
* 11s - (55 mins) finbert
* 11s - (55 mins) biomed pubmed base uncased abstract
* 11s - (55 mins) bio clinical bert
* 11s - (55 mins) scibert scivocab uncased
* 19s - (1.6 hours) distil bert base multilingual cased
* 19s - (1.6 hours) bert base multilingual cased
* 17s - (1.4 hours) bert base multilingual uncased
* 38s - (3.2 hours) bert large cased
* 39s - (3.3 hours) bert large uncased
* 13s - (1 hour) bert base uncased
* 13s - (1 hour) bert base cased

The sum of all of these are approximately 31 hours

# Evaluating Models on Gender-Biased Data

In [98]:
gender_dataframe_dictionary = {'models' : [],
'metric_scores' : [],
'stereotype_scores' : [],
'antistereotype_scores' : []}

In [99]:
for model_name in all_models:
    logging.basicConfig(level=logging.INFO)

    # load data into panda DataFrame
    df_data = read_data("crows_pairs_anonymized.csv")

    # Filtering to Disability Data
    df_data = df_data[df_data['bias_type']=='gender']
    
    # supported masked language models (using bert)
    if model_name in BERT_models:
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertForMaskedLM.from_pretrained(model_name)
    elif model_name in ALBERT_models:
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        model = AlbertForMaskedLM.from_pretrained(model_name)
    elif model_name in ROBERTA_models:
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
        model = RobertaForMaskedLM.from_pretrained(model_name)
    elif model_name == 'xlm-roberta-base':
        tokenizer = XMLRobertaTokenizer.from_pretrained(model_name)
        model = XMLRobertaForMaskedLM.from_pretrained(model_name)
    elif model_name == 'distilbert-base-multilingual-cased':
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        model = DistilBertForMaskedLM.from_pretrained(model_name)
        
    mask_token = tokenizer.mask_token
    log_softmax = torch.nn.LogSoftmax(dim=0)
    vocab = tokenizer.get_vocab()
    with open("bert" + ".vocab", "w") as f:
        f.write(json.dumps(vocab))

    lm = {"model": model,
          "tokenizer": tokenizer,
          "mask_token": mask_token,
          "log_softmax": log_softmax,
          "uncased": True
    }

    # score each sentence. 
    # each row in the dataframe has the sentid and score for pro and anti stereo.
    df_score = pd.DataFrame(columns=['sent_more', 'sent_less', 
                                     'sent_more_score', 'sent_less_score',
                                     'score', 'stereo_antistereo', 'bias_type'], dtype=object)
    
    total_stereo, total_antistereo = 0, 0
    stereo_score, antistereo_score = 0, 0

    N = 0
    neutral = 0
    total = len(df_data.index)
    with tqdm(total=total) as pbar:
        for index, data in df_data.iterrows():
            direction = data['direction']
            bias = data['bias_type']
            score = mask_unigram(data, lm)

            for stype in score.keys():
                score[stype] = round(score[stype], 3)

            N += 1
            pair_score = 0
            pbar.update(1)
            if score['sent1_score'] == score['sent2_score']:
                neutral += 1
            else:
                if direction == 'stereo':
                    total_stereo += 1
                    if score['sent1_score'] > score['sent2_score']:
                        stereo_score += 1
                        pair_score = 1
                elif direction == 'antistereo':
                    total_antistereo += 1
                    if score['sent2_score'] > score['sent1_score']:
                        antistereo_score += 1
                        pair_score = 1

            sent_more, sent_less = '', ''
            if direction == 'stereo':
                sent_more = data['sent1']
                sent_less = data['sent2']
                sent_more_score = score['sent1_score']
                sent_less_score = score['sent2_score']
            else:
                sent_more = data['sent2']
                sent_less = data['sent1']
                sent_more_score = score['sent2_score']
                sent_less_score = score['sent1_score']

            df_score = df_score.append({'sent_more': sent_more,
                                        'sent_less': sent_less,
                                        'sent_more_score': sent_more_score,
                                        'sent_less_score': sent_less_score,
                                        'score': pair_score,
                                        'stereo_antistereo': direction,
                                        'bias_type': bias
                                      }, ignore_index=True)
    
    gender_dataframe_dictionary['models'].append(model_name)
    gender_dataframe_dictionary['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
    gender_dataframe_dictionary['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
    gender_dataframe_dictionary['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [10:56<00:00,  2.50s/it]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of

Some weights of BertForMaskedLM were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['encoder.layer.7.intermediate.dense.weight', 'encoder.layer.8.attention.self.value.bias', 'encoder.layer.1.attention.output.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.7.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.5.output.LayerNorm.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.11.attention.output.dense.weight', 'encoder.layer.3.attention.self.key.bias', 'encoder.layer.0.output.dense.bias', 'encoder.layer.5.attention.self.key.bias', 'encoder.layer.3.attention.self.value.bias', 'encoder.layer.11.attention.self.value.bias', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.8.attention.self.value.weight', 'encoder.layer.8.attention.self.query.weight', 'encoder.layer.9.intermediate.dense.bias', 'encoder.layer.7.attention.self.value.bias', 'encoder.l

100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [18:56<00:00,  4.34s/it]
Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [12:19<00:00,  2.82s/it]
Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForM

TypeError: expected str, bytes or os.PathLike object, not NoneType

10:56 - 'bert-base-cased',

11:12 - 'bert-base-uncased',

36:06 - 'bert-large-uncased',

37:20 - 'bert-large-cased',

17:39 - 'bert-base-multilingual-uncased',

18:55 - 'bert-base-multilingual-cased',

18:56 - 'distilbert-base-multilingual-cased', # still confused about whether its BertTokenizer or DistilBertTokenizer

12:19 - 'allenai/scibert_scivocab_uncased',

11:25 - 'emilyalsentzer/Bio_ClinicalBERT',cd D

12:40 - 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract',

11:13 - 'ProsusAI/finbert',

12:34 - 'nlpaueb/legal-bert-base-uncased',

11:16 - 'GroNLP/hateBERT',

38:43 - 'anferico/bert-for-patents',

09:04 - 'jackaduma/SecBERT',

09:54 - 'albert-base-v1',

10:54 - 'albert-base-v2',

13:44 - 'roberta-base',

08:39 - 'distilroberta-base',

40:19 - 'roberta-large',

27:52 - 'xlm-roberta-base',

10:23 - 'huggingface/CodeBERTa-small-v1',

OMITTED - 'vinai/bertweet-base',

08:34 - 'climatebert/distilroberta-base-climate-f'

In [101]:
pd.DataFrame(gender_dataframe_dictionary)

Unnamed: 0,models,metric_scores,stereotype_scores,antistereotype_scores
0,bert-base-cased,55.73,57.86,52.43
1,bert-base-uncased,58.02,55.35,62.14
2,bert-large-uncased,55.34,54.72,56.31
3,bert-large-cased,52.29,55.35,47.57
4,bert-base-multilingual-uncased,53.05,52.83,53.4
5,bert-base-multilingual-cased,45.04,46.54,42.72
6,distilbert-base-multilingual-cased,57.25,65.41,44.66
7,allenai/scibert_scivocab_uncased,44.27,35.85,57.28
8,emilyalsentzer/Bio_ClinicalBERT,50.0,48.43,52.43
9,microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...,51.91,50.94,53.4


This loop completed 20 out of 24 models, and failed as the XLM model should be run using the XLM RobertaTokenizer, but was accidentally left in the ROBERTA_models list.

The models omitted from the dictionary scores will be calculated below:

In [5]:
gender_dataframe_dictionary2 = {
    'models' : [],
    'metric_scores' : [],
    'stereotype_scores' : [],
    'antistereotype_scores' : []
}

In [6]:
missing_models = [
    'xlm-roberta-base',
    'huggingface/CodeBERTa-small-v1',
    'vinai/bertweet-base',
    'climatebert/distilroberta-base-climate-f'
]

In [16]:
for model_name in missing_models:
    logging.basicConfig(level=logging.INFO)

    # load data into panda DataFrame
    df_data = read_data("crows_pairs_anonymized.csv")

    # Filtering to Disability Data
    df_data = df_data[df_data['bias_type']=='gender']
    
    # supported masked language models (using bert)
    if model_name in BERT_models:
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertForMaskedLM.from_pretrained(model_name)
    elif model_name in ALBERT_models:
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        model = AlbertForMaskedLM.from_pretrained(model_name)
    elif model_name in ROBERTA_models:
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
        model = RobertaForMaskedLM.from_pretrained(model_name)
    elif model_name == 'xlm-roberta-base':
        tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
        model = XLMRobertaForMaskedLM.from_pretrained(model_name)
    elif model_name == 'distilbert-base-multilingual-cased':
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        model = DistilBertForMaskedLM.from_pretrained(model_name)
        
    mask_token = tokenizer.mask_token
    log_softmax = torch.nn.LogSoftmax(dim=0)
    vocab = tokenizer.get_vocab()
    with open("bert" + ".vocab", "w") as f:
        f.write(json.dumps(vocab))

    lm = {"model": model,
          "tokenizer": tokenizer,
          "mask_token": mask_token,
          "log_softmax": log_softmax,
          "uncased": True
    }

    # score each sentence. 
    # each row in the dataframe has the sentid and score for pro and anti stereo.
    df_score = pd.DataFrame(columns=['sent_more', 'sent_less', 
                                     'sent_more_score', 'sent_less_score',
                                     'score', 'stereo_antistereo', 'bias_type'], dtype=object)
    
    total_stereo, total_antistereo = 0, 0
    stereo_score, antistereo_score = 0, 0

    N = 0
    neutral = 0
    total = len(df_data.index)
    with tqdm(total=total) as pbar:
        for index, data in df_data.iterrows():
            direction = data['direction']
            bias = data['bias_type']
            score = mask_unigram(data, lm)

            for stype in score.keys():
                score[stype] = round(score[stype], 3)

            N += 1
            pair_score = 0
            pbar.update(1)
            if score['sent1_score'] == score['sent2_score']:
                neutral += 1
            else:
                if direction == 'stereo':
                    total_stereo += 1
                    if score['sent1_score'] > score['sent2_score']:
                        stereo_score += 1
                        pair_score = 1
                elif direction == 'antistereo':
                    total_antistereo += 1
                    if score['sent2_score'] > score['sent1_score']:
                        antistereo_score += 1
                        pair_score = 1

            sent_more, sent_less = '', ''
            if direction == 'stereo':
                sent_more = data['sent1']
                sent_less = data['sent2']
                sent_more_score = score['sent1_score']
                sent_less_score = score['sent2_score']
            else:
                sent_more = data['sent2']
                sent_less = data['sent1']
                sent_more_score = score['sent2_score']
                sent_less_score = score['sent1_score']

            df_score = df_score.append({'sent_more': sent_more,
                                        'sent_less': sent_less,
                                        'sent_more_score': sent_more_score,
                                        'sent_less_score': sent_less_score,
                                        'score': pair_score,
                                        'stereo_antistereo': direction,
                                        'bias_type': bias
                                      }, ignore_index=True)
    
    gender_dataframe_dictionary2['models'].append(model_name)
    gender_dataframe_dictionary2['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
    gender_dataframe_dictionary2['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
    gender_dataframe_dictionary2['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))

100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [27:52<00:00,  6.38s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [10:23<00:00,  2.38s/it]


OSError: Can't load tokenizer for 'vinai/bertweet-base'. Make sure that:

- 'vinai/bertweet-base' is a correct model identifier listed on 'https://huggingface.co/models'
  (make sure 'vinai/bertweet-base' is not a path to a local directory with something else, in that case)

- or 'vinai/bertweet-base' is the correct path to a directory containing relevant tokenizer files



bertweet-base does not seem to work with CrowS-Pairs dataset, so this will be omitted from the studies.

The only model left to test is 'climatebert/distilroberta-base-climate-f'.

In [17]:
logging.basicConfig(level=logging.INFO)

# load data into panda DataFrame
df_data = read_data("crows_pairs_anonymized.csv")

# Filtering to Disability Data
df_data = df_data[df_data['bias_type']=='gender']

tokenizer = RobertaTokenizer.from_pretrained('climatebert/distilroberta-base-climate-f')
model = RobertaForMaskedLM.from_pretrained('climatebert/distilroberta-base-climate-f')

mask_token = tokenizer.mask_token
log_softmax = torch.nn.LogSoftmax(dim=0)
vocab = tokenizer.get_vocab()
with open("bert" + ".vocab", "w") as f:
    f.write(json.dumps(vocab))

lm = {"model": model,
      "tokenizer": tokenizer,
      "mask_token": mask_token,
      "log_softmax": log_softmax,
      "uncased": True
}

# score each sentence. 
# each row in the dataframe has the sentid and score for pro and anti stereo.
df_score = pd.DataFrame(columns=['sent_more', 'sent_less', 
                                 'sent_more_score', 'sent_less_score',
                                 'score', 'stereo_antistereo', 'bias_type'], dtype=object)

total_stereo, total_antistereo = 0, 0
stereo_score, antistereo_score = 0, 0

N = 0
neutral = 0
total = len(df_data.index)
with tqdm(total=total) as pbar:
    for index, data in df_data.iterrows():
        direction = data['direction']
        bias = data['bias_type']
        score = mask_unigram(data, lm)

        for stype in score.keys():
            score[stype] = round(score[stype], 3)

        N += 1
        pair_score = 0
        pbar.update(1)
        if score['sent1_score'] == score['sent2_score']:
            neutral += 1
        else:
            if direction == 'stereo':
                total_stereo += 1
                if score['sent1_score'] > score['sent2_score']:
                    stereo_score += 1
                    pair_score = 1
            elif direction == 'antistereo':
                total_antistereo += 1
                if score['sent2_score'] > score['sent1_score']:
                    antistereo_score += 1
                    pair_score = 1

        sent_more, sent_less = '', ''
        if direction == 'stereo':
            sent_more = data['sent1']
            sent_less = data['sent2']
            sent_more_score = score['sent1_score']
            sent_less_score = score['sent2_score']
        else:
            sent_more = data['sent2']
            sent_less = data['sent1']
            sent_more_score = score['sent2_score']
            sent_less_score = score['sent1_score']

        df_score = df_score.append({'sent_more': sent_more,
                                    'sent_less': sent_less,
                                    'sent_more_score': sent_more_score,
                                    'sent_less_score': sent_less_score,
                                    'score': pair_score,
                                    'stereo_antistereo': direction,
                                    'bias_type': bias
                                  }, ignore_index=True)

gender_dataframe_dictionary2['models'].append(model_name)
gender_dataframe_dictionary2['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
gender_dataframe_dictionary2['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
gender_dataframe_dictionary2['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))

100%|████████████████████████████████████████████████████████████████████████████████| 262/262 [08:34<00:00,  1.97s/it]


In [19]:
pd.DataFrame(gender_dataframe_dictionary2)

Unnamed: 0,models,metric_scores,stereotype_scores,antistereotype_scores
0,xlm-roberta-base,50.38,50.94,49.51
1,huggingface/CodeBERTa-small-v1,55.73,50.94,63.11
2,vinai/bertweet-base,51.15,50.31,52.43


In [24]:
all_models

['bert-base-cased',
 'bert-base-uncased',
 'bert-large-uncased',
 'bert-large-cased',
 'bert-base-multilingual-uncased',
 'bert-base-multilingual-cased',
 'allenai/scibert_scivocab_uncased',
 'emilyalsentzer/Bio_ClinicalBERT',
 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract',
 'ProsusAI/finbert',
 'nlpaueb/legal-bert-base-uncased',
 'GroNLP/hateBERT',
 'anferico/bert-for-patents',
 'jackaduma/SecBERT',
 'albert-base-v1',
 'albert-base-v2',
 'roberta-base',
 'distilroberta-base',
 'roberta-large',
 'huggingface/CodeBERTa-small-v1',
 'climatebert/distilroberta-base-climate-f',
 'xlm-roberta-base',
 'distilbert-base-multilingual-cased']

In [25]:
gender_dataframe = {
    'models' : all_models,
    'metric_scores' : [55.73, 58.02, 55.34, 
                       52.29, 53.05, 45.04, 
                       44.27, 50, 51.91, 
                       45.8, 51.53, 52.67, 
                       45.42, 46.56, 53.44, 
                       54.2, 54.96, 53.82, 
                       51.91, 55.73, 51.15, 
                       50.38, 57.25],
    'stereotype_scores' : [57.86, 55.35, 54.72, 
                           55.35, 52.83, 46.54, 
                           35.85, 48.43, 50.94, 
                           44.65, 50.31, 49.69, 
                           45.28, 40.25, 52.83, 
                           47.17, 59.12, 60.38, 
                           55.97, 50.94, 50.31, 
                           50.94, 65.41],
    'antistereotype_scores' : [52.43, 62.14, 56.31, 
                               47.57, 53.4, 42.72, 
                               57.28, 52.43, 53.4, 
                               47.57, 53.4, 57.28, 
                               45.63, 56.31, 54.37, 
                               65.05, 48.54, 43.69, 
                               45.63, 63.11, 52.43, 
                               49.51, 44.66]
}

In [26]:
pd.DataFrame(gender_dataframe)

Unnamed: 0,models,metric_scores,stereotype_scores,antistereotype_scores
0,bert-base-cased,55.73,57.86,52.43
1,bert-base-uncased,58.02,55.35,62.14
2,bert-large-uncased,55.34,54.72,56.31
3,bert-large-cased,52.29,55.35,47.57
4,bert-base-multilingual-uncased,53.05,52.83,53.4
5,bert-base-multilingual-cased,45.04,46.54,42.72
6,allenai/scibert_scivocab_uncased,44.27,35.85,57.28
7,emilyalsentzer/Bio_ClinicalBERT,50.0,48.43,52.43
8,microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...,51.91,50.94,53.4
9,ProsusAI/finbert,45.8,44.65,47.57
