# Testing All Models on Adjusted Dataset

In [1]:
import os
import csv
import json
import math
import torch
import argparse
import difflib
import logging
import numpy as np
import pandas as pd

from transformers import BertTokenizer, BertForMaskedLM
from transformers import AlbertTokenizer, AlbertForMaskedLM
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
from transformers import DistilBertTokenizer, DistilBertForMaskedLM
from transformers import AutoModel, AutoTokenizer
# 
from collections import defaultdict
from tqdm import tqdm

from crows_pairs_methods import *

In [2]:
BERT_models = [
    'bert-base-cased',
    'bert-base-uncased',
    'bert-large-uncased',
    'bert-large-cased',
    'bert-base-multilingual-uncased',
    'bert-base-multilingual-cased',
    'allenai/scibert_scivocab_uncased',
    'emilyalsentzer/Bio_ClinicalBERT',
    'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract',
    'ProsusAI/finbert',
    'nlpaueb/legal-bert-base-uncased',
    'GroNLP/hateBERT',
    'anferico/bert-for-patents',
    'jackaduma/SecBERT'
]

ALBERT_models = [
    'albert-base-v1',
    'albert-base-v2'
]

ROBERTA_models = [
    'roberta-base',
    'distilroberta-base',
    'roberta-large',
    'huggingface/CodeBERTa-small-v1',
    'climatebert/distilroberta-base-climate-f'
]

all_models = BERT_models + ALBERT_models + ROBERTA_models + ['xlm-roberta-base', 'distilbert-base-multilingual-cased']

In [12]:
bias_types = [
    'Race',
    'Gender',
    'Socio-Economic', 
    'Nationality', 
    'Religion', 
    'Age', 
    'Sexual Orientation', 
    'Physical Appearance', 
    'Disability'
]

## Testing on Racially Biased Data

In [9]:
dataframe_dictionary = {'models' : [],
'metric_scores' : [],
'stereotype_scores' : [],
'antistereotype_scores' : []}

In [10]:
logging.basicConfig(level=logging.INFO)

# load data into panda DataFrame
df_data = read_data("fixed_data.csv")

# Filtering to Disability Data
df_data = df_data[df_data['bias_type']=='disability'][:2]

for model_name in all_models:
    
    # supported masked language models (using bert)
    if model_name in BERT_models:
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertForMaskedLM.from_pretrained(model_name)
    elif model_name in ALBERT_models:
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        model = AlbertForMaskedLM.from_pretrained(model_name)
    elif model_name in ROBERTA_models:
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
        model = RobertaForMaskedLM.from_pretrained(model_name)
    elif model_name == 'xlm-roberta-base':
        tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
        model = XLMRobertaForMaskedLM.from_pretrained(model_name)
    elif model_name == 'distilbert-base-multilingual-cased':
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        model = DistilBertForMaskedLM.from_pretrained(model_name)
        
    mask_token = tokenizer.mask_token
    log_softmax = torch.nn.LogSoftmax(dim=0)
    vocab = tokenizer.get_vocab()
    with open("bert" + ".vocab", "w") as f:
        f.write(json.dumps(vocab))

    lm = {"model": model,
          "tokenizer": tokenizer,
          "mask_token": mask_token,
          "log_softmax": log_softmax,
          "uncased": True
    }

    # score each sentence. 
    # each row in the dataframe has the sentid and score for pro and anti stereo.
    df_score = pd.DataFrame(columns=['sent_more', 'sent_less', 
                                     'sent_more_score', 'sent_less_score',
                                     'score', 'stereo_antistereo', 'bias_type'], dtype=object)
    
    total_stereo, total_antistereo = 0, 0
    stereo_score, antistereo_score = 0, 0

    N = 0
    neutral = 0
    total = len(df_data.index)
    with tqdm(total=total) as pbar:
        for index, data in df_data.iterrows():
            direction = data['direction']
            bias = data['bias_type']
            score = mask_unigram(data, lm)

            for stype in score.keys():
                score[stype] = round(score[stype], 3)

            N += 1
            pair_score = 0
            pbar.update(1)
            if score['sent1_score'] == score['sent2_score']:
                neutral += 1
            else:
                if direction == 'stereo':
                    total_stereo += 1
                    if score['sent1_score'] > score['sent2_score']:
                        stereo_score += 1
                        pair_score = 1
                elif direction == 'antistereo':
                    total_antistereo += 1
                    if score['sent2_score'] > score['sent1_score']:
                        antistereo_score += 1
                        pair_score = 1

            sent_more, sent_less = '', ''
            if direction == 'stereo':
                sent_more = data['sent1']
                sent_less = data['sent2']
                sent_more_score = score['sent1_score']
                sent_less_score = score['sent2_score']
            else:
                sent_more = data['sent2']
                sent_less = data['sent1']
                sent_more_score = score['sent2_score']
                sent_less_score = score['sent1_score']

            df_score = df_score.append({'sent_more': sent_more,
                                        'sent_less': sent_less,
                                        'sent_more_score': sent_more_score,
                                        'sent_less_score': sent_less_score,
                                        'score': pair_score,
                                        'stereo_antistereo': direction,
                                        'bias_type': bias
                                      }, ignore_index=True)
    
    dataframe_dictionary['models'].append(model_name)
    dataframe_dictionary['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
    dataframe_dictionary['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
    dataframe_dictionary['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.21s/it]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.16s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.26s/it]
Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:26<00:00, 13.17s/it]

In [11]:
# 162 seconds to complete above 
print(pd.DataFrame(dataframe_dictionary))

                                               models  metric_scores  \
0                                     bert-base-cased          100.0   
1                                   bert-base-uncased           50.0   
2                                  bert-large-uncased          100.0   
3                                    bert-large-cased           50.0   
4                      bert-base-multilingual-uncased           50.0   
5                        bert-base-multilingual-cased            0.0   
6                    allenai/scibert_scivocab_uncased            0.0   
7                     emilyalsentzer/Bio_ClinicalBERT           50.0   
8   microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...            0.0   
9                                    ProsusAI/finbert          100.0   
10                    nlpaueb/legal-bert-base-uncased          100.0   
11                                    GroNLP/hateBERT           50.0   
12                          anferico/bert-for-patents           

# Creating Loop to Store Scores for Types of Bias

In [34]:
bias_types = [
    'Race',
    'Gender',
    'Socio-Economic', 
    'Nationality', 
    'Religion', 
    'Age', 
    'Sexual Orientation', 
    'Physical Appearance', 
    'Disability'
]

bias_types = [
    'race-color',
    'gender',
    'socioeconomic',
    'nationality',
    'religion', 
    'age',
    'sexual-orientation',
    'physical-appearance',
    'disability'
]

In [38]:
empty_dataframe_dictionary = {
    'models' : [],
    'metric_scores' : [],
    'stereotype_scores' : [],
    'antistereotype_scores' : []
}

dataframe_dictionary_race = empty_dataframe_dictionary
dataframe_dictionary_gender = empty_dataframe_dictionary
dataframe_dictionary_socioeconomic = empty_dataframe_dictionary
dataframe_dictionary_nationality = empty_dataframe_dictionary
dataframe_dictionary_religion = empty_dataframe_dictionary
dataframe_dictionary_age = empty_dataframe_dictionary
dataframe_dictionary_sexualorientation = empty_dataframe_dictionary
dataframe_dictionary_physicalappearance = empty_dataframe_dictionary
dataframe_dictionary_disability = empty_dataframe_dictionary

In [46]:
logging.basicConfig(level=logging.INFO)

for bias_type in bias_types:
    
    # load data into panda DataFrame
    df_data = read_data("fixed_data.csv")

    # Filtering to Disability Data
    df_data = df_data[df_data['bias_type']==bias_type][:10]

    for model_name in all_models:

        # supported masked language models (using bert)
        if model_name in BERT_models:
            tokenizer = BertTokenizer.from_pretrained(model_name)
            model = BertForMaskedLM.from_pretrained(model_name)
        elif model_name in ALBERT_models:
            tokenizer = AlbertTokenizer.from_pretrained(model_name)
            model = AlbertForMaskedLM.from_pretrained(model_name)
        elif model_name in ROBERTA_models:
            tokenizer = RobertaTokenizer.from_pretrained(model_name)
            model = RobertaForMaskedLM.from_pretrained(model_name)
        elif model_name == 'xlm-roberta-base':
            tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
            model = XLMRobertaForMaskedLM.from_pretrained(model_name)
        elif model_name == 'distilbert-base-multilingual-cased':
            tokenizer = DistilBertTokenizer.from_pretrained(model_name)
            model = DistilBertForMaskedLM.from_pretrained(model_name)

        mask_token = tokenizer.mask_token
        log_softmax = torch.nn.LogSoftmax(dim=0)
        vocab = tokenizer.get_vocab()
        with open("bert" + ".vocab", "w") as f:
            f.write(json.dumps(vocab))

        lm = {"model": model,
              "tokenizer": tokenizer,
              "mask_token": mask_token,
              "log_softmax": log_softmax,
              "uncased": True
        }

        # score each sentence. 
        # each row in the dataframe has the sentid and score for pro and anti stereo.
        df_score = pd.DataFrame(columns=['sent_more', 'sent_less', 
                                         'sent_more_score', 'sent_less_score',
                                         'score', 'stereo_antistereo', 'bias_type'], dtype=object)

        total_stereo, total_antistereo = 0, 0
        stereo_score, antistereo_score = 0, 0

        N = 0
        neutral = 0
        total = len(df_data.index)
        with tqdm(total=total) as pbar:
            for index, data in df_data.iterrows():
                direction = data['direction']
                bias = data['bias_type']
                score = mask_unigram(data, lm)

                for stype in score.keys():
                    score[stype] = round(score[stype], 3)

                N += 1
                pair_score = 0
                pbar.update(1)
                if score['sent1_score'] == score['sent2_score']:
                    neutral += 1
                else:
                    if direction == 'stereo':
                        total_stereo += 1
                        if score['sent1_score'] > score['sent2_score']:
                            stereo_score += 1
                            pair_score = 1
                    elif direction == 'antistereo':
                        total_antistereo += 1
                        if score['sent2_score'] > score['sent1_score']:
                            antistereo_score += 1
                            pair_score = 1

                sent_more, sent_less = '', ''
                if direction == 'stereo':
                    sent_more = data['sent1']
                    sent_less = data['sent2']
                    sent_more_score = score['sent1_score']
                    sent_less_score = score['sent2_score']
                else:
                    sent_more = data['sent2']
                    sent_less = data['sent1']
                    sent_more_score = score['sent2_score']
                    sent_less_score = score['sent1_score']

                df_score = df_score.append({'sent_more': sent_more,
                                            'sent_less': sent_less,
                                            'sent_more_score': sent_more_score,
                                            'sent_less_score': sent_less_score,
                                            'score': pair_score,
                                            'stereo_antistereo': direction,
                                            'bias_type': bias
                                          }, ignore_index=True)

        if bias_type == 'race-color':
            dataframe_dictionary_race['models'].append(model_name)
            dataframe_dictionary_race['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
            dataframe_dictionary_race['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
            dataframe_dictionary_race['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))
        
        elif bias_type == 'gender':
            dataframe_dictionary_gender['models'].append(model_name)
            dataframe_dictionary_gender['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
            dataframe_dictionary_gender['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
            dataframe_dictionary_gender['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))
        
        elif bias_type == 'socioeconomic':
            dataframe_dictionary_socioeconomic['models'].append(model_name)
            dataframe_dictionary_socioeconomic['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
            dataframe_dictionary_socioeconomic['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
            dataframe_dictionary_socioeconomic['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))
        
        elif bias_type == 'nationality':
            dataframe_dictionary_nationality['models'].append(model_name)
            dataframe_dictionary_nationality['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
            dataframe_dictionary_nationality['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
            dataframe_dictionary_nationality['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))
        
        elif bias_type == 'religion':
            dataframe_dictionary_religion['models'].append(model_name)
            dataframe_dictionary_religion['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
            dataframe_dictionary_religion['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
            dataframe_dictionary_religion['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))
        
        elif bias_type == 'age':
            dataframe_dictionary_age['models'].append(model_name)
            dataframe_dictionary_age['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
            dataframe_dictionary_age['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
            dataframe_dictionary_age['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))
        
        elif bias_type == 'sexual-orientation':
            dataframe_dictionary_sexualorientation['models'].append(model_name)
            dataframe_dictionary_sexualorientation['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
            dataframe_dictionary_sexualorientation['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
            dataframe_dictionary_sexualorientation['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))
        
        elif bias_type == 'physical-appearance':
            dataframe_dictionary_physicalappearance['models'].append(model_name)
            dataframe_dictionary_physicalappearance['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
            dataframe_dictionary_physicalappearance ['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
            dataframe_dictionary_physicalappearance  ['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))          
        
        elif bias_type == 'disability':
            dataframe_dictionary_disability['models'].append(model_name)
            dataframe_dictionary_disability['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
            dataframe_dictionary_disability['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
            dataframe_dictionary_disability['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:29<00:00,  2.92s/it]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:35<00:00,  3.55s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:32<00:00,  3.26s/it]
Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:49<00:00, 10.93s/it]

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:23<00:00,  2.33s/it]
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.68s/it]
Some weights of the model checkpoint at ProsusAI/finbert were not used when initializing Ber

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:33<00:00,  3.40s/it]
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:36<00:00,  3.64s/it]
Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForMask

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.87s/it]
Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:30<00:00,  9.08s/it]
Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationsh

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:49<00:00, 10.91s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:19<00:00,  1.92s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:21<00:00,  2.18s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:22<00:00,  2.28s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:28<00:00,  2.87s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:18<00:00,  1.88s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:37<00:00,  9.73s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:21<00:00,  2.20s/it]
100%|███████████████████████████████████

Some weights of BertForMaskedLM were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:30<00:00,  3.02s/it]
Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining mode

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:24<00:00,  2.44s/it]
Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:23<00:00,  2.39s/it]
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when i

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:38<00:00,  9.85s/it]
Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:41<00:00,  4.14s/it]
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMasked

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:27<00:00,  2.72s/it]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.67s/it]
Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relations

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:30<00:00,  3.08s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:26<00:00,  2.69s/it]
Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [01:36<00:00,  9.61s/it]

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:21<00:00,  2.16s/it]
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:22<00:00,  2.24s/it]
Some weights of the model checkpoint at ProsusAI/finbert were not used when initializing Ber

In [47]:
print(dataframe_dictionary_race)
print(dataframe_dictionary_gender)
print(dataframe_dictionary_socioeconomic)
print(dataframe_dictionary_nationality)
print(dataframe_dictionary_religion)
print(dataframe_dictionary_age)
print(dataframe_dictionary_sexualorientation)
print(dataframe_dictionary_physicalappearance)
print(dataframe_dictionary_disability)

{'models': ['bert-base-cased', 'bert-base-cased', 'bert-base-uncased', 'bert-large-uncased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'allenai/scibert_scivocab_uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'ProsusAI/finbert', 'nlpaueb/legal-bert-base-uncased', 'GroNLP/hateBERT', 'anferico/bert-for-patents', 'jackaduma/SecBERT', 'albert-base-v1', 'albert-base-v2', 'roberta-base', 'distilroberta-base', 'roberta-large', 'huggingface/CodeBERTa-small-v1', 'climatebert/distilroberta-base-climate-f', 'xlm-roberta-base', 'distilbert-base-multilingual-cased', 'bert-base-cased', 'bert-base-uncased', 'bert-large-uncased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'allenai/scibert_scivocab_uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'ProsusAI/finbert', 'nlpaueb/legal-bert-base-uncased', 'GroNLP/hateB




{'models': ['bert-base-cased', 'bert-base-cased', 'bert-base-uncased', 'bert-large-uncased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'allenai/scibert_scivocab_uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'ProsusAI/finbert', 'nlpaueb/legal-bert-base-uncased', 'GroNLP/hateBERT', 'anferico/bert-for-patents', 'jackaduma/SecBERT', 'albert-base-v1', 'albert-base-v2', 'roberta-base', 'distilroberta-base', 'roberta-large', 'huggingface/CodeBERTa-small-v1', 'climatebert/distilroberta-base-climate-f', 'xlm-roberta-base', 'distilbert-base-multilingual-cased', 'bert-base-cased', 'bert-base-uncased', 'bert-large-uncased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'allenai/scibert_scivocab_uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'ProsusAI/finbert', 'nlpaueb/legal-bert-base-uncased', 'GroNLP/hateB

{'models': ['bert-base-cased', 'bert-base-cased', 'bert-base-uncased', 'bert-large-uncased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'allenai/scibert_scivocab_uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'ProsusAI/finbert', 'nlpaueb/legal-bert-base-uncased', 'GroNLP/hateBERT', 'anferico/bert-for-patents', 'jackaduma/SecBERT', 'albert-base-v1', 'albert-base-v2', 'roberta-base', 'distilroberta-base', 'roberta-large', 'huggingface/CodeBERTa-small-v1', 'climatebert/distilroberta-base-climate-f', 'xlm-roberta-base', 'distilbert-base-multilingual-cased', 'bert-base-cased', 'bert-base-uncased', 'bert-large-uncased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'allenai/scibert_scivocab_uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'ProsusAI/finbert', 'nlpaueb/legal-bert-base-uncased', 'GroNLP/hateB




{'models': ['bert-base-cased', 'bert-base-cased', 'bert-base-uncased', 'bert-large-uncased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'allenai/scibert_scivocab_uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'ProsusAI/finbert', 'nlpaueb/legal-bert-base-uncased', 'GroNLP/hateBERT', 'anferico/bert-for-patents', 'jackaduma/SecBERT', 'albert-base-v1', 'albert-base-v2', 'roberta-base', 'distilroberta-base', 'roberta-large', 'huggingface/CodeBERTa-small-v1', 'climatebert/distilroberta-base-climate-f', 'xlm-roberta-base', 'distilbert-base-multilingual-cased', 'bert-base-cased', 'bert-base-uncased', 'bert-large-uncased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'allenai/scibert_scivocab_uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'ProsusAI/finbert', 'nlpaueb/legal-bert-base-uncased', 'GroNLP/hateB

{'models': ['bert-base-cased', 'bert-base-cased', 'bert-base-uncased', 'bert-large-uncased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'allenai/scibert_scivocab_uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'ProsusAI/finbert', 'nlpaueb/legal-bert-base-uncased', 'GroNLP/hateBERT', 'anferico/bert-for-patents', 'jackaduma/SecBERT', 'albert-base-v1', 'albert-base-v2', 'roberta-base', 'distilroberta-base', 'roberta-large', 'huggingface/CodeBERTa-small-v1', 'climatebert/distilroberta-base-climate-f', 'xlm-roberta-base', 'distilbert-base-multilingual-cased', 'bert-base-cased', 'bert-base-uncased', 'bert-large-uncased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'allenai/scibert_scivocab_uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'ProsusAI/finbert', 'nlpaueb/legal-bert-base-uncased', 'GroNLP/hateB




{'models': ['bert-base-cased', 'bert-base-cased', 'bert-base-uncased', 'bert-large-uncased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'allenai/scibert_scivocab_uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'ProsusAI/finbert', 'nlpaueb/legal-bert-base-uncased', 'GroNLP/hateBERT', 'anferico/bert-for-patents', 'jackaduma/SecBERT', 'albert-base-v1', 'albert-base-v2', 'roberta-base', 'distilroberta-base', 'roberta-large', 'huggingface/CodeBERTa-small-v1', 'climatebert/distilroberta-base-climate-f', 'xlm-roberta-base', 'distilbert-base-multilingual-cased', 'bert-base-cased', 'bert-base-uncased', 'bert-large-uncased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'allenai/scibert_scivocab_uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'ProsusAI/finbert', 'nlpaueb/legal-bert-base-uncased', 'GroNLP/hateB

{'models': ['bert-base-cased', 'bert-base-cased', 'bert-base-uncased', 'bert-large-uncased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'allenai/scibert_scivocab_uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'ProsusAI/finbert', 'nlpaueb/legal-bert-base-uncased', 'GroNLP/hateBERT', 'anferico/bert-for-patents', 'jackaduma/SecBERT', 'albert-base-v1', 'albert-base-v2', 'roberta-base', 'distilroberta-base', 'roberta-large', 'huggingface/CodeBERTa-small-v1', 'climatebert/distilroberta-base-climate-f', 'xlm-roberta-base', 'distilbert-base-multilingual-cased', 'bert-base-cased', 'bert-base-uncased', 'bert-large-uncased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'allenai/scibert_scivocab_uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'ProsusAI/finbert', 'nlpaueb/legal-bert-base-uncased', 'GroNLP/hateB




{'models': ['bert-base-cased', 'bert-base-cased', 'bert-base-uncased', 'bert-large-uncased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'allenai/scibert_scivocab_uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'ProsusAI/finbert', 'nlpaueb/legal-bert-base-uncased', 'GroNLP/hateBERT', 'anferico/bert-for-patents', 'jackaduma/SecBERT', 'albert-base-v1', 'albert-base-v2', 'roberta-base', 'distilroberta-base', 'roberta-large', 'huggingface/CodeBERTa-small-v1', 'climatebert/distilroberta-base-climate-f', 'xlm-roberta-base', 'distilbert-base-multilingual-cased', 'bert-base-cased', 'bert-base-uncased', 'bert-large-uncased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'allenai/scibert_scivocab_uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'ProsusAI/finbert', 'nlpaueb/legal-bert-base-uncased', 'GroNLP/hateB

{'models': ['bert-base-cased', 'bert-base-cased', 'bert-base-uncased', 'bert-large-uncased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'allenai/scibert_scivocab_uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'ProsusAI/finbert', 'nlpaueb/legal-bert-base-uncased', 'GroNLP/hateBERT', 'anferico/bert-for-patents', 'jackaduma/SecBERT', 'albert-base-v1', 'albert-base-v2', 'roberta-base', 'distilroberta-base', 'roberta-large', 'huggingface/CodeBERTa-small-v1', 'climatebert/distilroberta-base-climate-f', 'xlm-roberta-base', 'distilbert-base-multilingual-cased', 'bert-base-cased', 'bert-base-uncased', 'bert-large-uncased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'allenai/scibert_scivocab_uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'ProsusAI/finbert', 'nlpaueb/legal-bert-base-uncased', 'GroNLP/hateB




208 runs (black bars)

## Attempting Above Loop for All Sentences (35hr wait approx.)

Started at 2:15am

In [48]:
empty_dataframe_dictionary = {
    'models' : [],
    'metric_scores' : [],
    'stereotype_scores' : [],
    'antistereotype_scores' : []
}

dataframe_dictionary_race = empty_dataframe_dictionary
dataframe_dictionary_gender = empty_dataframe_dictionary
dataframe_dictionary_socioeconomic = empty_dataframe_dictionary
dataframe_dictionary_nationality = empty_dataframe_dictionary
dataframe_dictionary_religion = empty_dataframe_dictionary
dataframe_dictionary_age = empty_dataframe_dictionary
dataframe_dictionary_sexualorientation = empty_dataframe_dictionary
dataframe_dictionary_physicalappearance = empty_dataframe_dictionary
dataframe_dictionary_disability = empty_dataframe_dictionary

In [49]:
logging.basicConfig(level=logging.INFO)

for bias_type in bias_types:
    
    # load data into panda DataFrame
    df_data = read_data("fixed_data.csv")

    # Filtering to Disability Data
    df_data = df_data[df_data['bias_type']==bias_type]

    for model_name in all_models:

        # supported masked language models (using bert)
        if model_name in BERT_models:
            tokenizer = BertTokenizer.from_pretrained(model_name)
            model = BertForMaskedLM.from_pretrained(model_name)
        elif model_name in ALBERT_models:
            tokenizer = AlbertTokenizer.from_pretrained(model_name)
            model = AlbertForMaskedLM.from_pretrained(model_name)
        elif model_name in ROBERTA_models:
            tokenizer = RobertaTokenizer.from_pretrained(model_name)
            model = RobertaForMaskedLM.from_pretrained(model_name)
        elif model_name == 'xlm-roberta-base':
            tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
            model = XLMRobertaForMaskedLM.from_pretrained(model_name)
        elif model_name == 'distilbert-base-multilingual-cased':
            tokenizer = DistilBertTokenizer.from_pretrained(model_name)
            model = DistilBertForMaskedLM.from_pretrained(model_name)

        mask_token = tokenizer.mask_token
        log_softmax = torch.nn.LogSoftmax(dim=0)
        vocab = tokenizer.get_vocab()
        with open("bert" + ".vocab", "w") as f:
            f.write(json.dumps(vocab))

        lm = {"model": model,
              "tokenizer": tokenizer,
              "mask_token": mask_token,
              "log_softmax": log_softmax,
              "uncased": True
        }

        # score each sentence. 
        # each row in the dataframe has the sentid and score for pro and anti stereo.
        df_score = pd.DataFrame(columns=['sent_more', 'sent_less', 
                                         'sent_more_score', 'sent_less_score',
                                         'score', 'stereo_antistereo', 'bias_type'], dtype=object)

        total_stereo, total_antistereo = 0, 0
        stereo_score, antistereo_score = 0, 0

        N = 0
        neutral = 0
        total = len(df_data.index)
        with tqdm(total=total) as pbar:
            for index, data in df_data.iterrows():
                direction = data['direction']
                bias = data['bias_type']
                score = mask_unigram(data, lm)

                for stype in score.keys():
                    score[stype] = round(score[stype], 3)

                N += 1
                pair_score = 0
                pbar.update(1)
                if score['sent1_score'] == score['sent2_score']:
                    neutral += 1
                else:
                    if direction == 'stereo':
                        total_stereo += 1
                        if score['sent1_score'] > score['sent2_score']:
                            stereo_score += 1
                            pair_score = 1
                    elif direction == 'antistereo':
                        total_antistereo += 1
                        if score['sent2_score'] > score['sent1_score']:
                            antistereo_score += 1
                            pair_score = 1

                sent_more, sent_less = '', ''
                if direction == 'stereo':
                    sent_more = data['sent1']
                    sent_less = data['sent2']
                    sent_more_score = score['sent1_score']
                    sent_less_score = score['sent2_score']
                else:
                    sent_more = data['sent2']
                    sent_less = data['sent1']
                    sent_more_score = score['sent2_score']
                    sent_less_score = score['sent1_score']

                df_score = df_score.append({'sent_more': sent_more,
                                            'sent_less': sent_less,
                                            'sent_more_score': sent_more_score,
                                            'sent_less_score': sent_less_score,
                                            'score': pair_score,
                                            'stereo_antistereo': direction,
                                            'bias_type': bias
                                          }, ignore_index=True)

        if bias_type == 'race-color':
            dataframe_dictionary_race['models'].append(model_name)
            dataframe_dictionary_race['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
            dataframe_dictionary_race['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
            dataframe_dictionary_race['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))
        
        elif bias_type == 'gender':
            dataframe_dictionary_gender['models'].append(model_name)
            dataframe_dictionary_gender['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
            dataframe_dictionary_gender['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
            dataframe_dictionary_gender['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))
        
        elif bias_type == 'socioeconomic':
            dataframe_dictionary_socioeconomic['models'].append(model_name)
            dataframe_dictionary_socioeconomic['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
            dataframe_dictionary_socioeconomic['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
            dataframe_dictionary_socioeconomic['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))
        
        elif bias_type == 'nationality':
            dataframe_dictionary_nationality['models'].append(model_name)
            dataframe_dictionary_nationality['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
            dataframe_dictionary_nationality['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
            dataframe_dictionary_nationality['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))
        
        elif bias_type == 'religion':
            dataframe_dictionary_religion['models'].append(model_name)
            dataframe_dictionary_religion['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
            dataframe_dictionary_religion['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
            dataframe_dictionary_religion['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))
        
        elif bias_type == 'age':
            dataframe_dictionary_age['models'].append(model_name)
            dataframe_dictionary_age['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
            dataframe_dictionary_age['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
            dataframe_dictionary_age['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))
        
        elif bias_type == 'sexual-orientation':
            dataframe_dictionary_sexualorientation['models'].append(model_name)
            dataframe_dictionary_sexualorientation['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
            dataframe_dictionary_sexualorientation['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
            dataframe_dictionary_sexualorientation['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))
        
        elif bias_type == 'physical-appearance':
            dataframe_dictionary_physicalappearance['models'].append(model_name)
            dataframe_dictionary_physicalappearance['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
            dataframe_dictionary_physicalappearance ['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
            dataframe_dictionary_physicalappearance  ['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))          
        
        elif bias_type == 'disability':
            dataframe_dictionary_disability['models'].append(model_name)
            dataframe_dictionary_disability['metric_scores'].append(round((stereo_score + antistereo_score) / N * 100, 2))
            dataframe_dictionary_disability['stereotype_scores'].append(round(stereo_score  / total_stereo * 100, 2))
            dataframe_dictionary_disability['antistereotype_scores'].append(round(antistereo_score  / total_antistereo * 100, 2))

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████████| 516/516 [24:58<00:00,  2.90s/it]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of

100%|████████████████████████████████████████████████████████████████████████████████| 516/516 [27:57<00:00,  3.25s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 516/516 [24:47<00:00,  2.88s/it]
Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████| 516/516 [1:24:41<00:00,  9.85s/it]

KeyboardInterrupt: 

The model has been loading for 14 hours and is less than 10% through the models.

There must be an issue in the code, let's terminate and view the dataframes.

In [None]:
print(dataframe_dictionary_race)
print(dataframe_dictionary_gender)
print(dataframe_dictionary_socioeconomic)
print(dataframe_dictionary_nationality)
print(dataframe_dictionary_religion)
print(dataframe_dictionary_age)
print(dataframe_dictionary_sexualorientation)
print(dataframe_dictionary_physicalappearance)
print(dataframe_dictionary_disability)

In [60]:
print(dataframe_dictionary_race)

{'models': ['bert-base-cased', 'bert-base-uncased', 'bert-large-uncased', 'bert-large-cased', 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'allenai/scibert_scivocab_uncased', 'emilyalsentzer/Bio_ClinicalBERT', 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract', 'ProsusAI/finbert', 'nlpaueb/legal-bert-base-uncased', 'GroNLP/hateBERT', 'anferico/bert-for-patents', 'jackaduma/SecBERT', 'albert-base-v1', 'albert-base-v2', 'roberta-base', 'distilroberta-base'], 'metric_scores': [48.84, 57.95, 60.08, 56.98, 53.1, 54.46, 58.91, 54.84, 55.81, 53.49, 59.5, 58.53, 54.26, 54.46, 58.53, 51.16, 47.29, 51.94], 'stereotype_scores': [48.84, 58.14, 60.47, 58.35, 53.7, 54.97, 59.62, 54.97, 56.03, 53.7, 60.68, 59.2, 55.81, 54.97, 59.41, 50.32, 47.78, 52.85], 'antistereotype_scores': [48.84, 55.81, 55.81, 41.86, 46.51, 48.84, 51.16, 53.49, 53.49, 51.16, 46.51, 51.16, 37.21, 48.84, 48.84, 60.47, 41.86, 41.86]}


In [50]:
pd.DataFrame(dataframe_dictionary_race)

Unnamed: 0,models,metric_scores,stereotype_scores,antistereotype_scores
0,bert-base-cased,48.84,48.84,48.84
1,bert-base-uncased,57.95,58.14,55.81
2,bert-large-uncased,60.08,60.47,55.81
3,bert-large-cased,56.98,58.35,41.86
4,bert-base-multilingual-uncased,53.1,53.7,46.51
5,bert-base-multilingual-cased,54.46,54.97,48.84
6,allenai/scibert_scivocab_uncased,58.91,59.62,51.16
7,emilyalsentzer/Bio_ClinicalBERT,54.84,54.97,53.49
8,microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...,55.81,56.03,53.49
9,ProsusAI/finbert,53.49,53.7,51.16


In [52]:
pd.DataFrame(dataframe_dictionary_gender)

Unnamed: 0,models,metric_scores,stereotype_scores,antistereotype_scores
0,bert-base-cased,48.84,48.84,48.84
1,bert-base-uncased,57.95,58.14,55.81
2,bert-large-uncased,60.08,60.47,55.81
3,bert-large-cased,56.98,58.35,41.86
4,bert-base-multilingual-uncased,53.1,53.7,46.51
5,bert-base-multilingual-cased,54.46,54.97,48.84
6,allenai/scibert_scivocab_uncased,58.91,59.62,51.16
7,emilyalsentzer/Bio_ClinicalBERT,54.84,54.97,53.49
8,microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...,55.81,56.03,53.49
9,ProsusAI/finbert,53.49,53.7,51.16


In [53]:
pd.DataFrame(dataframe_dictionary_socioeconomic)

Unnamed: 0,models,metric_scores,stereotype_scores,antistereotype_scores
0,bert-base-cased,48.84,48.84,48.84
1,bert-base-uncased,57.95,58.14,55.81
2,bert-large-uncased,60.08,60.47,55.81
3,bert-large-cased,56.98,58.35,41.86
4,bert-base-multilingual-uncased,53.1,53.7,46.51
5,bert-base-multilingual-cased,54.46,54.97,48.84
6,allenai/scibert_scivocab_uncased,58.91,59.62,51.16
7,emilyalsentzer/Bio_ClinicalBERT,54.84,54.97,53.49
8,microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...,55.81,56.03,53.49
9,ProsusAI/finbert,53.49,53.7,51.16


In [54]:
pd.DataFrame(dataframe_dictionary_nationality)

Unnamed: 0,models,metric_scores,stereotype_scores,antistereotype_scores
0,bert-base-cased,48.84,48.84,48.84
1,bert-base-uncased,57.95,58.14,55.81
2,bert-large-uncased,60.08,60.47,55.81
3,bert-large-cased,56.98,58.35,41.86
4,bert-base-multilingual-uncased,53.1,53.7,46.51
5,bert-base-multilingual-cased,54.46,54.97,48.84
6,allenai/scibert_scivocab_uncased,58.91,59.62,51.16
7,emilyalsentzer/Bio_ClinicalBERT,54.84,54.97,53.49
8,microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...,55.81,56.03,53.49
9,ProsusAI/finbert,53.49,53.7,51.16


In [55]:
pd.DataFrame(dataframe_dictionary_religion)

Unnamed: 0,models,metric_scores,stereotype_scores,antistereotype_scores
0,bert-base-cased,48.84,48.84,48.84
1,bert-base-uncased,57.95,58.14,55.81
2,bert-large-uncased,60.08,60.47,55.81
3,bert-large-cased,56.98,58.35,41.86
4,bert-base-multilingual-uncased,53.1,53.7,46.51
5,bert-base-multilingual-cased,54.46,54.97,48.84
6,allenai/scibert_scivocab_uncased,58.91,59.62,51.16
7,emilyalsentzer/Bio_ClinicalBERT,54.84,54.97,53.49
8,microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...,55.81,56.03,53.49
9,ProsusAI/finbert,53.49,53.7,51.16


In [56]:
pd.DataFrame(dataframe_dictionary_age)

Unnamed: 0,models,metric_scores,stereotype_scores,antistereotype_scores
0,bert-base-cased,48.84,48.84,48.84
1,bert-base-uncased,57.95,58.14,55.81
2,bert-large-uncased,60.08,60.47,55.81
3,bert-large-cased,56.98,58.35,41.86
4,bert-base-multilingual-uncased,53.1,53.7,46.51
5,bert-base-multilingual-cased,54.46,54.97,48.84
6,allenai/scibert_scivocab_uncased,58.91,59.62,51.16
7,emilyalsentzer/Bio_ClinicalBERT,54.84,54.97,53.49
8,microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...,55.81,56.03,53.49
9,ProsusAI/finbert,53.49,53.7,51.16


In [57]:
pd.DataFrame(dataframe_dictionary_sexualorientation)

Unnamed: 0,models,metric_scores,stereotype_scores,antistereotype_scores
0,bert-base-cased,48.84,48.84,48.84
1,bert-base-uncased,57.95,58.14,55.81
2,bert-large-uncased,60.08,60.47,55.81
3,bert-large-cased,56.98,58.35,41.86
4,bert-base-multilingual-uncased,53.1,53.7,46.51
5,bert-base-multilingual-cased,54.46,54.97,48.84
6,allenai/scibert_scivocab_uncased,58.91,59.62,51.16
7,emilyalsentzer/Bio_ClinicalBERT,54.84,54.97,53.49
8,microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...,55.81,56.03,53.49
9,ProsusAI/finbert,53.49,53.7,51.16


In [58]:
pd.DataFrame(dataframe_dictionary_physicalappearance)

Unnamed: 0,models,metric_scores,stereotype_scores,antistereotype_scores
0,bert-base-cased,48.84,48.84,48.84
1,bert-base-uncased,57.95,58.14,55.81
2,bert-large-uncased,60.08,60.47,55.81
3,bert-large-cased,56.98,58.35,41.86
4,bert-base-multilingual-uncased,53.1,53.7,46.51
5,bert-base-multilingual-cased,54.46,54.97,48.84
6,allenai/scibert_scivocab_uncased,58.91,59.62,51.16
7,emilyalsentzer/Bio_ClinicalBERT,54.84,54.97,53.49
8,microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...,55.81,56.03,53.49
9,ProsusAI/finbert,53.49,53.7,51.16


In [51]:
pd.DataFrame(dataframe_dictionary_disability)

Unnamed: 0,models,metric_scores,stereotype_scores,antistereotype_scores
0,bert-base-cased,48.84,48.84,48.84
1,bert-base-uncased,57.95,58.14,55.81
2,bert-large-uncased,60.08,60.47,55.81
3,bert-large-cased,56.98,58.35,41.86
4,bert-base-multilingual-uncased,53.1,53.7,46.51
5,bert-base-multilingual-cased,54.46,54.97,48.84
6,allenai/scibert_scivocab_uncased,58.91,59.62,51.16
7,emilyalsentzer/Bio_ClinicalBERT,54.84,54.97,53.49
8,microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...,55.81,56.03,53.49
9,ProsusAI/finbert,53.49,53.7,51.16


It appears that the model was almost finished running (18 models out of 23 models completed) within 14 hours.

This would suggest that the model would finish running in a total of 18 hours.