# Import Libraries

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 6.0 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 43.5 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 12.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.24.0


In [None]:
from tqdm import tqdm
from transformers import pipeline
from transformers import AutoModelForMaskedLM, BertTokenizerFast
import numpy as np
import torch
import matplotlib.pyplot as plt
import pickle
import pandas as pd

# Define the Templates for Professions

In [None]:
# Two sentence frames are probed: '... is ...' and '... works as ...'.
# '*' is the placeholder for the profession; '[MASK]' is BERT's mask token
# (remember to use a different mask token for RoBERTa).
general_templates = ["[MASK] is *.", "[MASK] works as *."]

# Filler tokens (pronouns) scored at the masked position;
# lower case because the model is uncased.
tokens = ['he', 'she']

In [None]:
# read in professions: one profession phrase (article included) per line

prof923a = []

with open('/content/drive/MyDrive/checkpoint-bias/data/923-professions.txt', 'r') as f:
    for line in f:
        # drop the newline and any stray surrounding whitespace
        profession = line.strip()

        # skip blank lines (e.g. a trailing empty line at the end of the file);
        # an empty profession would otherwise produce a template like "[MASK] is ."
        if profession:
            prof923a.append(profession)

# preview
print(prof923a[-10:])

['an art dealer', 'a tax collector', 'a brickmason', 'an installer', 'a constable', 'an university president', 'an air gunner', "a producer's representative", 'a typist', 'a dietitian']


In [None]:
# pair every general template with every profession; templates vary slowest,
# so all '... is ...' sentences come before all '... works as ...' sentences
template_profession_pairs = [
    (general_template, profession)
    for general_template in general_templates
    for profession in prof923a
]

# filled-in sentences and, index-aligned with them, the profession used for each
templates = [
    general_template.replace('*', profession)
    for general_template, profession in template_profession_pairs
]
template_profas = [profession for _, profession in template_profession_pairs]

print(templates[-10:])

# should be 923 * 2 = 1846
print(len(templates))

['[MASK] works as an art dealer.', '[MASK] works as a tax collector.', '[MASK] works as a brickmason.', '[MASK] works as an installer.', '[MASK] works as a constable.', '[MASK] works as an university president.', '[MASK] works as an air gunner.', "[MASK] works as a producer's representative.", '[MASK] works as a typist.', '[MASK] works as a dietitian.']
1846


# The Publicly Released bert-base-uncased

In [None]:
# name of the publicly released checkpoint, shared by model and tokenizer
pretrained_name = 'bert-base-uncased'

# masked-language-model weights and the matching fast tokenizer
model = AutoModelForMaskedLM.from_pretrained(pretrained_name)
tokenizer = BertTokenizerFast.from_pretrained(pretrained_name)

# fill-mask pipeline used below to score the candidate pronouns
unmasker = pipeline(task='fill-mask', model=model, tokenizer=tokenizer)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# use the unmasker to get the result: score only the pronouns in `tokens`
# at the masked position of every template
# NOTE(review): the code that consumes `results` zips it with `templates`, so it
# relies on one entry per template in the same order — presumably each entry is a
# list of per-pronoun dicts ranked by score (pronoun order can differ per template); confirm
results = unmasker(templates, targets=tokens)

In [None]:
# columns of the results table; 'model', 'seed' and 'checkpoint' identify which
# model produced a row ('checkpoint' stays empty for the publicly released model)
score_columns = ['pronoun', 'score', 'occupation', 'template',
                 'sentence', 'model', 'seed', 'checkpoint']

# collect one plain dict per row and build the dataframe once at the end;
# growing a DataFrame one `.loc` row at a time gets slower as the table grows
released_rows = []

# loop over templates and their results (index-aligned lists)
for result, template, profa in zip(results, templates, template_profas):

    # only one masked token, so `result` holds one prediction per pronoun
    for r in result:
        released_rows.append({
            'pronoun': r['token_str'],
            'score': r['score'],
            'occupation': profa,
            'template': template,
            'sentence': r['sequence'],
            'model': 'bert-base-uncased',
            'seed': -1,  # default model
            # no checkpoint: the missing key becomes NaN in the dataframe
        })

# passing `columns` keeps the column order and creates the empty 'checkpoint' column
prof_score = pd.DataFrame(released_rows, columns=score_columns)

# preview the results, should be 1846 * 2 = 3692 rows
display(prof_score)

Unnamed: 0,pronoun,score,occupation,template,sentence,model,seed,checkpoint
0,he,0.875311,a legislator,[MASK] is a legislator.,he is a legislator.,bert-base-uncased,-1,
1,she,0.093801,a legislator,[MASK] is a legislator.,she is a legislator.,bert-base-uncased,-1,
2,he,0.754409,a driller,[MASK] is a driller.,he is a driller.,bert-base-uncased,-1,
3,she,0.037162,a driller,[MASK] is a driller.,she is a driller.,bert-base-uncased,-1,
4,he,0.431155,a promoter,[MASK] is a promoter.,he is a promoter.,bert-base-uncased,-1,
...,...,...,...,...,...,...,...,...
3687,she,0.442183,a producer's representative,[MASK] works as a producer's representative.,she works as a producer's representative.,bert-base-uncased,-1,
3688,he,0.706501,a typist,[MASK] works as a typist.,he works as a typist.,bert-base-uncased,-1,
3689,she,0.220935,a typist,[MASK] works as a typist.,she works as a typist.,bert-base-uncased,-1,
3690,he,0.816707,a dietitian,[MASK] works as a dietitian.,he works as a dietitian.,bert-base-uncased,-1,


# Custom BERT Checkpoints

***Remember to change the seed in the next cell before each run — results are saved to a separate file per seed!***

In [None]:
# set seed, in [0, 1, 2, 3, 4] for BERT
seed = 4

# the seed is edited by hand between runs, so fail fast on a typo instead of
# failing later with a missing-checkpoint-directory error
assert seed in (0, 1, 2, 3, 4), f'unexpected seed {seed!r}, expected one of 0, 1, 2, 3, 4'

# directory holding the step_<N> checkpoints of this seed
base_dir = f'/content/drive/MyDrive/bert-checkpoints/pytorch_checkpoints/seed_{seed}'

In [None]:
# save the checkpoint numbers, from 0 to 2,000,000 for bert
checkpoints = []

# read in numbers, one training step per line
with open('/content/drive/MyDrive/bert-checkpoints/steps.txt', 'r') as f:
    for line in f:
        step = line.strip()

        # skip blank lines (e.g. a trailing empty line); int('') would raise ValueError
        if step:
            checkpoints.append(int(step))

# check if numbers are correct
print(checkpoints)

[0, 20000, 40000, 60000, 80000, 100000, 120000, 140000, 160000, 180000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1100000, 1200000, 1300000, 1400000, 1500000, 1600000, 1700000, 1800000, 1900000, 2000000]


In [11]:
# the tokenizer is identical for every checkpoint, so load it once outside the loop
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# collect one plain dict per row and append them to the table in a single step;
# growing the DataFrame one `.loc` row at a time gets slower as the table grows
checkpoint_rows = []

# loop over checkpoints, total = 29
for checkpoint in tqdm(checkpoints):

    # read model and prepare unmasker
    model = AutoModelForMaskedLM.from_pretrained(f'{base_dir}/step_{checkpoint}')
    unmasker = pipeline('fill-mask', model=model, tokenizer=tokenizer)

    # get results
    results = unmasker(templates, targets=tokens)

    # loop over templates and their results (index-aligned lists)
    for result, template, profa in zip(results, templates, template_profas):

        # only one masked token, so `result` holds one prediction per pronoun
        for r in result:
            checkpoint_rows.append({
                'pronoun': r['token_str'],
                'score': r['score'],
                'occupation': profa,
                'template': template,
                'sentence': r['sequence'],
                'model': 'bert-base-uncased',
                'seed': seed,  # the model with the custom seed
                'checkpoint': checkpoint,
            })

# append all checkpoint rows below the existing rows and renumber the index 0..n-1,
# which matches the index the row-by-row append would have produced
prof_score = pd.concat(
    [prof_score, pd.DataFrame(checkpoint_rows, columns=prof_score.columns)],
    ignore_index=True,
)

  0%|          | 0/29 [00:00<?, ?it/s]Some weights of the model checkpoint at /content/drive/MyDrive/bert-checkpoints/pytorch_checkpoints/seed_4/step_0 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  3%|▎         | 1/29 [03:27<1:36:57, 207.77s/it]Some weights of the model checkpoint at /content/drive/MyDrive/bert-checkpoints/pytorch_checkpoints/seed_4/step_20000 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bi

# Check and Save Results

In [12]:
# check head: when the cells are run in order, the first rows come from the
# publicly released model (seed -1, empty checkpoint)
prof_score.head()

Unnamed: 0,pronoun,score,occupation,template,sentence,model,seed,checkpoint
0,he,0.875311,a legislator,[MASK] is a legislator.,he is a legislator.,bert-base-uncased,-1,
1,she,0.093801,a legislator,[MASK] is a legislator.,she is a legislator.,bert-base-uncased,-1,
2,he,0.754409,a driller,[MASK] is a driller.,he is a driller.,bert-base-uncased,-1,
3,she,0.037162,a driller,[MASK] is a driller.,she is a driller.,bert-base-uncased,-1,
4,he,0.431155,a promoter,[MASK] is a promoter.,he is a promoter.,bert-base-uncased,-1,


In [13]:
# check tail: when the cells are run in order, the last rows come from the
# final checkpoint in `checkpoints` for the current seed
prof_score.tail()

Unnamed: 0,pronoun,score,occupation,template,sentence,model,seed,checkpoint
110755,he,0.422669,a producer's representative,[MASK] works as a producer's representative.,he works as a producer's representative.,bert-base-uncased,4,2000000.0
110756,he,0.602213,a typist,[MASK] works as a typist.,he works as a typist.,bert-base-uncased,4,2000000.0
110757,she,0.288169,a typist,[MASK] works as a typist.,she works as a typist.,bert-base-uncased,4,2000000.0
110758,she,0.497801,a dietitian,[MASK] works as a dietitian.,she works as a dietitian.,bert-base-uncased,4,2000000.0
110759,he,0.41194,a dietitian,[MASK] works as a dietitian.,he works as a dietitian.,bert-base-uncased,4,2000000.0


In [14]:
# check length
# 3692 for each checkpoint, total 1+29=30 model checkpoints, should get 110760

# derive the expected count from the inputs instead of comparing by eye:
# one row per (template, pronoun) pair for the released model and for every checkpoint
expected_rows = len(templates) * len(tokens) * (1 + len(checkpoints))
assert len(prof_score.index) == expected_rows, (
    f'expected {expected_rows} rows but found {len(prof_score.index)}; '
    'was a cell run more than once, or was the loop interrupted?'
)

len(prof_score.index)

110760

In [15]:
# save after checking correctness (run the head / tail / length checks first)

# file name, save in the data subfolder, use different files for different seeds:
# the current `seed` is part of the file name
out_file = f'/content/drive/MyDrive/checkpoint-bias/data/bert-professions-seed-{seed}.pkl'

# save file: the whole results table is written as a pickle
# NOTE(review): presumably an existing file for this seed is overwritten — confirm before re-running
prof_score.to_pickle(out_file)

# Generate Plots

Do not generate plots using this script.