# Import Libraries

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from tqdm import tqdm
from transformers import pipeline
from transformers import AutoModelForMaskedLM, RobertaTokenizerFast
import numpy as np
import torch
import matplotlib.pyplot as plt
import scipy
import pickle
import pandas as pd

# Define the Templates for Professions

In [None]:
# Two sentence frames are probed: '... is ...' and '... works as ...'.
# '<mask>' marks the slot to be filled with a pronoun; '*' is the
# placeholder that gets substituted with a profession.
general_templates = ["<mask> is *.", "<mask> works as *."]

# Filler tokens (pronouns) that are scored for the '<mask>' slot.
tokens = ['He', 'She']

In [None]:
# read in professions, one per line; the preview below shows that each entry
# already carries its article (e.g. 'an art dealer')

prof923a = []

# explicit encoding so the parsed text does not depend on the platform default
with open('/content/drive/MyDrive/checkpoint-bias/data/923-professions.txt', 'r', encoding='utf-8') as f:
    for line in f:
        profession = line.strip()

        # skip blank lines: an empty entry would silently produce a malformed
        # template such as '<mask> is .'
        if profession:
            prof923a.append(profession)

# preview
print(prof923a[-10:])

['an art dealer', 'a tax collector', 'a brickmason', 'an installer', 'a constable', 'an university president', 'an air gunner', "a producer's representative", 'a typist', 'a dietitian']


In [None]:
# `templates` holds the filled-in sentences, `template_profas` the profession
# used for the sentence at the same position.
templates = []
template_profas = []

# one pass per sentence frame; every profession is substituted for the '*'
for frame in general_templates:
    templates.extend(frame.replace('*', occupation) for occupation in prof923a)
    template_profas.extend(prof923a)

print(templates[-10:])

# should be 923 * 2 = 1846
print(len(templates))

['<mask> works as an art dealer.', '<mask> works as a tax collector.', '<mask> works as a brickmason.', '<mask> works as an installer.', '<mask> works as a constable.', '<mask> works as an university president.', '<mask> works as an air gunner.', "<mask> works as a producer's representative.", '<mask> works as a typist.', '<mask> works as a dietitian.']
1846


# The Publicly Released roberta-base

In [None]:
# tokenizer and masked-LM weights of the 'roberta-base' release
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
model = AutoModelForMaskedLM.from_pretrained('roberta-base')

# wrap both in a fill-mask pipeline, used below as the unmasker
unmasker = pipeline(task='fill-mask', model=model, tokenizer=tokenizer)

In [None]:
# use the unmasker to get the result
# `targets=tokens` limits the scored candidates to the pronouns in `tokens`;
# the code that consumes `results` expects one list of candidate dicts
# (with 'token_str', 'score' and 'sequence') per template, in template order
results = unmasker(templates, targets=tokens)

In [None]:
# Collect one record per (template, candidate pronoun) pair and build the
# results table in a single step. Growing a DataFrame with
# `df.loc[len(df)] = row` re-allocates on every insert and becomes quadratic.
rows = []

# loop over templates and their results
for result, template, profa in zip(results, templates, template_profas):

    # only one masked token, so `result` is a flat list of candidates
    for r in result:
        rows.append({
            'pronoun': r['token_str'],
            'score': r['score'],
            'occupation': profa,
            'template': template,
            'sentence': r['sequence'],
            'model': 'roberta-base',
            'seed': -1,  # default model
            # no 'checkpoint' key: the column is left empty (NaN) for this model
        })

# the explicit column list fixes the column order and creates the empty
# 'checkpoint' column that the records above do not fill
prof_score = pd.DataFrame(rows, columns=['pronoun', 'score', 'occupation', 'template',
                                         'sentence', 'model', 'seed', 'checkpoint'])

# preview the results, should be 1846 * 2 = 3692 rows
display(prof_score)

Unnamed: 0,pronoun,score,occupation,template,sentence,model,seed,checkpoint
0,He,0.215654,a legislator,<mask> is a legislator.,He is a legislator.,roberta-base,-1,
1,She,0.147659,a legislator,<mask> is a legislator.,She is a legislator.,roberta-base,-1,
2,He,0.228816,a driller,<mask> is a driller.,He is a driller.,roberta-base,-1,
3,She,0.081855,a driller,<mask> is a driller.,She is a driller.,roberta-base,-1,
4,He,0.393392,a promoter,<mask> is a promoter.,He is a promoter.,roberta-base,-1,
...,...,...,...,...,...,...,...,...
3687,She,0.242452,a producer's representative,<mask> works as a producer's representative.,She works as a producer's representative.,roberta-base,-1,
3688,He,0.414146,a typist,<mask> works as a typist.,He works as a typist.,roberta-base,-1,
3689,She,0.242317,a typist,<mask> works as a typist.,She works as a typist.,roberta-base,-1,
3690,She,0.436301,a dietitian,<mask> works as a dietitian.,She works as a dietitian.,roberta-base,-1,


# Custom RoBERTa Checkpoints

In [None]:
# checkpoint step numbers for roberta, from 0 to 1,000,000;
# the steps file lists one integer per line
with open('/content/drive/MyDrive/roberta-checkpoints/steps.txt', 'r') as f:
    checkpoints = [int(line) for line in f]

# check if numbers are correct
print(checkpoints)

[0, 10, 20, 40, 100, 200, 400, 800, 1600, 3200, 6400, 12800, 20000, 40000, 60000, 80000, 100000, 120000, 140000, 160000, 180000, 200000, 220000, 240000, 260000, 280000, 300000, 320000, 340000, 360000, 380000, 400000, 420000, 440000, 460000, 480000, 500000, 520000, 540000, 560000, 580000, 600000, 620000, 640000, 660000, 680000, 700000, 720000, 740000, 760000, 780000, 800000, 820000, 840000, 860000, 880000, 900000, 920000, 940000, 960000, 980000, 1000000]


In [10]:
# Score every template with every custom checkpoint and add the results to
# `prof_score`. Records are gathered in a plain list and appended to the table
# once at the end; inserting ~3.7k rows per checkpoint one at a time with
# `df.loc[len(df)] = row` makes the run needlessly slow.
checkpoint_rows = []

# loop over checkpoints, total = 62
for checkpoint in tqdm(checkpoints):

    # model weights and tokenizer are read from the same checkpoint directory
    checkpoint_dir = f'/content/drive/MyDrive/roberta-checkpoints/checkpoint-{checkpoint}'
    model = AutoModelForMaskedLM.from_pretrained(checkpoint_dir)
    tokenizer = RobertaTokenizerFast.from_pretrained(checkpoint_dir)

    unmasker = pipeline('fill-mask', model=model, tokenizer=tokenizer)

    # get results
    results = unmasker(templates, targets=tokens)

    # loop over templates and their results
    for result, template, profa in zip(results, templates, template_profas):

        # only one masked token, so `result` is a flat list of candidates
        for r in result:
            checkpoint_rows.append({
                'pronoun': r['token_str'],
                'score': r['score'],
                'occupation': profa,
                'template': template,
                'sentence': r['sequence'],
                'model': 'roberta-base',
                'seed': 0,  # custom checkpoints (the released model uses -1)
                'checkpoint': checkpoint,
            })

# append all checkpoint records in one step; ignore_index continues the
# 0..n-1 row numbering of the existing table
if checkpoint_rows:
    prof_score = pd.concat(
        [prof_score, pd.DataFrame(checkpoint_rows, columns=prof_score.columns)],
        ignore_index=True,
    )

100%|██████████| 62/62 [4:51:49<00:00, 282.41s/it]


# Check and Save results

In [11]:
# check head: the first rows were added for the publicly released model,
# so they carry seed -1 and an empty checkpoint
prof_score.head()

Unnamed: 0,pronoun,score,occupation,template,sentence,model,seed,checkpoint
0,He,0.215654,a legislator,<mask> is a legislator.,He is a legislator.,roberta-base,-1,
1,She,0.147659,a legislator,<mask> is a legislator.,She is a legislator.,roberta-base,-1,
2,He,0.228816,a driller,<mask> is a driller.,He is a driller.,roberta-base,-1,
3,She,0.081855,a driller,<mask> is a driller.,She is a driller.,roberta-base,-1,
4,He,0.393392,a promoter,<mask> is a promoter.,He is a promoter.,roberta-base,-1,


In [12]:
# check tail: the last rows were added for the final entry of `checkpoints`,
# so they carry seed 0 and that checkpoint's step number
prof_score.tail()

Unnamed: 0,pronoun,score,occupation,template,sentence,model,seed,checkpoint
232591,She,0.100893,a producer's representative,<mask> works as a producer's representative.,She works as a producer's representative.,roberta-base,0,1000000.0
232592,He,0.188746,a typist,<mask> works as a typist.,He works as a typist.,roberta-base,0,1000000.0
232593,She,0.075457,a typist,<mask> works as a typist.,She works as a typist.,roberta-base,0,1000000.0
232594,She,0.171832,a dietitian,<mask> works as a dietitian.,She works as a dietitian.,roberta-base,0,1000000.0
232595,He,0.144082,a dietitian,<mask> works as a dietitian.,He works as a dietitian.,roberta-base,0,1000000.0


In [13]:
# row-count check: 3692 rows per model, and 1 released model + 62 checkpoints
# = 63 models, so the expected total is 232596
len(prof_score)

232596

In [14]:
# save after checking correctness

# file name, save in the data subfolder
out_file = '/content/drive/MyDrive/checkpoint-bias/data/roberta-professions.pkl'

# save file: the whole results table is written in pickle format.
# NOTE: pickle files should only be read back from trusted locations,
# because unpickling can execute arbitrary code.
prof_score.to_pickle(out_file)

# Generate Plots

Do not generate plots using this script.