# Requirements

In [None]:
%pip install transformers
%pip install torch
%pip install pickle5
%pip install mpld3
%pip install scikit-learn
%pip install pattern3
%pip install nltk

In [39]:
import pickle5 as pickle

def dump_pklfile(file, filepath, size):
	"""Pickle ``file`` (or a slice of it) to ``filepath``.

	Args:
		file: object to serialise; must support slicing when ``size`` is non-zero.
		filepath: destination path, opened in binary write mode.
		size: 0 writes the whole object, a positive value writes the first
			``size`` items, a negative value writes the last ``abs(size)`` items.
	"""
	with open(filepath, "wb") as f:
		# Pick the payload first, then serialise it in a single place.
		if size == 0:
			payload = file
		elif size > 0:
			payload = file[:size]
		else:
			payload = file[size:]
		pickle.dump(payload, f)
    
def open_pklfile(filepath, size):
	"""Load a pickled object from ``filepath``, optionally truncated.

	Args:
		filepath: path of the pickle file, opened in binary read mode.
		size: 0 returns the whole object; any other value returns
			``obj[0:size]`` (so a negative value drops the last ``abs(size)``
			items, which is not the mirror image of ``dump_pklfile``).

	Note: unpickling executes code embedded in the file, so only load
	pickles that come from a trusted source.
	"""
	with open(filepath, "rb") as f:
		loaded = pickle.load(f)
		return loaded if size == 0 else loaded[0:size]

In [40]:
# Sentence frames for a singular noun. The `{article}` and `{term}` placeholders
# are substituted by fill_template; frames without `{article}` simply ignore it.
SINGULAR_NOUN_TEMPLATES = (
    'This is {article} {term}.',
    'That is {article} {term}.',
    'There is {article} {term}.',
    'Here is {article} {term}.',
    'The {term} is here.',
    'The {term} is there.',
)

# Sentence frames for a plural noun; none of them uses an article.
PLURAL_NOUN_TEMPLATES = (
    'These are {term}.',
    'Those are {term}.',
    'They are {term}.',
    'The {term} are here.',
    'The {term} are there.',
)

def fill_template(template, term):
    """Substitute ``term`` into ``template`` and capitalise the first character.

    The ``{article}`` placeholder becomes "an" when ``term`` starts with
    "honor" or with an ASCII vowel (either case), except for terms starting
    with "European" or "Ukrainian"; in every other case it becomes "a".

    Args:
        template: format string that may contain ``{article}`` and ``{term}``.
        term: the word to insert.

    Returns:
        The filled-in sentence with its first character upper-cased.
    """
    an_prefixes = ('honor',) + tuple('aeiouAEIOU')
    a_exceptions = ('European', 'Ukrainian')

    if term.startswith(an_prefixes) and not term.startswith(a_exceptions):
        article = 'an'
    else:
        article = 'a'

    filled = template.format(article=article, term=term)
    return filled[0].upper() + filled[1:]



In [41]:
import nltk
from tqdm import tqdm

from pattern.en import pluralize, singularize

def generate_noun_sentences(vocab):
    """Build template sentences for every noun found in ``vocab``.

    Each vocabulary entry is POS-tagged on its own with ``nltk.pos_tag``;
    entries whose tag starts with "N" are kept. For every kept word the
    singularised form is inserted into the singular templates and the
    pluralised form into the plural templates.

    Args:
        vocab: iterable of word strings.

    Returns:
        A tuple ``(w2i, nouns, sentence_list)`` where ``nouns`` is the list of
        kept words, ``w2i`` maps each noun to its position in ``nouns`` (the
        last position wins if a word repeats) and ``sentence_list[i]`` is the
        list of sentences generated for ``nouns[i]``.
    """
    # Tag every word in isolation, one single-element list per call.
    tagged = []
    for entry in vocab:
        tagged.extend(nltk.pos_tag([entry]))

    nouns = [word for word, tag in tqdm(tagged) if tag.startswith("N")]
    w2i = dict(zip(nouns, range(len(nouns))))

    sentence_list = []
    for term in tqdm(nouns):
        singular_term = singularize(term)
        sentences = [
            fill_template(template, singular_term)
            for template in SINGULAR_NOUN_TEMPLATES
        ]
        plural_term = pluralize(term)
        sentences.extend(
            fill_template(template, plural_term)
            for template in PLURAL_NOUN_TEMPLATES
        )
        sentence_list.append(sentences)

    return w2i, nouns, sentence_list

# Load the pretrained BERT model and tokenizer

In [42]:
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

import matplotlib.pyplot as plt
%matplotlib inline

# Root directory under which every extracted vocabulary / embedding pickle is written.
prefix = "../data/extracted"

# Alternative debiased checkpoints. Only one configuration may be active at a
# time: to switch models, uncomment one of the two blocks below and comment out
# the "cds" block that follows them.

# debiased_model = "sent_debiased"
# debiased_folder = "2. sent_debiased/sent_debiased words, sent_debiased embeddings"
# model_path = "../debiased_models/sent_debias/debias-BERT/experiments/acl2020-results/QNLI/debiased_final_final"
# model = BertModel.from_pretrained(model_path, output_hidden_states = True)

# debiased_model = "contextualised"
# debiased_folder = "3. contextualised/contextualised words, contextualised embeddings"
# model_path = "../debiased_models/contextualised-embeddings-bert"
# model = BertModel.from_pretrained(model_path, output_hidden_states = True)

# Active configuration: the "cds" checkpoint.
debiased_model = "cds"
debiased_folder = "4. cds/cds words, cds embeddings"
model_path = "../debiased_models/cds.pt"
# Instantiate the stock bert-base-uncased masked-LM architecture, then overwrite
# its weights with the state dict stored at model_path. output_hidden_states
# must stay True because extract_bert_embeddings reads `.hidden_states` from
# the model output.
model = BertForMaskedLM.from_pretrained('bert-base-uncased',
                                        output_attentions = False,
                                        output_hidden_states = True)
# map_location loads the checkpoint tensors onto the CPU; the model is moved to
# `device` further down.
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

# Tag embedded in the output file names; "sentence" is the active choice.
# word_or_sent = "word"
word_or_sent = "sentence"

# Common path prefix of every pickle written by the cells below.
save_path = f"{prefix}/{debiased_folder}/{debiased_model}_{word_or_sent}_"

# Use the GPU when one is available, and switch the model to evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()
# The tokenizer always comes from the stock bert-base-uncased vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Define the BERT sentence-embedding extraction function

In [43]:
import pickle5 as pickle
from tqdm import tqdm 

def extract_bert_embeddings(sentence_list):
	"""Embed every group of sentences as a single unit-length vector.

	For each sentence the hidden states of the last four layers are
	concatenated per token and averaged over all token positions. The
	sentence vectors of one group are then summed and the sum is
	L2-normalised, giving one row per group.

	Relies on the notebook-level ``model``, ``tokenizer`` and ``device``.

	Args:
		sentence_list: iterable of groups, each group being an iterable of
			sentence strings.

	Returns:
		A tensor on ``device`` with one row per group, in the order of
		``sentence_list``. An empty ``sentence_list`` yields an empty 1-D
		tensor.

	Raises:
		ValueError: if a group contains no sentences (there is nothing to
			sum, and the row would otherwise silently go missing).
	"""
	# Evaluation mode does not change between sentences, so set it once.
	model.eval()

	# Rows are collected in a list and stacked once at the end; growing a
	# tensor with torch.cat inside the loop re-copies every previous row on
	# each iteration.
	group_rows = []

	for sentences in tqdm(sentence_list):
		sentence_vectors = []
		for sentence in sentences:
			marked_text = "[CLS] " + sentence + " [SEP]"
			tokenized_text = tokenizer.tokenize(marked_text)

			# handling such as "wedding_dress"
			tokenized_text = [token for token in tokenized_text if token != '_']

			# Map the token strings to their vocabulary indices.
			indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
			tokens_tensor = torch.tensor([indexed_tokens], device=device)

			# The second positional argument of the transformers BERT
			# forward() is attention_mask, not token_type_ids, so the all-ones
			# tensor that used to be called "segment ids" has always acted as
			# an attention mask. Pass it by keyword under its real name;
			# token_type_ids keeps its default.
			attention_mask = torch.ones_like(tokens_tensor)

			with torch.no_grad():
				hidden_states = model(tokens_tensor, attention_mask=attention_mask).hidden_states

				# Concatenate the last four layers along the feature axis and
				# drop only the batch axis (size 1, since one sentence is fed
				# at a time).
				token_vectors = torch.cat(hidden_states[-4:], dim=2).squeeze(0)

				# Average over the token positions.
				sentence_vectors.append(torch.mean(token_vectors, dim=0))

		if not sentence_vectors:
			raise ValueError("extract_bert_embeddings received an empty group of sentences")

		# Sum the sentence vectors of the group, then scale to unit length.
		summed = torch.stack(sentence_vectors, dim=1).sum(dim=1)
		group_rows.append(torch.nn.functional.normalize(summed, dim=0))

	if not group_rows:
		return torch.empty(0, device=device)
	return torch.stack(group_rows, dim=0)


# Functions for computing BERT gender bias

In [44]:

import numpy as np
from torch import linalg as LA
import scipy.stats
import json 
import codecs

# normalize vectors
def normalize(wv):
    """Divide every row of ``wv`` by its own norm.

    Args:
        wv: 2-D tensor whose rows are the vectors to rescale.

    Returns:
        A tensor of the same shape in which each row has been divided by the
        norm computed along dimension 1. A row whose norm is zero is divided
        by zero and is not handled specially.
    """
    # keepdim leaves a trailing axis of size 1 so the division broadcasts per row.
    row_norms = LA.norm(wv, dim=1, keepdim=True)
    return wv / row_norms

# compute bias from bert with he-she
def compute_bias_by_projection_sentence(vocab, lim_wv, gender_word_embedding):
    """Score every word by the difference of its male and female projections.

    Args:
        vocab: iterable of words, aligned with the rows of ``lim_wv``.
        lim_wv: tensor with one embedding row per word.
        gender_word_embedding: indexable whose item 0 is the male vector and
            item 1 the female vector.

    Returns:
        Dict mapping each word to ``(row . male) - (row . female)``. If a word
        appears more than once, the value of its last occurrence is kept.
    """
    male_vector = gender_word_embedding[0]
    print(male_vector.shape)

    male_scores = torch.tensordot(lim_wv, male_vector, dims=1)
    female_scores = torch.tensordot(lim_wv, gender_word_embedding[1], dims=1)

    return {
        word: male_score - female_score
        for word, male_score, female_score in tqdm(zip(vocab, male_scores, female_scores))
    }

def extract_professions():
    """Read the profession names from ``../data/lists/professions.json``.

    The JSON document is iterated item by item; the first element of each
    item is taken as the profession name and stripped of surrounding
    whitespace.

    Returns:
        List of profession names in file order.
    """
    with codecs.open('../data/lists/professions.json', 'r', 'utf-8') as f:
        records = json.load(f)
    return [record[0].strip() for record in records]

## Extracting Embeddings: 2016 words + bert + sentence

In [45]:
# Load the full restricted 2016 vocabulary (size 0 means "no truncation").
vocab = open_pklfile("../data/extracted/0. original/original_word_2016_restricted_vocab.pkl", 0)
# Keep the words whose POS tag starts with "N" and build their template sentences.
w2i, nouns, sentence_list = generate_noun_sentences(vocab)
# One embedding row per noun, in the same order as `nouns`.
lim_wv = extract_bert_embeddings(sentence_list)
# Persist the noun list and the embeddings exactly as returned by the extractor,
# i.e. before the normalize() call below.
dump_pklfile(nouns, f"{save_path}2016_restricted_vocab.pkl", 0)
dump_pklfile(lim_wv, f"{save_path}2016_restricted_embeddings.pkl", 0)
# Row-normalised copy used by the projection cells that follow.
lim_wv = normalize(lim_wv)

100%|██████████| 26189/26189 [00:00<00:00, 2567183.03it/s]
100%|██████████| 18447/18447 [00:01<00:00, 10669.44it/s]
100%|██████████| 18447/18447 [22:40<00:00, 13.56it/s]


In [46]:
# Read one sentence per line from each gendered sentence file, trimming
# surrounding whitespace.
with open("../data/lists/male_sentence_file.txt", 'r') as f:
    male_sentences = [line.strip() for line in f]

with open("../data/lists/female_sentence_file.txt", 'r') as f:
    female_sentences = [line.strip() for line in f]

# Each file is passed as a single group, so each call returns a one-row tensor
# that is then row-normalised.
male_embeddings = normalize(extract_bert_embeddings([male_sentences]))
female_embeddings = normalize(extract_bert_embeddings([female_sentences]))

# Row 0 is the male vector, row 1 the female vector.
gender_word_embedding = torch.cat([male_embeddings, female_embeddings], dim=0)

gender_bias_all = compute_bias_by_projection_sentence(
    nouns,
    lim_wv,
    gender_word_embedding,
)

100%|██████████| 1/1 [00:00<00:00,  1.77it/s]
100%|██████████| 1/1 [00:00<00:00,  1.86it/s]


torch.Size([3072])


18447it [00:00, 159548.99it/s]


In [47]:
import operator

# Number of most strongly biased nouns kept for each gender.
TOP_K = 5000

# gender_bias_all maps noun -> (male projection - female projection), so an
# ascending sort puts the most female-leaning nouns first.
sorted_g = sorted(gender_bias_all.items(), key=operator.itemgetter(1))
females_w = [item[0] for item in sorted_g[:TOP_K]]
# Gather all selected rows with one indexing operation; appending row by row
# with torch.cat re-copies the growing tensor on every iteration.
females_idx = torch.tensor([w2i[w] for w in females_w], dtype=torch.long, device=lim_wv.device)
females_e = lim_wv[females_idx]

dump_pklfile(females_w, f"{save_path}2016_female_{TOP_K}_vocab.pkl", TOP_K)
dump_pklfile(females_e, f"{save_path}2016_female_{TOP_K}_embeddings.pkl", TOP_K)

# Descending sort: the most male-leaning nouns come first.
sorted_g = sorted(gender_bias_all.items(), key=operator.itemgetter(1), reverse=True)
males_w = [item[0] for item in sorted_g[:TOP_K]]
males_idx = torch.tensor([w2i[w] for w in males_w], dtype=torch.long, device=lim_wv.device)
males_e = lim_wv[males_idx]

dump_pklfile(males_w, f"{save_path}2016_male_{TOP_K}_vocab.pkl", TOP_K)
dump_pklfile(males_e, f"{save_path}2016_male_{TOP_K}_embeddings.pkl", TOP_K)

print(save_path)

../data/extracted/4. cds/cds words, cds embeddings/cds_sentence_


## Extracting Embeddings: 2018 words + bert + sentence

In [48]:
# Load the full restricted 2018 vocabulary (size 0 means "no truncation").
vocab = open_pklfile("../data/extracted/0. original/original_word_2018_restricted_vocab.pkl", 0)
# Keep the words whose POS tag starts with "N" and build their template sentences.
# Note that this rebinds w2i, nouns and sentence_list from the 2016 run.
w2i, nouns, sentence_list = generate_noun_sentences(vocab)
# One embedding row per noun, in the same order as `nouns`.
lim_wv = extract_bert_embeddings(sentence_list)
# Persist the noun list and the embeddings exactly as returned by the extractor,
# i.e. before the normalize() call below.
dump_pklfile(nouns, f"{save_path}2018_restricted_vocab.pkl", 0)
dump_pklfile(lim_wv, f"{save_path}2018_restricted_embeddings.pkl", 0)

# Row-normalised copy used by the projection cells that follow.
lim_wv = normalize(lim_wv)

100%|██████████| 47698/47698 [00:00<00:00, 2649203.65it/s]
100%|██████████| 39385/39385 [00:03<00:00, 10542.40it/s]
100%|██████████| 39385/39385 [49:11<00:00, 13.35it/s]


In [49]:
# Read one sentence per line from each gendered sentence file, trimming
# surrounding whitespace.
with open("../data/lists/male_sentence_file.txt", 'r') as f:
  male_sentences = [line.strip() for line in f]

with open("../data/lists/female_sentence_file.txt", 'r') as f:
  female_sentences = [line.strip() for line in f]

# Each file is passed as a single group, so each call returns a one-row tensor
# that is then row-normalised.
male_embeddings = normalize(extract_bert_embeddings([male_sentences]))
female_embeddings = normalize(extract_bert_embeddings([female_sentences]))

# Row 0 is the male vector, row 1 the female vector.
gender_word_embedding = torch.cat([male_embeddings, female_embeddings], dim=0)

gender_bias_all = compute_bias_by_projection_sentence(
  nouns,
  lim_wv,
  gender_word_embedding,
)

100%|██████████| 1/1 [00:00<00:00,  1.77it/s]
100%|██████████| 1/1 [00:00<00:00,  1.87it/s]


torch.Size([3072])


39385it [00:00, 167105.52it/s]


In [50]:
import operator

# Number of most strongly biased nouns kept for each gender.
TOP_K = 5000

# gender_bias_all maps noun -> (male projection - female projection), so an
# ascending sort puts the most female-leaning nouns first.
sorted_g = sorted(gender_bias_all.items(), key=operator.itemgetter(1))
females_w = [item[0] for item in sorted_g[:TOP_K]]
# Gather all selected rows with one indexing operation; appending row by row
# with torch.cat re-copies the growing tensor on every iteration.
females_idx = torch.tensor([w2i[w] for w in females_w], dtype=torch.long, device=lim_wv.device)
females_e = lim_wv[females_idx]

dump_pklfile(females_w, f"{save_path}2018_female_{TOP_K}_vocab.pkl", TOP_K)
dump_pklfile(females_e, f"{save_path}2018_female_{TOP_K}_embeddings.pkl", TOP_K)

# Descending sort: the most male-leaning nouns come first.
sorted_g = sorted(gender_bias_all.items(), key=operator.itemgetter(1), reverse=True)
males_w = [item[0] for item in sorted_g[:TOP_K]]
males_idx = torch.tensor([w2i[w] for w in males_w], dtype=torch.long, device=lim_wv.device)
males_e = lim_wv[males_idx]

dump_pklfile(males_w, f"{save_path}2018_male_{TOP_K}_vocab.pkl", TOP_K)
dump_pklfile(males_e, f"{save_path}2018_male_{TOP_K}_embeddings.pkl", TOP_K)

### extract word file embeddings

In [51]:
# Embed the whole male sentence file as one group and store the resulting
# tensor as returned by extract_bert_embeddings (normalize() is not applied here).
with open("../data/lists/male_sentence_file.txt", 'r') as f:
  male_sentences = [sentence.strip() for sentence in f.readlines()]
male_sentence_embs = extract_bert_embeddings([male_sentences])
dump_pklfile(male_sentence_embs, f"{save_path}male_word_file_embeddings.pkl", 0)

# Same procedure for the female sentence file.
with open("../data/lists/female_sentence_file.txt", 'r') as f:
  female_sentences = [sentence.strip() for sentence in f.readlines()]
female_sentence_embs = extract_bert_embeddings([female_sentences])
dump_pklfile(female_sentence_embs, f"{save_path}female_word_file_embeddings.pkl", 0)

100%|██████████| 1/1 [00:00<00:00,  1.75it/s]
100%|██████████| 1/1 [00:00<00:00,  1.89it/s]
