# Requirements

In [1]:
# Install the notebook's dependencies into the kernel's own environment
# (%pip rather than !pip). Restart the kernel afterwards if anything was upgraded.
# NOTE(review): no versions are pinned, so a fresh install may resolve to
# releases that differ from the ones the saved outputs were produced with.
%pip install transformers
%pip install torch
# pickle5 is imported as `pickle` by the pickle helper cell further down.
%pip install pickle5
# NOTE(review): mpld3 and scikit-learn are not imported by any cell visible
# here -- confirm they are still needed.
%pip install mpld3
%pip install scikit-learn
# NOTE(review): both `pattern3` and `pattern` are installed, but only
# `pattern.en` is imported by the sentence-generation cell -- confirm that
# `pattern3` is required.
%pip install pattern3
%pip install nltk
%pip install pattern

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Load the debiased BERT model and the tokenizer

In [6]:
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM, AdamW, get_linear_schedule_with_warmup

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
%matplotlib inline

# --- Model selection --------------------------------------------------------
# Exactly one of the three configurations below should be active at a time.
# `debiased_model` and `folder_num` are only used to build `save_path`;
# `model_path` is the checkpoint that is actually loaded.
# After switching configuration, restart the kernel and re-run every cell so
# that saved outputs and dumped files all belong to the same model.

# Active configuration: "sent_debiased" checkpoint, loaded with all hidden
# states exposed (extract_bert_embeddings reads `.hidden_states`).
debiased_model = "sent_debiased"
folder_num = "2."
model_path = "../debiased_models/sent_debias/debias-BERT/experiments/acl2020-results/QNLI/debiased_final_final"
model = BertModel.from_pretrained(model_path, output_hidden_states = True)

# Alternative configuration: "contextualised" checkpoint.
# debiased_model = "contextualised"
# folder_num = "3."
# model_path = "../debiased_models/contextualised-embeddings-bert"
# model = BertModel.from_pretrained(model_path, output_hidden_states = True)

# Alternative configuration: "cds" model, stored as a whole pickled module and
# therefore loaded with torch.load instead of from_pretrained.
# debiased_model = "cds"
# folder_num = "4."
# model_path = "../debiased_models/cds.pt"
# model = torch.load(model_path, map_location=torch.device('cpu'))

# Common filename prefix for every embeddings pickle written by this notebook,
# e.g. "../data/extracted/2. sent_debiased/sent_debiased_sentence_".
save_path = f"../data/extracted/{folder_num} {debiased_model}/{debiased_model}_sentence_"

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Evaluation mode: inference only.
model.eval()
# The tokenizer always comes from the stock 'bert-base-uncased' vocabulary,
# regardless of which debiased checkpoint is selected above.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


Some weights of the model checkpoint at ../debiased_models/contextualised-embeddings-bert were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Sentence frames for the singular form of a noun.
# `{term}` is replaced by the noun and `{article}` by "a"/"an"; both
# placeholders are filled in by fill_template().
SINGULAR_NOUN_TEMPLATES = (
    'This is {article} {term}.',
    'That is {article} {term}.',
    'There is {article} {term}.',
    'Here is {article} {term}.',
    'The {term} is here.',
    'The {term} is there.',
)

# Sentence frames for the plural form of a noun; none of them uses an article.
PLURAL_NOUN_TEMPLATES = (
    'These are {term}.',
    'Those are {term}.',
    'They are {term}.',
    'The {term} are here.',
    'The {term} are there.',
)

def fill_template(template, term):
    """Render ``template`` with ``term`` and capitalise the first character.

    The indefinite article is chosen with a small spelling heuristic: "an" for
    terms starting with a vowel letter or with "honor", except for terms
    starting with "European"/"Ukrainian" (vowel letter, consonant sound).
    All prefix checks are case-insensitive, so lower-cased vocabulary entries
    such as "european" get the same article as their capitalised spelling.

    Args:
        template: Format string that may contain ``{article}`` and ``{term}``.
        term: The word to insert; it is inserted unchanged (original casing).

    Returns:
        The rendered sentence with its first character upper-cased. An empty
        rendered sentence is returned as ``''``.
    """
    lowered = term.lower()
    # str.startswith accepts a tuple of prefixes; an empty term matches none.
    starts_like_vowel = lowered.startswith(('honor',) + tuple('aeiou'))
    is_exception = lowered.startswith(('european', 'ukrainian'))
    article = 'an' if starts_like_vowel and not is_exception else 'a'

    sentence = template.format(article=article, term=term)
    # Slicing instead of indexing keeps an empty sentence from raising.
    return sentence[:1].upper() + sentence[1:]

In [8]:
from pattern.en import pluralize, singularize
import nltk

def generate_noun_sentences(vocab):
    """Build template sentences for every vocabulary entry tagged as a noun.

    Each entry of ``vocab`` is POS-tagged on its own (a one-word input to
    ``nltk.pos_tag``); entries whose tag starts with "N" are kept. For every
    kept noun, the singular templates are filled with ``singularize(noun)``
    and the plural templates with ``pluralize(noun)``.

    ``tqdm`` is not imported in this cell; it has to be in scope by the time
    the function is called.

    NOTE(review): ``pluralize`` receives the raw vocabulary entry, not the
    singularised form -- confirm this is intended for entries that are
    already plural.

    Args:
        vocab: Iterable of word strings.

    Returns:
        A tuple ``(w2i, nouns, sentence_list)``: ``nouns`` is the list of kept
        words, ``w2i`` maps each noun to its index in ``nouns`` and
        ``sentence_list[i]`` holds the sentences generated for ``nouns[i]``
        (singular templates first, then plural templates).
    """
    # Tag word by word so that no entry is tagged in the context of another.
    tagged = []
    for entry in vocab:
        tagged.extend(nltk.pos_tag([entry]))

    nouns = []
    for word, tag in tqdm(tagged):
        if tag.startswith("N"):
            nouns.append(word)
    w2i = dict(zip(nouns, range(len(nouns))))

    sentence_list = []
    for noun in tqdm(nouns):
        singular_form = singularize(noun)
        plural_form = pluralize(noun)
        singular_sentences = [
            fill_template(frame, singular_form) for frame in SINGULAR_NOUN_TEMPLATES
        ]
        plural_sentences = [
            fill_template(frame, plural_form) for frame in PLURAL_NOUN_TEMPLATES
        ]
        sentence_list.append(singular_sentences + plural_sentences)
    return w2i, nouns, sentence_list

# Pickle file helpers and the BERT embedding extraction function

In [9]:
import pickle5 as pickle
from tqdm import tqdm 
import pickle5 as pickle

def dump_pklfile(file, filepath, size):
	"""Pickle ``file`` (or a slice of it) to ``filepath``.

	Args:
		file: Object to serialise; it must support slicing unless ``size`` is 0.
		filepath: Destination path; the file is opened in binary write mode.
		size: 0 stores the whole object, a positive value stores the first
			``size`` items (``file[:size]``) and a negative value stores the
			last ``abs(size)`` items (``file[size:]``).
	"""
	with open(filepath, "wb") as handle:
		if size == 0:
			payload = file
		elif size > 0:
			payload = file[:size]
		else:
			payload = file[size:]
		pickle.dump(payload, handle)
    
def open_pklfile(filepath, size):
	"""Load a pickled object from ``filepath``, optionally truncated.

	Only use this on trusted files: unpickling can execute arbitrary code.

	Args:
		filepath: Path of the pickle file; opened in binary read mode.
		size: 0 returns the whole object; any other value returns the slice
			``[0:size]`` of the loaded object.

	Returns:
		The unpickled object, or its ``[0:size]`` slice when ``size`` is not 0.
	"""
	with open(filepath, "rb") as handle:
		loaded = pickle.load(handle)
	return loaded if size == 0 else loaded[0:size]

def extract_bert_embeddings(sentence_list):
	"""Embed every group of sentences as one L2-normalised vector.

	For each sentence, "[CLS] <sentence> [SEP]" is tokenised, bare '_' tokens
	are dropped, the model is run without gradients, and the last four hidden
	states are concatenated along the feature axis and averaged over the
	tokens. The sentence vectors of a group are summed and the sum is
	L2-normalised.

	Uses the notebook-level ``model``, ``tokenizer``, ``device`` and ``tqdm``.

	Args:
		sentence_list: Iterable of groups; each group is an iterable of
			sentence strings (one group per word).

	Returns:
		A 2-D tensor with one row per group, in input order. An empty 1-D
		tensor on ``device`` is returned when ``sentence_list`` is empty.

	Raises:
		IndexError: If a group contains no sentences.
	"""
	# Evaluation mode does not change between sentences, so set it once
	# instead of once per sentence.
	model.eval()

	# Rows and columns are collected in lists and stacked once; growing a
	# tensor with torch.cat on every iteration copies it each time.
	group_vectors = []

	for sentences in tqdm(sentence_list):
		sentence_vectors = []
		for sentence in sentences:
			# Add the special tokens by hand and split into tokens.
			marked_text = "[CLS] " + sentence + " [SEP]"
			tokenized_text = tokenizer.tokenize(marked_text)

			# Drop bare underscores, e.g. from terms such as "wedding_dress".
			tokenized_text = [token for token in tokenized_text if token != '_']

			# Map the tokens to their vocabulary indices.
			indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
			segments_ids = [1] * len(tokenized_text)

			# Convert inputs to PyTorch tensors with a batch dimension of 1.
			tokens_tensor = torch.tensor([indexed_tokens], device=device)
			segments_tensors = torch.tensor([segments_ids], device=device)

			with torch.no_grad():
				# NOTE(review): the second positional argument of
				# transformers' BertModel.forward is attention_mask, not
				# token_type_ids, so this all-ones tensor presumably acts as
				# the attention mask while segment ids default to zeros --
				# confirm against the installed transformers version.
				hidden_states = model(tokens_tensor, segments_tensors).hidden_states

				# Concatenate the last four layers along the feature axis,
				# drop the batch dimension and average over the tokens.
				concated_hidden_states = torch.cat(hidden_states[-4:], dim=2)
				concated_hidden_states = torch.squeeze(concated_hidden_states)
				sentence_vectors.append(torch.mean(concated_hidden_states, dim=0))

		if not sentence_vectors:
			# Same exception type the tensor-based accumulation raised before.
			raise IndexError("cannot embed a group that contains no sentences")

		# One column per sentence, summed across the group.
		group_matrix = torch.stack(sentence_vectors, dim=1)
		sum_embedding = torch.sum(group_matrix, 1)
		group_vectors.append(torch.nn.functional.normalize(sum_embedding, dim=0))

	if not group_vectors:
		return torch.empty(0, device=device)
	return torch.stack(group_vectors, dim=0)

## Extracting Embeddings: 2016 words + sent_debiased + sentence

In [11]:
# Load the full 2016 restricted vocabulary (size=0 -> no truncation).
vocab = open_pklfile("../data/extracted/0. original/original_word_2016_restricted_vocab.pkl", 0)
# Keep only the entries tagged as nouns and build their template sentences.
w2i, nouns, sentence_list = generate_noun_sentences(vocab)
# One L2-normalised embedding row per noun, in the order of `nouns`.
lim_wv = extract_bert_embeddings(sentence_list)
# Only the embeddings are written here; `w2i` and `nouns` are not saved by this cell.
dump_pklfile(lim_wv, f"{save_path}2016_restricted_embeddings.pkl", 0)
# Echo the output prefix so the saved output records which model was active.
print(save_path)

26189
26189


100%|██████████| 26189/26189 [00:00<00:00, 2667685.73it/s]


18447


100%|██████████| 18447/18447 [00:01<00:00, 10087.75it/s]
100%|██████████| 18447/18447 [22:01<00:00, 13.96it/s]


../data/extracted/3. contextualised/contextualised_sentence_


## Extracting Embeddings: 2018 words + sent_debiased + sentence

In [12]:
# Same pipeline as for the 2016 vocabulary, applied to the 2018 restricted vocabulary.
# Note that `vocab`, `w2i`, `nouns`, `sentence_list` and `lim_wv` are reused,
# so the 2016 values are overwritten once this cell has run.
vocab = open_pklfile("../data/extracted/0. original/original_word_2018_restricted_vocab.pkl", 0)
w2i, nouns, sentence_list = generate_noun_sentences(vocab)
lim_wv = extract_bert_embeddings(sentence_list)
dump_pklfile(lim_wv, f"{save_path}2018_restricted_embeddings.pkl", 0)
# Echo the output prefix so the saved output records which model was active.
print(save_path)

47698
47698


100%|██████████| 47698/47698 [00:00<00:00, 2625699.37it/s]


39385


100%|██████████| 39385/39385 [00:03<00:00, 10303.99it/s]
 53%|█████▎    | 20768/39385 [25:20<22:49, 13.60it/s]

## Extracting Embeddings: bert words 2500(2016) + sent_debiased + sentence

In [9]:
# The loaded pickles are passed straight to extract_bert_embeddings, so each
# entry is presumably already a list of sentences for one word -- TODO confirm
# against the notebook that wrote the "1. bert" files.

# 2016, male list: load, embed, save.
vocab_male_2016 = open_pklfile("../data/extracted/1. bert/bert_sentence_2016_male_2500_vocab.pkl", 0)
lim_wv_male_2016 = extract_bert_embeddings(vocab_male_2016)
dump_pklfile(lim_wv_male_2016, f"{save_path}2016_male_2500_embeddings.pkl", 0)

# 2016, female list: load, embed, save.
vocab_female_2016 = open_pklfile("../data/extracted/1. bert/bert_sentence_2016_female_2500_vocab.pkl", 0)
lim_wv_female_2016 = extract_bert_embeddings(vocab_female_2016)
dump_pklfile(lim_wv_female_2016, f"{save_path}2016_female_2500_embeddings.pkl", 0)

100%|██████████| 2500/2500 [01:55<00:00, 21.64it/s]
100%|██████████| 2500/2500 [02:30<00:00, 16.60it/s]


## Extracting Embeddings: bert words 2500(2018) + sent_debiased + sentence

In [10]:
# The loaded pickles are passed straight to extract_bert_embeddings, so each
# entry is presumably already a list of sentences for one word -- TODO confirm
# against the notebook that wrote the "1. bert" files.

# 2018, male list: load, embed, save.
vocab_male_2018 = open_pklfile("../data/extracted/1. bert/bert_sentence_2018_male_2500_vocab.pkl", 0)
lim_wv_male_2018 = extract_bert_embeddings(vocab_male_2018)
dump_pklfile(lim_wv_male_2018, f"{save_path}2018_male_2500_embeddings.pkl", 0)

# 2018, female list: load, embed, save.
vocab_female_2018 = open_pklfile("../data/extracted/1. bert/bert_sentence_2018_female_2500_vocab.pkl", 0)
lim_wv_female_2018 = extract_bert_embeddings(vocab_female_2018)
dump_pklfile(lim_wv_female_2018, f"{save_path}2018_female_2500_embeddings.pkl", 0)

100%|██████████| 2500/2500 [01:45<00:00, 23.72it/s]
100%|██████████| 2500/2500 [01:58<00:00, 21.01it/s]


## Extracting Embeddings: word file words + sent_debiased + sentence

In [11]:
# Read one sentence per line, stripping surrounding whitespace.
# NOTE(review): blank lines are kept as empty sentences -- confirm the input
# files contain none.
with open("../data/lists/male_sentence_file.txt", 'r') as f:
  male_sentences = [sentence.strip() for sentence in f.readlines()]
# All sentences are wrapped in a single group, so the result is one embedding
# row for the whole file rather than one row per sentence.
male_sentence_embs = extract_bert_embeddings([male_sentences])
dump_pklfile(male_sentence_embs, f"{save_path}male_word_file_embeddings.pkl", 0)

# Same procedure for the female sentence file.
with open("../data/lists/female_sentence_file.txt", 'r') as f:
  female_sentences = [sentence.strip() for sentence in f.readlines()]
female_sentence_embs = extract_bert_embeddings([female_sentences])
dump_pklfile(female_sentence_embs, f"{save_path}female_word_file_embeddings.pkl", 0)

100%|██████████| 1/1 [00:00<00:00,  1.75it/s]
100%|██████████| 1/1 [00:00<00:00,  1.93it/s]
