### The following code samples show how the various model values were obtained. They were re-run multiple times over different parts of our evaluation dataset to reach the final results. More information on how that was done can be found in the thesis paper.

In [None]:
# Word Embeddings

import torch
from transformers import BertModel, BertTokenizer

# Load the pretrained BERT encoder and its matching tokenizer.
model_name = 'bert-large-cased'
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Input word
word = "Brahmin"

# Split the word into sub-word tokens and map them to vocabulary ids.
# tokenize() + convert_tokens_to_ids() does not add special tokens, so the
# model sees only the pieces of the word itself.
tokens = tokenizer.tokenize(word)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(input_ids).unsqueeze(0)  # add a batch dimension of 1

# Forward pass through the model. This is pure inference, so run it under
# no_grad to avoid building an autograd graph for a large model.
with torch.no_grad():
    outputs = model(input_ids)

# Extract the last-layer hidden states and drop the batch dimension, leaving
# one row per sub-word token.
last_layer_embedding = outputs.last_hidden_state.squeeze(0)

last_layer_embedding = last_layer_embedding.numpy()

# NOTE(review): `dic` is not created in this cell; it is assumed to be a dict
# that already exists and accumulates embeddings across runs for different
# words -- confirm against the full notebook.
dic[word] = last_layer_embedding

# Print the word embedding
print(f"Word: {word}")
print(f"Embedding: {last_layer_embedding}")

#Find and store Cosine similarity

# numpy is used below but is only imported in a later cell; import it here so
# this cell runs on its own.
import numpy as np

# NOTE(review): `dic` (word -> embedding array), `wordsBrahmin` (iterable of
# words) and `cosineDalit` (dict receiving the results) are not defined in
# this cell -- assumed to exist already; confirm against the full notebook.

# The reference embedding is the same for every comparison, so look it up and
# flatten it to a single row once, outside the loop.
reference_embedding = dic['Dalit'].reshape(1, -1)

for word in wordsBrahmin:
    embedding1_np = reference_embedding
    embedding2_np = dic[word].reshape(1, -1)

    # The flattened embeddings can differ in length, so zero-pad the shorter
    # one on the right up to the longer length.
    max_dim = max(embedding1_np.shape[1], embedding2_np.shape[1])
    embedding1_padded = np.pad(embedding1_np, ((0, 0), (0, max_dim - embedding1_np.shape[1])), constant_values=0)
    embedding2_padded = np.pad(embedding2_np, ((0, 0), (0, max_dim - embedding2_np.shape[1])), constant_values=0)

    # Dot product of two (1, max_dim) rows divided by the product of their
    # norms; the result is kept as a (1, 1) array, as before.
    cosine_similarity = np.dot(embedding1_padded, embedding2_padded.T)/(np.linalg.norm(embedding1_padded) * np.linalg.norm(embedding2_padded))
    cosineDalit[word] = cosine_similarity

In [None]:
#AUL Score

import json
import argparse
import torch
import difflib
import nltk
import regex as re
import numpy as np
import pickle

from tqdm import tqdm
from collections import defaultdict
from transformers import AutoModelForMaskedLM, AutoTokenizer, BertTokenizer, BertForMaskedLM

# Load the XLM-RoBERTa masked-LM and its tokenizer. The model is configured to
# return attentions and hidden states alongside the logits.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')
model = AutoModelForMaskedLM.from_pretrained(
    'xlm-roberta-large',
    output_attentions=True,
    output_hidden_states=True,
)

# Inference only: switch to evaluation mode and use the GPU when one exists.
model = model.eval()
if torch.cuda.is_available():
    model.to('cuda')

# Log-softmax over dim 1 and the tokenizer's mask token id.
log_softmax = torch.nn.LogSoftmax(dim=1)
mask_id = tokenizer.mask_token_id

# Sentences to score, taken from the two columns of `df`.
stereo_inputs = list(df['Stereotypical'])
antistereo_inputs = list(df['Anti-Stereotypical'])

# Accumulators for per-sentence scores and embeddings.
stereo_scores, antistereo_scores = [], []
stereo_embes, antistereo_embes = [], []

def calculate_aul(model, sentence, log_softmax, attention):
    '''
    Score one sentence with AUL (or AULA) and return its mean embedding.

    The sentence is tokenized with the module-level `tokenizer` and passed
    through `model` unmasked. The score is the mean, over all positions
    except the first and the last, of the log probability the model assigns
    to the token actually present at that position.

    Args:
        model: masked-LM whose output exposes `logits`, `hidden_states` and
            (when `attention` is true) `attentions`.
        sentence: a single sentence; the batch dimension is squeezed away,
            so a batch of several sentences is not supported.
        log_softmax: callable applied to the (seq_len, vocab) logits.
        attention: if true, weight each token's log probability by its
            attention averaged over layers, heads and query positions (AULA).

    Returns:
        (score, hidden_state): the score as a Python float, and a NumPy array
        holding the last layer's hidden states averaged over the same inner
        positions.
    '''
    tokens = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)

    # Put the token ids on the model's device: the model may have been moved
    # to the GPU, while the tokenizer always returns CPU tensors.
    device = next(model.parameters()).device
    input_ids = tokens['input_ids'].to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(input_ids)

    logits = output.logits.squeeze(0)
    log_probs = log_softmax(logits)

    # Pick, at every position, the log probability of the token that is
    # really there, then drop the first and last positions (presumably the
    # tokenizer's special tokens -- confirm for the tokenizer in use).
    token_ids = input_ids.view(-1, 1)
    token_log_probs = log_probs.gather(1, token_ids)[1:-1]
    if attention:
        # Stack all layers, then average over layers, heads and query
        # positions to get one weight per token.
        attentions = torch.mean(torch.cat(output.attentions, 0), 0)
        averaged_attentions = torch.mean(attentions, 0)
        averaged_token_attentions = torch.mean(averaged_attentions, 0)
        token_log_probs = token_log_probs.squeeze(1) * averaged_token_attentions[1:-1]
    sentence_log_prob = torch.mean(token_log_probs)
    score = sentence_log_prob.item()

    # Mean of the last layer's hidden states over the same inner positions;
    # move to CPU before converting, since NumPy cannot read GPU tensors.
    hidden_states = output.hidden_states[-1][:, 1:-1]
    hidden_state = torch.mean(hidden_states, 1).cpu().numpy()

    return score, hidden_state

# False selects plain AUL; True weights token log probabilities by attention.
attention = False

# Score every stereotypical sentence and keep its embedding.
for sentence in stereo_inputs:
    stereo_score, stereo_hidden_state = calculate_aul(model, sentence, log_softmax, attention)
    stereo_scores.append(stereo_score)
    stereo_embes.append(stereo_hidden_state)

# Same for the anti-stereotypical sentences.
for sentence in antistereo_inputs:
    antistereo_score, antistereo_hidden_state = calculate_aul(model, sentence, log_softmax, attention)
    antistereo_scores.append(antistereo_score)
    antistereo_embes.append(antistereo_hidden_state)

# Column vector (N, 1) against row vector (1, N): the comparison broadcasts
# to an N x N matrix comparing every stereotypical score with every
# anti-stereotypical score.
stereo_scores = np.array(stereo_scores).reshape([-1, 1])
antistereo_scores = np.array(antistereo_scores).reshape([1, -1])
bias_scores = stereo_scores > antistereo_scores

# Per-row difference: stereotypical score minus the anti-stereotypical score
# of the same row.
diff = list(stereo_scores[:, 0] - antistereo_scores[0])

# The score below reads df['diff'], which was never written; store the
# differences on the frame first.
df['diff'] = diff

# Fraction of rows where the stereotypical sentence scores at least as high.
score = len(df[df['diff'] >= 0]) / len(df)

In [None]:
#CLL Score 

import numpy as np
import pandas as pd
import torch
import accelerate
import bitsandbytes
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from huggingface_hub.hf_api import HfFolder

model_path="meta-llama/Llama-2-13b-hf"

config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path)

tokenizer = AutoTokenizer.from_pretrained(model_path)

## Same loop was repeated for anti-stereotypical, and target words (to subtract likelihood of target words to
## measure CONDITIONAL likelihoods)

# Per-token cross entropy (no reduction); built once and reused for every
# sentence.
token_nll = torch.nn.CrossEntropyLoss(reduction='none')

# One total log-likelihood per sentence. The list is created here because no
# earlier statement in this cell defines it.
stereo_score = []

for sentence in df['Stereotypical']:
    # Tokenize and convert the input sentence to a tensor
    input_ids = tokenizer.encode(sentence, return_tensors='pt')

    with torch.no_grad():
        outputs = model(input_ids)
        logits = outputs.logits

    # In a causal LM the logits at position t are the prediction for the
    # token at position t + 1. Align them by dropping the last logit and the
    # first label; scoring logits[t] against input_ids[t] would measure the
    # wrong distribution.
    shift_logits = logits[:, :-1, :]
    shift_labels = input_ids[:, 1:]

    # Negative log likelihood of each predicted token, reshaped back to
    # (batch, seq_len - 1).
    neg_log_likelihood = token_nll(
        shift_logits.reshape(-1, shift_logits.size(-1)),
        shift_labels.reshape(-1),
    )
    neg_log_likelihood = neg_log_likelihood.view(shift_labels.size())

    # Sentence log-likelihood: the sum of the token log probabilities, i.e.
    # the negated sum of the per-token NLLs.
    sent = -neg_log_likelihood.sum()
    stereo_score.append(sent.item())

In [None]:
## GPT 3.5 Results

import pandas as pd
import openai
import time

# Placeholder only: supply a real key at run time and never commit one.
openai.api_key = 'KEY HERE'

def get_completion(prompt, model='gpt-3.5-turbo-0613'):
    """Send *prompt* as a single user message and return the reply text.

    Args:
        prompt: text sent as the content of one "user" message.
        model: name of the chat model to query.

    Returns:
        The message content of the first choice in the response.

    Any exception raised by the API call propagates to the caller.
    """
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        temperature=0.7,
        messages=messages,
    )

    return response.choices[0].message.content

# Collect one completion per prompt, in row order.
lis = []

# Iterate the column positionally: looking rows up as df[col][j] is
# label-based and fails (or picks the wrong row) when the frame's index is
# not exactly 0..n-1.
for j, prompt in enumerate(df['GPT_Prompt_Full']):
    print(j)  # progress indicator
    try:
        response = get_completion(prompt)
    except Exception as e:
        # Best effort: report the error, wait a minute and try once more.
        # A second failure is not caught and stops the loop.
        print(e)
        time.sleep(60)
        response = get_completion(prompt)
    lis.append(response)