# 02 Getting Embeddings

The script was executed individually for all Harvard action verb situations and the work situation by specifying the parameters "context" and "context_list" accordingly. 

## Imports

In [None]:
import bertFuncs as func
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from transformers import logging
logging.set_verbosity_error()
from tqdm.notebook import tqdm
import numpy as np
from scipy.stats import ttest_ind
import math
import pathlib
from functools import partial
from itertools import repeat
from multiprocessing import Pool, freeze_support
import multiprocess as mp
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import random

In [2]:
# Company list: drop incomplete rows, expose the name column as "Name" and
# lower-case every company name.
company_names = pd.read_csv("sp500companynames_clean.csv", sep=";", index_col=0)
company_names = company_names.dropna().rename(columns={"companyname": "Name"})
company_names["Name"] = company_names["Name"].str.lower()

# English nouns: keep single-word entries only, then draw a reproducible
# random sample of 800 nouns (fixed seed).
nouns = pd.read_csv('nouns.csv', sep=";")
word_counts = nouns['nouns'].str.strip().str.split(' ').str.len()
nouns = nouns[word_counts.eq(1)]
nouns_sample = list(nouns["nouns"].sample(n=800, random_state=42))

# Action verbs.
action_verbs = pd.read_csv("action_verbs_final.csv", sep=";", index_col=0)

# Dictionaries: skip the "Politics" dictionary and describe every remaining
# row by the string form of its (Term2, Term1) pair.
dictionaries = pd.read_csv("dimensions_matched_manual.csv", sep=";", index_col=0)
dictionaries = dictionaries[dictionaries["Dictionary"] != "Politics"].reset_index()
term_pairs = zip(dictionaries["Term2"].astype(str), dictionaries["Term1"].astype(str))
dictionaries["tuple"] = list(term_pairs)
dictionaries = dictionaries[["Dictionary", "tuple"]]
dictionaries["tuple"] = dictionaries["tuple"].astype(str)

# One row per distinct pair; when a pair occurs several times the last row wins.
unique_dictionaries = dictionaries.drop_duplicates(subset='tuple', keep="last").reset_index()

In [13]:
# Run configuration for one situation set.
context = "Work"  # Situation set name
method = f"Embedd_Firms/{context}_context"  # Sub-folder of ./01_Embeddings/ used for all files of this run
companies = company_names["Name"]
num_of_dim = 89  # Passed as `numberPolar` to func.analyzeWord
all_dimensions = dictionaries["tuple"].unique()

# Situation set verbs: all verb forms that are embedded and later averaged.
work_forms = ["works", "worked", "will work", "has worked", "is working"]
employed_forms = ["is employed", "was employed", "will be employed", "has been employed"]
context_list = work_forms + employed_forms

## 1. Create Pronoun Embeddings

In [8]:
# Get BERT.
# The tokenizer/model pair comes from the project helper `bertFuncs.getBert`.
# Both names are read as module-level globals by the embedding functions in
# this notebook, so this cell has to run before any of them is called.
tokenizer, model = func.getBert()

In [9]:
def get_pronoun_embedding(context, embedded_word, numPolar, method): 
    """Embed the pronouns "He" and "She" in one sentence and store the result.

    Builds the sentences "He {context} at the corporation {embedded_word}" and
    "She {context} at the corporation {embedded_word}", runs `func.analyzeWord`
    on the pronoun of each sentence and writes the merged result to
    ./01_Embeddings/{method}/{embedded_word}_{context}.csv.

    Parameters:
        context: verb phrase inserted after the pronoun (e.g. "works").
        embedded_word: company name or noun that closes the sentence.
        numPolar: forwarded to `func.analyzeWord` as `numberPolar`.
        method: sub-folder of ./01_Embeddings/ that receives the CSV file.

    Returns:
        None. The result is only written to disk.

    Relies on the module-level `model` and `tokenizer` created by `func.getBert()`.
    """
    # Create path for storing the embedding files.
    pathlib.Path(f'./01_Embeddings/{method}/').mkdir(parents=True, exist_ok=True)

    # Define full context.
    male_context = f"He {context} at the corporation {embedded_word}"
    female_context = f"She {context} at the corporation {embedded_word}"

    # Get embeddings for male context.
    # After transposing, column 2 holds the value and columns 0/1 the two
    # entries that are zipped into "tuple" below (presumably the antonym pair
    # of each dimension - confirm against bertFuncs.analyzeWord).
    male_embedding = pd.DataFrame(func.analyzeWord("He", male_context, numberPolar=numPolar, model=model, tokenizer=tokenizer)).transpose()
    male_embedding = male_embedding.rename(columns={2: 'Value_Male'})
    # Columns 0 and 1 are taken from the female frame only, so the male copies
    # are dropped before merging.
    male_embedding.drop([0, 1], axis=1, inplace=True)

    # Get embeddings for female context.
    female_embedding = pd.DataFrame(func.analyzeWord("She", female_context, numberPolar=numPolar, model=model, tokenizer=tokenizer)).transpose()
    female_embedding = female_embedding.rename(columns={2: 'Value_Female'})

    # Merge embeddings row by row (index = dimension position).
    df_merged = pd.merge(male_embedding, female_embedding, left_index=True, right_index=True)
    df_merged["female-male"] = df_merged["Value_Female"] - df_merged["Value_Male"]
    df_merged["tuple"] = list(zip(df_merged[0], df_merged[1]))

    # Store for faster import later.
    df_merged.sort_index(inplace=True)
    df_merged.to_csv(f"./01_Embeddings/{method}/{embedded_word}_{context}.csv")

In [10]:
# Create embeddings for random nouns.
# Parallelized for included situation verbs.
# A single pool is shared by all nouns and shut down by the `with` block;
# opening a new pool per noun without closing it leaks worker processes.
with mp.Pool() as pool:
    for noun in tqdm(nouns_sample):
        pool.starmap(get_pronoun_embedding, zip(context_list, repeat(noun), repeat(num_of_dim), repeat(method)))

  0%|          | 0/800 [00:00<?, ?it/s]

In [6]:
# Create embeddings for all companies and contexts.
# Parallelized for included situation verbs.
# A single pool is shared by all companies and shut down by the `with` block;
# opening a new pool per company without closing it leaks worker processes.
with mp.Pool() as pool:
    for company in tqdm(companies):
        pool.starmap(get_pronoun_embedding, zip(context_list, repeat(company), repeat(num_of_dim), repeat(method)))

  0%|          | 0/782 [00:00<?, ?it/s]

In [24]:
# Calculate average embeddings over context list for both companies and nouns.

# Only `repeat` is imported from itertools at the top of the notebook, so the
# module itself is imported here for `itertools.chain`.
import itertools

for company in tqdm(itertools.chain(companies, nouns_sample)):

    # The loop variable is called `verb` so that the global `context`
    # (situation set name) is not overwritten by this cell.
    verb_paths = [f"./01_Embeddings/{method}/{company}_{verb}.csv" for verb in context_list]

    # Read every per-verb file; the first CSV column is the stored index.
    verb_frames = [pd.read_csv(path).iloc[:, 1:] for path in verb_paths]
    result = pd.concat(verb_frames, axis=1, join="outer")

    final_result = pd.DataFrame()
    final_result["tuple"] = verb_frames[-1]["tuple"]
    final_result["context_average_male"] = result.loc[:, result.columns.str.startswith('Value_Male')].mean(axis=1).round(7)
    final_result["context_average_female"] = result.loc[:, result.columns.str.startswith('Value_Female')].mean(axis=1).round(7)
    final_result["average_female-male"] = final_result["context_average_female"] - final_result["context_average_male"]
    final_result.to_csv(f"./01_Embeddings/{method}/{company}_context_average.csv")

    # Embedding for individual verb was deleted to reduce number of files.
    # Exception was made for work situation where the correlation between the different verbs was analyzed.
    # The files are removed only after the average has been written, so a
    # failure while reading or averaging never destroys the inputs.
    for path in verb_paths:
        os.remove(path)

  0%|          | 0/800 [00:00<?, ?it/s]

In [4]:
# Define helper functions. 

def read_embedding_values(company_names, antonym_pair, method): 
    """Collect the averaged male/female values of one dimension for many words.

    For every entry of `company_names` the file
    ./01_Embeddings/{method}/{company}_context_average.csv is read and the rows
    whose "tuple" column equals `antonym_pair` are looked up. If the pair occurs
    more than once in a file, the mean of the matching rows is used (same
    treatment as the second definition of this helper further down).

    Parameters:
        company_names: iterable of names (list or pandas Series).
        antonym_pair: value of the "tuple" column identifying the dimension.
        method: sub-folder of ./01_Embeddings/ that holds the averaged files.

    Returns:
        DataFrame with the columns "company", "value_male", "value_female",
        "value_neutral" (always 0) and "difference" (female - male). When
        `company_names` is a Series its index is reused.

    Raises:
        KeyError: if `antonym_pair` is missing from a company's file.
    """
    records = []
    for company in company_names:
        comparison = pd.read_csv(f"./01_Embeddings/{method}/{company}_context_average.csv")
        matches = comparison.loc[comparison["tuple"] == antonym_pair]
        if matches.empty:
            raise KeyError(f"antonym pair {antonym_pair!r} not found in the embedding file of {company!r}")

        value_male = float(matches["context_average_male"].mean())
        value_female = float(matches["context_average_female"].mean())
        records.append((company, value_male, value_female, 0, value_female - value_male))

    # Rows are built up front and turned into a frame once; writing through
    # `result[col].loc[mask] = value` is chained assignment, which pandas does
    # not guarantee to modify `result`.
    index = company_names.index if isinstance(company_names, pd.Series) else None
    return pd.DataFrame(
        records,
        columns=["company", "value_male", "value_female", "value_neutral", "difference"],
        index=index,
    )

def check_significance(p_value): 
    """Label a p-value as significant or not at the 5 % level."""
    return "Significant Difference" if p_value < 0.05 else "No Significant Difference"

def round_decimals_up(number:float, decimals:int=2):
    """
    Round `number` towards positive infinity, keeping `decimals` decimal places.

    With `decimals == 0` the plain integer ceiling is returned. A non-integer
    `decimals` raises TypeError, a negative one raises ValueError.
    """
    if not isinstance(decimals, int):
        raise TypeError("decimal places must be an integer")
    if decimals < 0:
        raise ValueError("decimal places has to be 0 or more")

    if decimals:
        # Shift the wanted digits in front of the decimal point, take the
        # ceiling, then shift back.
        scale = 10 ** decimals
        return math.ceil(number * scale) / scale

    return math.ceil(number)

In [14]:
# Get dimension significance scores.

RES_TABLE_COLUMNS = [
    "Dimension",
    "P-Value M vs. F Firm Embedding",
    "P-Value Delta Firm vs. Delta Random Embedding",
    "Mean Firm",
    "Mean Random",
    "Mean Male",
    "Mean Female",
]

# One row per dimension is collected and the table is built once afterwards.
# Writing through `res_table[col].loc[mask] = value` is chained assignment,
# which pandas does not guarantee to modify `res_table`.
table_rows = []

# Loop through all unique dimensions.
for dimension in tqdm(all_dimensions):

    # Values for the firms and for the random-noun baseline.
    result = read_embedding_values(companies, dimension, method)
    rand_result = read_embedding_values(nouns_sample, dimension, method)

    # Test 1: male vs. female values of the firms.
    # Test 2: female-male difference of the firms vs. the random nouns.
    _, p_value1 = ttest_ind(result["value_male"], result["value_female"])
    _, p_value2 = ttest_ind(result["difference"], rand_result["difference"])

    table_rows.append({
        "Dimension": dimension,
        "P-Value M vs. F Firm Embedding": round_decimals_up(round(p_value1, 5), 5),
        "P-Value Delta Firm vs. Delta Random Embedding": round_decimals_up(round(p_value2, 5), 5),
        # Mean bias values.
        "Mean Firm": result["difference"].mean(),
        "Mean Random": rand_result["difference"].mean(),
        "Mean Male": result["value_male"].mean(),
        "Mean Female": result["value_female"].mean(),
    })

res_table = pd.DataFrame(table_rows, columns=RES_TABLE_COLUMNS)
res_table["Mean Firm vs. Mean Random"] = res_table["Mean Firm"] - res_table["Mean Random"]

# Attach the dictionary by dimension value, not by row position:
# `all_dimensions` is ordered by first occurrence while `unique_dictionaries`
# keeps the last occurrence of each pair, so the two row orders can differ.
dictionary_by_dimension = unique_dictionaries.set_index("tuple")["Dictionary"]
res_table["Dictionary"] = res_table["Dimension"].map(dictionary_by_dimension)

  0%|          | 0/89 [00:00<?, ?it/s]

In [18]:
# Save significance table for later analysis.
# The file is named after the situation set. The name is recovered from
# `method` ("<folder>/<situation>_context") rather than read from `context`,
# so it stays correct even if `context` has been rebound since the variables
# cell was run.
situation = method.rsplit("/", 1)[-1].rsplit("_context", 1)[0]
res_table.to_csv(f"./01_Embeddings/{method}/{situation}_Significance.csv")

## 1.1 Get Pronoun Embeddings for Names. 

The following imports and function were used to generate the pronoun embeddings with real names. The subsequent processing steps were identical to the steps in 1 (calculating average embedding, etc.).

In [15]:
# First 1000 names per gender from the name/gender dataset.
names = pd.read_csv("name_gender_dataset.csv")
is_male = names["Gender"] == "M"
is_female = names["Gender"] == "F"
male_names = list(names.loc[is_male, "Name"].head(1000))
female_names = list(names.loc[is_female, "Name"].head(1000))

In [6]:
def get_pronoun_name_embedding(context, embedded_word, numPolar, method): 
    """Embed a random male and a random female first name in one sentence.

    Works like the pronoun variant, but "He"/"She" are replaced by names drawn
    with `random.choice` from the module-level lists `male_names` and
    `female_names`. The merged result is written to
    ./01_Embeddings/{method}/{embedded_word}_{context}.csv.

    Parameters:
        context: verb phrase inserted after the name (e.g. "works").
        embedded_word: company name or noun that closes the sentence.
        numPolar: forwarded to `func.analyzeWord` as `numberPolar`.
        method: sub-folder of ./01_Embeddings/ that receives the CSV file.

    Returns:
        None. The result is only written to disk.

    Relies on the module-level `model` and `tokenizer` created by `func.getBert()`.
    """
    # Create path for storing the embedding files. This must be the folder the
    # CSV is written to below; creating './{method}/' instead leaves the target
    # folder missing on a fresh run.
    pathlib.Path(f'./01_Embeddings/{method}/').mkdir(parents=True, exist_ok=True)

    # Define full context.
    male_name = random.choice(male_names)
    female_name = random.choice(female_names)

    male_context = f"{male_name} {context} at the corporation {embedded_word}"
    female_context = f"{female_name} {context} at the corporation {embedded_word}"

    # Get embeddings for male context.
    male_embedding = pd.DataFrame(func.analyzeWord(male_name, male_context, numberPolar=numPolar, model=model, tokenizer=tokenizer)).transpose()
    male_embedding = male_embedding.rename(columns={2: 'Value_Male'})
    # Columns 0 and 1 are taken from the female frame only, so the male copies
    # are dropped before merging.
    male_embedding.drop([0, 1], axis=1, inplace=True)

    # Get embeddings for female context.
    female_embedding = pd.DataFrame(func.analyzeWord(female_name, female_context, numberPolar=numPolar, model=model, tokenizer=tokenizer)).transpose()
    female_embedding = female_embedding.rename(columns={2: 'Value_Female'})

    # Merge embeddings row by row (index = dimension position).
    df_merged = pd.merge(male_embedding, female_embedding, left_index=True, right_index=True)
    df_merged["female-male"] = df_merged["Value_Female"] - df_merged["Value_Male"]
    df_merged["tuple"] = list(zip(df_merged[0], df_merged[1]))

    # Store for faster import later.
    df_merged.sort_index(inplace=True)
    df_merged.to_csv(f"./01_Embeddings/{method}/{embedded_word}_{context}.csv")

## 2. Get Firm Embeddings

In [4]:
def get_firm_embedding(context, embedded_word, numPolar, method): 
    """Embed every word of a firm name in a "He ..." and a "She ..." sentence.

    `embedded_word` is split on single spaces; each part is analysed with
    `func.analyzeWord` in both sentences, the per-part values are averaged and
    the table is written to ./01_Embeddings/{method}/{embedded_word}_{context}.csv.
    Uses the module-level `model` and `tokenizer`. Returns None.
    """
    embedding = pd.DataFrame()

    # Make sure the output folder exists.
    pathlib.Path(f'./01_Embeddings/{method}/').mkdir(parents=True, exist_ok=True)

    # The same two sentences are reused for every part of the name.
    sentence_he = f"He {context} at the corporation {embedded_word}"
    sentence_she = f"She {context} at the corporation {embedded_word}"

    for word in embedded_word.split(" "):

        # Value of this name part in the male sentence (only column 2 is kept).
        he_frame = pd.DataFrame(func.analyzeWord(word, sentence_he, numberPolar=numPolar, model=model, tokenizer=tokenizer)).transpose()
        he_frame = he_frame.rename(columns={2: 'Value_Male'}).drop([0, 1], axis=1)

        # Value of this name part in the female sentence (columns 0/1 are kept).
        she_frame = pd.DataFrame(func.analyzeWord(word, sentence_she, numberPolar=numPolar, model=model, tokenizer=tokenizer)).transpose()
        she_frame = she_frame.rename(columns={2: 'Value_Female'})

        # Join both sides row by row and add the pair identifier.
        merged = pd.merge(he_frame, she_frame, left_index=True, right_index=True)
        merged["female-male"] = merged["Value_Female"] - merged["Value_Male"]
        merged["tuple"] = list(zip(merged[0], merged[1]))

        embedding[[f"{word}_male", f"{word}_female", "tuple"]] = merged[["Value_Male", "Value_Female", "tuple"]]

    # Average over the name parts, separately for each gender.
    male_columns = embedding.columns.str.endswith('_male')
    embedding["average_male"] = embedding.loc[:, male_columns].mean(axis=1).round(7)
    female_columns = embedding.columns.str.endswith('_female')
    embedding["average_female"] = embedding.loc[:, female_columns].mean(axis=1).round(7)

    # Delta / bias between the two averages.
    embedding["female-male"] = (embedding["average_female"] - embedding["average_male"]).round(7)

    # Persist the table, ordered by index.
    embedding.sort_index(inplace=True)
    embedding.to_csv(f"./01_Embeddings/{method}/{embedded_word}_{context}.csv")

#### Create Embeddings for Companies and Nouns

In [5]:
# Get BERT.
# The tokenizer/model pair comes from the project helper `bertFuncs.getBert`.
# Both names are read as module-level globals by the embedding functions in
# this notebook, so this cell has to run before any of them is called.
tokenizer, model = func.getBert()

In [None]:
# Create company embeddings.
# Parallelized for included situation verbs.
# A single pool is shared by all companies and shut down by the `with` block;
# opening a new pool per company without closing it leaks worker processes.
with mp.Pool() as pool:
    for company in tqdm(companies):
        pool.starmap(get_firm_embedding, zip(context_list, repeat(company), repeat(num_of_dim), repeat(method)))

  0%|          | 0/782 [00:00<?, ?it/s]

In [None]:
# Create noun embeddings.
# Parallelized for included situation verbs.
# Nouns go through the same function as the companies (`get_firm_embedding`).
# A single pool is shared by all nouns and shut down by the `with` block;
# opening a new pool per noun without closing it leaks worker processes.
with mp.Pool() as pool:
    for noun in tqdm(nouns_sample):
        pool.starmap(get_firm_embedding, zip(context_list, repeat(noun), repeat(num_of_dim), repeat(method)))

  0%|          | 0/800 [00:00<?, ?it/s]

In [None]:
# Calculate average embeddings over context list for both companies and nouns.

# Only `repeat` is imported from itertools at the top of the notebook, so the
# module itself is imported here for `itertools.chain`.
import itertools

for company in tqdm(itertools.chain(companies, nouns_sample)):

    # The loop variable is called `verb` so that the global `context`
    # (situation set name) is not overwritten by this cell.
    verb_paths = [f"./01_Embeddings/{method}/{company}_{verb}.csv" for verb in context_list]

    # Read every per-verb file; the first CSV column is the stored index.
    verb_frames = [pd.read_csv(path).iloc[:, 1:] for path in verb_paths]
    result = pd.concat(verb_frames, axis=1, join="outer")

    final_result = pd.DataFrame()
    final_result["tuple"] = verb_frames[-1]["tuple"]
    final_result["context_average_male"] = result.loc[:, result.columns.str.startswith('average_male')].mean(axis=1).round(7)
    final_result["context_average_female"] = result.loc[:, result.columns.str.startswith('average_female')].mean(axis=1).round(7)
    final_result["average_female-male"] = final_result["context_average_female"] - final_result["context_average_male"]
    final_result.to_csv(f"./01_Embeddings/{method}/{company}_context_average.csv")

    # The per-verb files are removed only after the average has been written,
    # so a failure while reading or averaging never destroys the inputs.
    for path in verb_paths:
        os.remove(path)

In [38]:
# Define helper functions. 

def read_embedding_values(company_names, antonym_pair, method): 
    """Collect the averaged male/female values of one dimension for many words.

    For every entry of `company_names` the file
    ./01_Embeddings/{method}/{company}_context_average.csv is read and the rows
    whose "tuple" column equals `antonym_pair` are looked up. Duplicate rows of
    the same pair are averaged.

    Parameters:
        company_names: iterable of names (list or pandas Series).
        antonym_pair: value of the "tuple" column identifying the dimension.
        method: sub-folder of ./01_Embeddings/ that holds the averaged files.

    Returns:
        DataFrame with the columns "company", "value_male", "value_female",
        "value_neutral" (placeholder, always 0) and "difference"
        (female - male). When `company_names` is a Series its index is reused.

    Raises:
        KeyError: if `antonym_pair` is missing from a company's file.
    """
    records = []
    for company in company_names:
        comparison = pd.read_csv(f"./01_Embeddings/{method}/{company}_context_average.csv")
        matches = comparison.loc[comparison["tuple"] == antonym_pair]
        if matches.empty:
            raise KeyError(f"antonym pair {antonym_pair!r} not found in the embedding file of {company!r}")

        # Account for duplicates: the mean over the matching rows equals the
        # group mean of this pair.
        value_male = float(matches["context_average_male"].mean())
        value_female = float(matches["context_average_female"].mean())
        records.append((company, value_male, value_female, 0, value_female - value_male))

    # Rows are built up front and turned into a frame once; writing through
    # `result[col].loc[mask] = value` is chained assignment, which pandas does
    # not guarantee to modify `result`.
    index = company_names.index if isinstance(company_names, pd.Series) else None
    return pd.DataFrame(
        records,
        columns=["company", "value_male", "value_female", "value_neutral", "difference"],
        index=index,
    )

def check_significance(p_value): 
    """Return the verdict text for `p_value` using a 0.05 threshold."""
    # Guard clause: everything strictly below the threshold is significant.
    if p_value < 0.05:
        return "Significant Difference"
    return "No Significant Difference"

def round_decimals_up(number:float, decimals:int=2):
    """
    Ceiling of `number` at the given count of decimal places.

    `decimals` must be an int (TypeError otherwise) and must not be negative
    (ValueError otherwise). For `decimals == 0` an integer is returned.
    """
    if not isinstance(decimals, int):
        raise TypeError("decimal places must be an integer")
    if decimals < 0:
        raise ValueError("decimal places has to be 0 or more")

    # No decimals requested: plain integer ceiling.
    if decimals == 0:
        return math.ceil(number)

    shift = 10 ** decimals
    shifted_ceiling = math.ceil(number * shift)
    return shifted_ceiling / shift

In [41]:
# Get dimension significance scores for the firm embeddings.

RES_TABLE_COLUMNS = [
    "Dimension",
    "P-Value M vs. F Firm Embedding",
    "P-Value Delta Firm vs. Delta Random Embedding",
    "Mean Firm",
    "Mean Random",
]

# One row per dimension is collected and the table is built once afterwards.
# Writing through `res_table[col].loc[mask] = value` is chained assignment,
# which pandas does not guarantee to modify `res_table`.
table_rows = []

# Loop through all unique dimensions.
for dimension in tqdm(all_dimensions):

    # Values for the firms and for the random-noun baseline.
    result = read_embedding_values(companies, dimension, method)
    rand_result = read_embedding_values(nouns_sample, dimension, method)

    # Test 1: male vs. female values of the firms.
    # Test 2: female-male difference of the firms vs. the random nouns.
    _, p_value1 = ttest_ind(result["value_male"], result["value_female"])
    _, p_value2 = ttest_ind(result["difference"], rand_result["difference"])

    table_rows.append({
        "Dimension": dimension,
        "P-Value M vs. F Firm Embedding": round_decimals_up(round(p_value1, 5), 5),
        "P-Value Delta Firm vs. Delta Random Embedding": round_decimals_up(round(p_value2, 5), 5),
        # Mean bias values.
        "Mean Firm": result["difference"].mean(),
        "Mean Random": rand_result["difference"].mean(),
    })

res_table = pd.DataFrame(table_rows, columns=RES_TABLE_COLUMNS)
res_table["Mean Firm vs. Mean Random"] = res_table["Mean Firm"] - res_table["Mean Random"]

# Attach the dictionary by dimension value, not by row position:
# `all_dimensions` is ordered by first occurrence while `unique_dictionaries`
# keeps the last occurrence of each pair, so the two row orders can differ.
dictionary_by_dimension = unique_dictionaries.set_index("tuple")["Dictionary"]
res_table["Dictionary"] = res_table["Dimension"].map(dictionary_by_dimension)

  0%|          | 0/89 [00:00<?, ?it/s]

In [42]:
# Save significance table for later analysis.
# NOTE(review): the file name is hard-coded to "Communication" and does not
# follow the configured situation set - confirm it matches the run before saving.
res_table.to_csv(f"./01_Embeddings/{method}/Communication_Significance.csv")

## 2.1 Get Firm Name Embeddings

The following imports and function were used to generate the firm embeddings with real names. The subsequent processing steps were identical to the steps in 2 (calculating average embedding, etc.).

In [15]:
# First 1000 names per gender from the name/gender dataset.
names = pd.read_csv("name_gender_dataset.csv")
first_names_by_gender = {
    gender: list(names.loc[names["Gender"] == gender, "Name"].head(1000))
    for gender in ("M", "F")
}
male_names = first_names_by_gender["M"]
female_names = first_names_by_gender["F"]

In [6]:
def get_firm_name_embedding(context, embedded_word, numPolar, method): 
    """Embed every word of a firm name in sentences that start with real names.

    Works like `get_firm_embedding`, but "He"/"She" are replaced by names drawn
    with `random.choice` from the module-level lists `female_names` and
    `male_names`. `embedded_word` is split on single spaces, each part is
    analysed with `func.analyzeWord` in both sentences, the per-part values are
    averaged and the table is written to
    ./01_Embeddings/{method}/{embedded_word}_{context}.csv.

    Parameters:
        context: verb phrase inserted after the name (e.g. "works").
        embedded_word: company name or noun that closes the sentence.
        numPolar: forwarded to `func.analyzeWord` as `numberPolar`.
        method: sub-folder of ./01_Embeddings/ that receives the CSV file.

    Returns:
        None. The result is only written to disk.

    Relies on the module-level `model` and `tokenizer` created by `func.getBert()`.
    """
    # Create dataframe for final embedding.
    embedding = pd.DataFrame()

    # Create path for storing the embedding files. This must be the folder the
    # CSV is written to below; creating './{method}/' instead leaves the target
    # folder missing on a fresh run.
    pathlib.Path(f'./01_Embeddings/{method}/').mkdir(parents=True, exist_ok=True)

    # Split multi-word names to get average embedding.
    all_words = embedded_word.split(" ")

    # Define full context. The female name is drawn first to keep the original
    # order of the random draws.
    female_name = random.choice(female_names)
    male_name = random.choice(male_names)
    male_context = f"{male_name} {context} at the corporation {embedded_word}"
    female_context = f"{female_name} {context} at the corporation {embedded_word}"

    for word in all_words:

        # Get embeddings for male and female context.
        male_embedding = pd.DataFrame(func.analyzeWord(word, male_context, numberPolar=numPolar, model=model, tokenizer=tokenizer)).transpose()
        male_embedding = male_embedding.rename(columns={2: 'Value_Male'})
        # Columns 0 and 1 are taken from the female frame only.
        male_embedding.drop([0, 1], axis=1, inplace=True)

        female_embedding = pd.DataFrame(func.analyzeWord(word, female_context, numberPolar=numPolar, model=model, tokenizer=tokenizer)).transpose()
        female_embedding = female_embedding.rename(columns={2: 'Value_Female'})

        # Merge embeddings.
        df_merged = pd.merge(male_embedding, female_embedding, left_index=True, right_index=True)
        df_merged["female-male"] = df_merged["Value_Female"] - df_merged["Value_Male"]
        df_merged["tuple"] = list(zip(df_merged[0], df_merged[1]))

        embedding[[f"{word}_male", f"{word}_female", "tuple"]] = df_merged[["Value_Male", "Value_Female", "tuple"]]

    # Take average of different name parts.
    embedding["average_male"] = embedding.loc[:, embedding.columns.str.endswith('_male')].mean(axis=1).round(7)
    embedding["average_female"] = embedding.loc[:, embedding.columns.str.endswith('_female')].mean(axis=1).round(7)

    # Calculate delta / bias.
    embedding["female-male"] = (embedding["average_female"] - embedding["average_male"]).round(7)

    # Store for faster import later. Rows are ordered by index first, as in
    # `get_firm_embedding`.
    embedding.sort_index(inplace=True)
    embedding.to_csv(f"./01_Embeddings/{method}/{embedded_word}_{context}.csv")