In [1]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Sentence-embedding checkpoint used to encode the texts in `return_cosines`.
# Other checkpoints this notebook has been pointed at:
#   'all-mpnet-base-v2', 'all-distilroberta-v1'
MODEL_NAME = 'all-MiniLM-L12-v2'
model = SentenceTransformer(MODEL_NAME)

Downloading (…)5dded/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 1.07MB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 2.21MB/s]
Downloading (…)4d81d5dded/README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 35.4MB/s]
Downloading (…)81d5dded/config.json: 100%|██████████| 573/573 [00:00<00:00, 2.37MB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 586kB/s]
Downloading (…)ded/data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 15.8MB/s]
Downloading pytorch_model.bin: 100%|██████████| 134M/134M [00:06<00:00, 21.0MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 340kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 737kB/s]
Downloading (…)5dded/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 9.45MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 352/352 [00:00<00:00, 1.15MB/s]
Downloading (…)dded/train_script.py: 100%|██

In [15]:
# Load the generated texts; later cells read the `race`, `gender`, `format`
# and `text` columns of this frame.
chatgpt = pd.read_csv(filepath_or_buffer='generated_text_final.csv')

In [16]:
def _subset(race, gender):
    """Return the rows of `chatgpt` whose `race` and `gender` equal the given values."""
    mask = (chatgpt['race'] == race) & (chatgpt['gender'] == gender)
    return chatgpt.loc[mask]


# One frame per race x gender combination; the two-letter names are kept
# because the following cells refer to them.
bm = _subset("African", "man")
bw = _subset("African", "woman")

am = _subset("Asian", "man")
aw = _subset("Asian", "woman")

hm = _subset("Hispanic", "man")
hw = _subset("Hispanic", "woman")

wm = _subset("White", "man")
ww = _subset("White", "woman")


In [17]:
def _texts_by_format(group_df, formats):
    """Split one demographic subset into a list of `text` Series, one per format.

    Parameters
    ----------
    group_df : DataFrame with `format` and `text` columns.
    formats : iterable of format values; the output list follows this order.

    Returns
    -------
    list of Series, where entry i holds the `text` values of the rows whose
    `format` equals ``formats[i]``.

    The frame is grouped once and each format is then looked up on that single
    GroupBy object, instead of regrouping the frame for every format. A format
    absent from `group_df` makes `get_group` fail (pandas raises KeyError),
    exactly as before.
    """
    grouped = group_df.groupby('format')
    return [grouped.get_group(fmt).text for fmt in formats]


# Every subset is split using the formats of the full dataset, in order of
# first appearance, so position i means the same format in all eight lists.
_all_formats = chatgpt.format.unique()

bm_texts_by_format = _texts_by_format(bm, _all_formats)
bw_texts_by_format = _texts_by_format(bw, _all_formats)

am_texts_by_format = _texts_by_format(am, _all_formats)
aw_texts_by_format = _texts_by_format(aw, _all_formats)

hm_texts_by_format = _texts_by_format(hm, _all_formats)
hw_texts_by_format = _texts_by_format(hw, _all_formats)

wm_texts_by_format = _texts_by_format(wm, _all_formats)
ww_texts_by_format = _texts_by_format(ww, _all_formats)

In [18]:
def return_cosines(list_of_text):
    """Return the cosine similarity of every unordered pair of texts.

    The texts are encoded with the notebook-level `model`, the embeddings are
    compared against themselves with `util.cos_sim`, and only the entries
    strictly above the diagonal of the resulting matrix are kept, so each
    pair appears once and self-similarities are excluded.

    Parameters
    ----------
    list_of_text : sequence of texts; its length sets the matrix size.

    Returns
    -------
    The selected similarity values as a flat, one-dimensional result, ordered
    row by row (pair (0, 1), (0, 2), ..., (1, 2), ...).
    """
    vectors = model.encode(list_of_text)
    similarity = util.cos_sim(vectors, vectors)
    # k=1 skips the main diagonal.
    rows, cols = np.triu_indices(len(list_of_text), 1)
    return similarity[rows, cols]

In [19]:
def _cosines_per_format(texts_by_format):
    """Run `return_cosines` on each format's texts, passed as a plain list."""
    results = []
    for texts in texts_by_format:
        results.append(return_cosines(list(texts)))
    return results


# Pairwise similarities within each race x gender subset, one entry per format.
bm_cosines = _cosines_per_format(bm_texts_by_format)
bw_cosines = _cosines_per_format(bw_texts_by_format)

am_cosines = _cosines_per_format(am_texts_by_format)
aw_cosines = _cosines_per_format(aw_texts_by_format)

hm_cosines = _cosines_per_format(hm_texts_by_format)
hw_cosines = _cosines_per_format(hw_texts_by_format)

wm_cosines = _cosines_per_format(wm_texts_by_format)
ww_cosines = _cosines_per_format(ww_texts_by_format)


In [20]:
def _flatten(per_format_cosines):
    """Concatenate the per-format similarity values into one NumPy array.

    Values are collected element by element, in format order, and then
    converted with `np.array`.
    """
    values = []
    for row in per_format_cosines:
        values.extend(row)
    return np.array(values)


# One flat array per race x gender subset (formats laid end to end).
bmc = _flatten(bm_cosines)
bwc = _flatten(bw_cosines)

amc = _flatten(am_cosines)
awc = _flatten(aw_cosines)

hmc = _flatten(hm_cosines)
hwc = _flatten(hw_cosines)

wmc = _flatten(wm_cosines)
wwc = _flatten(ww_cosines)

In [11]:
# Assemble a long-format table: one row per pairwise cosine similarity,
# labelled with the race, gender and format of the texts it came from.
#
# Labels are derived from the actual number of values in every
# (group, format) cell rather than from the size of the first cell, so subsets
# of unequal size are labelled correctly. When all cells have the same size
# the result is identical to repeating each label a fixed number of times.

# Kept for any later cell that refers to them; not used for labelling.
group_length = len(bmc)
format_length = len(bm_cosines[0])

formats = chatgpt.format.unique()
n_formats = len(formats)

# Same order as the concatenation of the flat arrays further down.
cosines_by_group = [
    bm_cosines, bw_cosines,
    am_cosines, aw_cosines,
    hm_cosines, hw_cosines,
    wm_cosines, ww_cosines,
]

# Number of similarity values in each (group, format) cell, group-major.
pair_counts = np.array(
    [len(cell) for group in cosines_by_group for cell in group],
    dtype=np.intp,
)
if len(pair_counts) != len(cosines_by_group) * n_formats:
    raise ValueError(
        "every group must hold exactly one entry per format: expected "
        f"{len(cosines_by_group) * n_formats} cells, found {len(pair_counts)}"
    )

# Build one label per cell, then repeat each label by that cell's own count.
race_list = np.repeat(
    np.repeat(['African Americans', 'Asian Americans', 'Hispanic Americans', 'White Americans'], 2 * n_formats),
    pair_counts,
)
gender_list = np.repeat(np.tile(np.repeat(["Man", "Woman"], n_formats), 4), pair_counts)
format_list = np.repeat(np.tile(formats, 8), pair_counts)
cosine_list = np.concatenate([bmc, bwc, amc, awc, hmc, hwc, wmc, wwc])

# zip() stops at the shortest input, so a length mismatch (for example flat
# arrays left over from an earlier run of the kernel) would silently drop or
# mislabel rows. Fail loudly instead.
if len(cosine_list) != len(race_list):
    raise ValueError(
        f"label/value mismatch: {len(race_list)} labels for {len(cosine_list)} "
        "cosine values; re-run the cells that build the *_cosines lists and "
        "the flattened arrays"
    )

cosine_df = pd.DataFrame(list(zip(race_list, gender_list, format_list, cosine_list)), columns = ['race', 'gender', 'format', 'cosine'])

In [13]:
# Write `cosine_df` to a Feather file named after the embedding model in use.
import feather  # NOTE(review): imported mid-notebook; consider moving it next to the other imports.
# File names matching the other checkpoints; enable the one that corresponds
# to the model actually loaded so results are not saved under the wrong name.
# path = 'all-mpnet-base-v2.feather'
# path = 'all_distilroberta_v1.feather'
path = 'all-MiniLM-L12-v2.feather'
feather.write_dataframe(cosine_df, path)