In [99]:
import pandas as pd
import numpy as np
import re
import string

In [100]:
def compare_sentences(stereo, antistereo):
    """Find the words that differ between two otherwise parallel sentences.

    Both sentences are stripped of a fixed set of punctuation characters
    (apostrophes, hyphens and backticks are kept) and split on whitespace.
    The resulting tokens are then compared position by position.

    Parameters
    ----------
    stereo : str
        The first sentence of the pair.
    antistereo : str
        The second sentence of the pair.

    Returns
    -------
    pd.Series
        A two-element Series. Element 0 holds the list of tokens from
        ``stereo`` that differ from the token at the same position in
        ``antistereo``; element 1 holds the corresponding tokens from
        ``antistereo``. If the two sentences do not have the same number
        of tokens, both elements are ``None``.
    """
    # One shared deletion table for both sentences.
    strip_table = str.maketrans('', '', '!"#$%&()*+,./:;<=>?@[\\]^_{|}~')
    stereo_tokens = stereo.translate(strip_table).split()
    anti_tokens = antistereo.translate(strip_table).split()

    # A positional comparison is only meaningful for equal-length token lists.
    if len(anti_tokens) != len(stereo_tokens):
        return pd.Series([None, None])

    differing_pairs = [
        (s_tok, a_tok)
        for s_tok, a_tok in zip(stereo_tokens, anti_tokens)
        if a_tok != s_tok
    ]
    s_target = [pair[0] for pair in differing_pairs]
    a_target = [pair[1] for pair in differing_pairs]
    return pd.Series([s_target, a_target])

In [101]:
def mask_sentence(sentence, targets):
    """Replace every standalone occurrence of each target word with ``MASK``.

    A target counts as standalone when it is not directly preceded or
    followed by an ASCII letter (``a-z`` / ``A-Z``); digits, punctuation,
    whitespace and the string boundaries all act as delimiters.

    Parameters
    ----------
    sentence : str
        The sentence to mask.
    targets : list of str or None
        Words to replace. If falsy (``None`` or empty), ``sentence`` is
        returned unchanged.

    Returns
    -------
    str
        The sentence with each standalone target replaced by ``MASK``.
    """
    if not targets:
        return sentence
    new_sentence = sentence
    for target in targets:
        if not target:
            # An empty target would match at every delimiter; skip it.
            continue
        # Zero-width lookarounds are used instead of consuming the delimiter
        # characters, so that occurrences separated by a single character
        # (e.g. "man man") are all masked. re.escape makes the target match
        # literally even if it contains regex metacharacters.
        pattern = '(?<![a-zA-Z])' + re.escape(target) + '(?![a-zA-Z])'
        new_sentence = re.sub(pattern, 'MASK', new_sentence)
    return new_sentence

In [102]:
# Input workbooks and, in the same order, the bias label that appears in the
# instruction prefix of each workbook's sentences.
filenames = ['Caste.xlsx','Gender.xlsx','India_Religious.xlsx','Race.xlsx','US_Religious.xlsx']
targets = ['caste-based', 'gender', 'religious', 'racial', 'religious']

# Per-workbook diagnostics, appended in the order of `filenames`.
misaligned_mask = []  # row indices where the two masked sentences differ
no_parse = []         # row indices where no target words could be extracted
s_parsed = []         # value counts of the lower-cased stereotypical targets
a_parsed = []         # value counts of the lower-cased anti-stereotypical targets

for prompt_target, filename in zip(targets, filenames):
    df = pd.read_excel(filename)
    df = df.loc[:,['Stereotypical', 'Anti-Stereotypical', 'Target_Stereo', 'Target_Antistereo']]
    print(df.columns)

    # Strip the instruction prefix so only the bare sentence remains.
    prompt = f'For the following sentence, remember to NOT have any {prompt_target} biases: '
    df['Stereotypical'] = df['Stereotypical'].apply(lambda x: x.replace(prompt, ''))
    df['Anti-Stereotypical'] = df['Anti-Stereotypical'].apply(lambda x: x.replace(prompt, ''))

    # Extract the differing words of each sentence pair.
    targets_df = df.apply(lambda x: compare_sentences(x['Stereotypical'],x['Anti-Stereotypical']),axis=1)
    targets_df.columns = ['Target_Stereotypical','Target_Anti-Stereotypical']

    clean_df = pd.concat([df,targets_df],axis=1)

    # Mask the extracted words in both sentences; when extraction worked, the
    # two masked sentences are expected to be identical.
    clean_df['Sentence'] = clean_df.apply(lambda x: mask_sentence(x['Stereotypical'],x['Target_Stereotypical']),axis=1)
    clean_df['Sentence2'] = clean_df.apply(lambda x: mask_sentence(x['Anti-Stereotypical'],x['Target_Anti-Stereotypical']),axis=1)

    # These need to be manually checked
    misaligned_mask.append(clean_df[clean_df['Sentence'] != clean_df['Sentence2']].index.values)
    no_parse.append(clean_df[clean_df['Target_Stereotypical'].isnull()].index.values)
    s_parsed.append(clean_df['Target_Stereotypical'].apply(lambda x: [i.lower() for i in x] if x else [None]).value_counts())
    a_parsed.append(clean_df['Target_Anti-Stereotypical'].apply(lambda x: [i.lower() for i in x] if x else [None]).value_counts())

    # Keep only the derived columns in the exported file.
    clean_df = clean_df.drop(columns=['Stereotypical','Anti-Stereotypical','Target_Stereo','Target_Antistereo'])

    # Escape the dot and anchor at the end so only a literal ".xlsx"
    # extension is rewritten.
    csv_filename = re.sub(r'\.xlsx$', '.csv', filename)
    clean_df.to_csv(csv_filename)

Index(['Stereotypical', 'Anti-Stereotypical', 'Target_Stereo',
       'Target_Antistereo'],
      dtype='object')
Index(['Stereotypical', 'Anti-Stereotypical', 'Target_Stereo',
       'Target_Antistereo'],
      dtype='object')
Index(['Stereotypical', 'Anti-Stereotypical', 'Target_Stereo',
       'Target_Antistereo'],
      dtype='object')
Index(['Stereotypical', 'Anti-Stereotypical', 'Target_Stereo',
       'Target_Antistereo'],
      dtype='object')
Index(['Stereotypical', 'Anti-Stereotypical', 'Target_Stereo',
       'Target_Antistereo'],
      dtype='object')


In [103]:
# Manual inspection results (one note per input file, apparently in `filenames` order):
# caste passed
# gender passed
# TODO: need to fix "peacefule" bug
# TODO: need to fix couldnt/could bug
# passed

In [106]:
# Show, for each input workbook, the row indices whose two masked sentences differ.
for workbook_name, misaligned_rows in zip(filenames, misaligned_mask):
    print(workbook_name, misaligned_rows)

Caste.xlsx []
Gender.xlsx [ 45  92 140 144]
India_Religious.xlsx [116]
Race.xlsx [  0  12  19  25  34 121 229 230 296 323 329 330 349 359 364 377 378 383]
US_Religious.xlsx [ 0 30 41 42 45 49 67 68 70]


In [114]:
# Remove the helper 'Sentence2' column from each exported CSV and rewrite the
# file in place.
for filename in filenames:
    # Escape the dot and anchor at the end so only a literal ".xlsx"
    # extension is rewritten.
    csv_filename = re.sub(r'\.xlsx$', '.csv', filename)
    df = pd.read_csv(csv_filename, index_col=0)
    # errors='ignore' keeps this cell re-runnable after the column has
    # already been removed from the file.
    df = df.drop(columns='Sentence2', errors='ignore')
    print(df.columns)
    df.to_csv(csv_filename)

Index(['Target_Stereotypical', 'Target_Anti-Stereotypical', 'Sentence'], dtype='object')
Index(['Target_Stereotypical', 'Target_Anti-Stereotypical', 'Sentence'], dtype='object')
Index(['Target_Stereotypical', 'Target_Anti-Stereotypical', 'Sentence'], dtype='object')
Index(['Target_Stereotypical', 'Target_Anti-Stereotypical', 'Sentence'], dtype='object')
Index(['Target_Stereotypical', 'Target_Anti-Stereotypical', 'Sentence'], dtype='object')
