In [1]:
from itertools import product
import pandas as pd
import numpy as np
import random

In [2]:
S = pd.read_excel("sentence.xlsx")
P = pd.read_excel("pronouns.xlsx")
B = pd.read_excel("GPT predicates.xlsx")
N = pd.read_excel("names.xlsx")

In [3]:
Specific_ID = ["progressive", "Israeli", "Ashkenazi", "Orthodox", "intermarried", "Reform", "Conservative", "Sephardic", "observant", "religious", "converted"]
Women_ID = ["Women", "Mothers"]
male_names = list(N[N.Label == "male"].Name)
female_names = list(N[N.Label == "female"].Name)
NUM_NAMES_TO_SAMPLE = 5
NUM_COUNTERS_TO_SAMPLE = 2

In [4]:
%%time

## General Jewish stereotypes
# TODO fix masculine-specific ones
df = pd.DataFrame()
id_adj = "Jewish"
id_noun_pl = "Jews"
id_noun_sing = "Jew"
counter_adjs = ["Christian", "atheist", "secular", "nonreligious"]
counter_nouns_pl = ["Christians", "atheists", "secular people", "nonreligious people"]
counter_nouns_sing = [["Christian", "a"], ["atheist", "an"], ["secular person", "a"], ["nonreligious person", "a"]]


# prioritizing readability over efficiency for now
for pred in B["Jewish - Plural"]:
    if pd.isna(pred):
        continue
    pairs = []
    # plural adjective forms
    for sent in S["Plural Adj Form"]:
        stereo_sent = sent.replace("<Identity adj>", id_adj[0].upper() + id_adj[1:]).replace("<identity adj>", id_adj).replace("<predicate>", pred)
        counter_sents = [sent.replace("<Identity adj>", c_adj[0].upper() + c_adj[1:]).replace("<identity adj>", c_adj).replace("<predicate>", pred) for c_adj in random.sample(counter_adjs, NUM_COUNTERS_TO_SAMPLE)]
        pairs += [["Jewish", stereo_sent, c_sent] for c_sent in counter_sents]
          
    # plural noun forms
    for sent in S["Plural Noun Form"]:
        stereo_sent = sent.replace("<Identity noun>", id_noun_pl[0].upper() + id_noun_pl[1:]).replace("<identity noun>", id_noun_pl).replace("<predicate>", pred)
        counter_sents = [sent.replace("<Identity noun>", c_noun[0].upper() + c_noun[1:]).replace("<identity noun>", c_noun).replace("<predicate>", pred) for c_noun in random.sample(counter_nouns_pl, NUM_COUNTERS_TO_SAMPLE)]
        pairs += [["Jewish", stereo_sent, c_sent] for c_sent in counter_sents]
    
    # write out
    df = pd.concat([df, pd.DataFrame(pairs)])
    
    
for pred in B["Jewish - Singular"]:
    if pd.isna(pred):
        continue
    pairs = []
    # singular adjective forms
    for sent in S["Sing Adj Form"]:
        if pd.isna(sent):
            continue
        # pronoun only forms - he/him
        stereo_sent = sent.replace("<predicate>", pred).replace("<Subject>", "He").replace("<subject>", "he").replace("<object>", "him").replace("<identity adj>", id_adj).replace("<possessive>", "his").replace("<reflexive>", "himself")
        counter_sents = [sent.replace("<predicate>", pred).replace("<Subject>", "He").replace("<subject>", "he").replace("<object>", "him").replace("<identity adj>", c_adj).replace("<possessive>", "his").replace("<reflexive>", "himself") for c_adj in random.sample(counter_adjs, NUM_COUNTERS_TO_SAMPLE)]
        pairs += [["Jewish", stereo_sent, c_sent] for c_sent in counter_sents]
        
        # randomly sample 5 male names
        for name in random.sample(male_names, NUM_NAMES_TO_SAMPLE):
            stereo_sent = sent.replace("<predicate>", pred).replace("<Subject>", name).replace("<subject>", "he").replace("<object>", name).replace("<identity adj>", id_adj).replace("<possessive>", "his").replace("<reflexive>", "himself")
            counter_sents = [sent.replace("<predicate>", pred).replace("<Subject>", name).replace("<subject>", "he").replace("<object>", name).replace("<identity adj>", c_adj).replace("<possessive>", "his").replace("<reflexive>", "himself") for c_adj in random.sample(counter_adjs, NUM_COUNTERS_TO_SAMPLE)]
            pairs += [["Jewish", stereo_sent, c_sent] for c_sent in counter_sents]
        
        # skip female pronouns and names for "black suits and hats" and "weak and feminine"
        if "black suit" in pred or "weak and feminine" in pred:
            continue
        
        # pronoun only forms - she/her
        stereo_sent = sent.replace("<predicate>", pred).replace("<Subject>", "She").replace("<subject>", "she").replace("<object>", "her").replace("<identity adj>", id_adj).replace("<possessive>", "her").replace("<reflexive>", "herself")
        counter_sents = [sent.replace("<predicate>", pred).replace("<Subject>", "She").replace("<subject>", "she").replace("<object>", "her").replace("<identity adj>", c_adj).replace("<possessive>", "her").replace("<reflexive>", "herself") for c_adj in random.sample(counter_adjs, NUM_COUNTERS_TO_SAMPLE)]
        pairs += [["Jewish", stereo_sent, c_sent] for c_sent in counter_sents]
        
        # randomly sample 5 female names
        for name in random.sample(female_names, NUM_NAMES_TO_SAMPLE):
            stereo_sent = sent.replace("<predicate>", pred).replace("<Subject>", name).replace("<subject>", "she").replace("<object>", name).replace("<identity adj>", id_adj).replace("<possessive>", "her").replace("<reflexive>", "herself")
            counter_sents = [sent.replace("<predicate>", pred).replace("<Subject>", name).replace("<subject>", "she").replace("<object>", name).replace("<identity adj>", c_adj).replace("<possessive>", "her").replace("<reflexive>", "herself") for c_adj in random.sample(counter_adjs, NUM_COUNTERS_TO_SAMPLE)]
            pairs += [["Jewish", stereo_sent, c_sent] for c_sent in counter_sents]
        
    # singular noun forms
    for sent in S["Sing Noun Form"]:
        if pd.isna(sent):
            continue
        # article is always "a" for id_noun == "Jew"
        id_article = "a"
        
        # pronoun only forms - he/him
        stereo_sent = sent.replace("<predicate>", pred).replace("<Subject>", "He").replace("<subject>", "he").replace("<object>", "him").replace("<identity noun>", id_noun_sing).replace("<article>", id_article).replace("<possessive>", "his").replace("<reflexive>", "himself")
        counter_sents = [sent.replace("<predicate>", pred).replace("<Subject>", "He").replace("<subject>", "he").replace("<object>", "him").replace("<identity noun>", c_noun[0]).replace("<article>", c_noun[1]).replace("<possessive>", "his").replace("<reflexive>", "himself") for c_noun in random.sample(counter_nouns_sing, NUM_COUNTERS_TO_SAMPLE)]
        pairs += [["Jewish", stereo_sent, c_sent] for c_sent in counter_sents]
        
        # randomly sample 5 male names
        for name in random.sample(male_names, NUM_NAMES_TO_SAMPLE):
            stereo_sent = sent.replace("<predicate>", pred).replace("<Subject>", name).replace("<subject>", "he").replace("<object>", name).replace("<identity noun>", id_noun_sing).replace("<article>", id_article).replace("<possessive>", "his").replace("<reflexive>", "himself")
            counter_sents = [sent.replace("<predicate>", pred).replace("<Subject>", name).replace("<subject>", "he").replace("<object>", name).replace("<identity noun>", c_noun[0]).replace("<article>", c_noun[1]).replace("<possessive>", "his").replace("<reflexive>", "himself") for c_noun in random.sample(counter_nouns_sing, NUM_COUNTERS_TO_SAMPLE)]
            pairs += [["Jewish", stereo_sent, c_sent] for c_sent in counter_sents]
        
        # skip female pronouns and names for "black suits and hats" and "weak and feminine"
        if "black suit" in pred or "weak and feminine" in pred:
            continue
        
        # pronoun only forms - she/her
        stereo_sent = sent.replace("<predicate>", pred).replace("<Subject>", "She").replace("<subject>", "she").replace("<object>", "her").replace("<identity noun>", id_noun_sing).replace("<article>", id_article).replace("<possessive>", "her").replace("<reflexive>", "herself")
        counter_sents = [sent.replace("<predicate>", pred).replace("<Subject>", "She").replace("<subject>", "she").replace("<object>", "her").replace("<identity noun>", c_noun[0]).replace("<article>", c_noun[1]).replace("<possessive>", "her").replace("<reflexive>", "herself") for c_noun in random.sample(counter_nouns_sing, NUM_COUNTERS_TO_SAMPLE)]
        pairs += [["Jewish", stereo_sent, c_sent] for c_sent in counter_sents]
        
        # randomly sample 5 female names
        for name in random.sample(female_names, NUM_NAMES_TO_SAMPLE):
            stereo_sent = sent.replace("<predicate>", pred).replace("<Subject>", name).replace("<subject>", "she").replace("<object>", name).replace("<identity noun>", id_noun_sing).replace("<article>", id_article).replace("<possessive>", "her").replace("<reflexive>", "herself")
            counter_sents = [sent.replace("<predicate>", pred).replace("<Subject>", name).replace("<subject>", "she").replace("<object>", name).replace("<identity noun>", c_noun[0]).replace("<article>", c_noun[1]).replace("<possessive>", "her").replace("<reflexive>", "herself") for c_noun in random.sample(counter_nouns_sing, NUM_COUNTERS_TO_SAMPLE)]
            pairs += [["Jewish", stereo_sent, c_sent] for c_sent in counter_sents]
    # write out
    df = pd.concat([df, pd.DataFrame(pairs)])

print(df)


          0                                                  1  \
0    Jewish                        Jewish people are punctual.   
1    Jewish                        Jewish people are punctual.   
2    Jewish                    All Jewish people are punctual.   
3    Jewish                    All Jewish people are punctual.   
4    Jewish         We expect that Jewish people are punctual.   
..      ...                                                ...   
331  Jewish  Everyone hates Hannah because she is a Jew and...   
332  Jewish  Everyone hates Sarah because she is a Jew and ...   
333  Jewish  Everyone hates Sarah because she is a Jew and ...   
334  Jewish  Everyone hates Leah because she is a Jew and s...   
335  Jewish  Everyone hates Leah because she is a Jew and s...   

                                                     2  
0                       Christian people are punctual.  
1                         Atheist people are punctual.  
2                     All secular pe

In [5]:
%%time

## Identity-specific
for identity in Specific_ID:
    id_adj = identity + " Jewish"
    id_noun_pl = identity + " Jews"
    id_noun_sing = identity + " Jew"
    counter_adjs = ["Christian", "atheist", "secular", "nonreligious"]
    counter_nouns_pl = ["Christians", "atheists", "secular people", "nonreligious people"]
    counter_nouns_sing = [["Christian", "a"], ["atheist", "an"], ["secular person", "a"], ["nonreligious person", "a"]]


    # prioritizing readability over efficiency for now
    for pred in B[identity + " - Plural"]:
        if pd.isna(pred):
            continue
        pairs = []
        # plural adjective forms
        for sent in S["Plural Adj Form"]:
            stereo_sent = sent.replace("<Identity adj>", id_adj[0].upper() + id_adj[1:]).replace("<identity adj>", id_adj).replace("<predicate>", pred)
            counter_sents = [sent.replace("<Identity adj>", c_adj[0].upper() + c_adj[1:]).replace("<identity adj>", c_adj).replace("<predicate>", pred) for c_adj in random.sample(counter_adjs, NUM_COUNTERS_TO_SAMPLE)]
            pairs += [[identity, stereo_sent, c_sent] for c_sent in counter_sents]
          
        # plural noun forms
        for sent in S["Plural Noun Form"]:
            stereo_sent = sent.replace("<Identity noun>", id_noun_pl[0].upper() + id_noun_pl[1:]).replace("<identity noun>", id_noun_pl).replace("<predicate>", pred)
            counter_sents = [sent.replace("<Identity noun>", c_noun[0].upper() + c_noun[1:]).replace("<identity noun>", c_noun).replace("<predicate>", pred) for c_noun in random.sample(counter_nouns_pl, NUM_COUNTERS_TO_SAMPLE)]
            pairs += [[identity, stereo_sent, c_sent] for c_sent in counter_sents]
    
        # write out
        df = pd.concat([df, pd.DataFrame(pairs)])
        #print(df)
    
    
    for pred in B[identity + " - Singular"]:
        if pd.isna(pred):
            continue
        pairs = []                          
        # singular noun forms
        for sent in S["Sing Noun Form"]:
            if pd.isna(sent):
                continue

            id_article = "an" if id_noun_sing[0].lower() in ['a', 'e', 'i', 'o', 'u'] else "a"
        
            # pronoun only forms - he/him
            stereo_sent = sent.replace("<predicate>", pred).replace("<Subject>", "He").replace("<subject>", "he").replace("<object>", "him").replace("<identity noun>", id_noun_sing).replace("<article>", id_article).replace("<possessive>", "his").replace("<reflexive>", "himself")
            counter_sents = [sent.replace("<predicate>", pred).replace("<Subject>", "He").replace("<subject>", "he").replace("<object>", "him").replace("<identity noun>", c_noun[0]).replace("<article>", c_noun[1]).replace("<possessive>", "his").replace("<reflexive>", "himself") for c_noun in random.sample(counter_nouns_sing, NUM_COUNTERS_TO_SAMPLE)]
            pairs += [[identity, stereo_sent, c_sent] for c_sent in counter_sents]
        
            # pronoun only forms - she/her
            stereo_sent = sent.replace("<predicate>", pred).replace("<Subject>", "She").replace("<subject>", "she").replace("<object>", "her").replace("<identity noun>", id_noun_sing).replace("<article>", id_article).replace("<possessive>", "her").replace("<reflexive>", "herself")
            counter_sents = [sent.replace("<predicate>", pred).replace("<Subject>", "She").replace("<subject>", "she").replace("<object>", "her").replace("<identity noun>", c_noun[0]).replace("<article>", c_noun[1]).replace("<possessive>", "her").replace("<reflexive>", "herself") for c_noun in random.sample(counter_nouns_sing, NUM_COUNTERS_TO_SAMPLE)]
            pairs += [[identity, stereo_sent, c_sent] for c_sent in counter_sents]
        
            # male names
            for name in random.sample(male_names, NUM_NAMES_TO_SAMPLE):
                stereo_sent = sent.replace("<predicate>", pred).replace("<Subject>",name).replace("<subject>", "he").replace("<object>", name).replace("<identity noun>", id_noun_sing).replace("<article>", id_article).replace("<possessive>", "his").replace("<reflexive>", "himself")
                counter_sents = [sent.replace("<predicate>", pred).replace("<Subject>", name).replace("<subject>", "he").replace("<object>", name).replace("<identity noun>", c_noun[0]).replace("<article>", c_noun[1]).replace("<possessive>", "his").replace("<reflexive>", "himself") for c_noun in random.sample(counter_nouns_sing, NUM_COUNTERS_TO_SAMPLE)]
                pairs += [[identity, stereo_sent, c_sent] for c_sent in counter_sents]
            # female names
            for name in random.sample(female_names, NUM_NAMES_TO_SAMPLE):
                stereo_sent = sent.replace("<predicate>", pred).replace("<Subject>", name).replace("<subject>", "she").replace("<object>", name).replace("<identity noun>", id_noun_sing).replace("<article>", id_article).replace("<possessive>", "her").replace("<reflexive>", "herself")
                counter_sents = [sent.replace("<predicate>", pred).replace("<Subject>", name).replace("<subject>", "she").replace("<object>", name).replace("<identity noun>", c_noun[0]).replace("<article>", c_noun[1]).replace("<possessive>", "her").replace("<reflexive>", "herself") for c_noun in random.sample(counter_nouns_sing, NUM_COUNTERS_TO_SAMPLE)]
                pairs += [[identity, stereo_sent, c_sent] for c_sent in counter_sents]
    
        # write out
        df = pd.concat([df, pd.DataFrame(pairs)])
print(df)


             0                                                  1  \
0       Jewish                        Jewish people are punctual.   
1       Jewish                        Jewish people are punctual.   
2       Jewish                    All Jewish people are punctual.   
3       Jewish                    All Jewish people are punctual.   
4       Jewish         We expect that Jewish people are punctual.   
..         ...                                                ...   
163  converted  Everyone hates Debra because she is a converte...   
164  converted  Everyone hates Sarah because she is a converte...   
165  converted  Everyone hates Sarah because she is a converte...   
166  converted  Everyone hates Rebecca because she is a conver...   
167  converted  Everyone hates Rebecca because she is a conver...   

                                                     2  
0                       Christian people are punctual.  
1                         Atheist people are punctual.  


In [6]:
%%time

## women-specific
for identity in Women_ID:
    id_noun_pl = "Jewish women" if identity == "Women" else "Jewish mothers"
    id_noun_sing = "Jewish woman" if identity == "Women" else "Jewish mother"
    counter_nouns_pl = ["Christian women", "atheist women", "secular women", "nonreligious women"] if identity == "Women" else ["Christian mothers", "atheist mothers", "secular mothers", "nonreligious mothers"]
    counter_nouns_sing = [["Christian woman", "a"], ["atheist woman", "an"], ["secular woman", "a"], ["nonreligious woman", "a"]] if identity == "Women" else [["Christian mother", "a"], ["atheist mother", "an"], ["secular mother", "a"], ["nonreligious mother", "a"]]


    # prioritizing readability over efficiency for now
    for pred in B[identity + " - Plural"]:
        if pd.isna(pred):
            continue
        pairs = []
        # plural noun forms
        for sent in S["Plural Noun Form"]:
            stereo_sent = sent.replace("<Identity noun>", id_noun_pl[0].upper() + id_noun_pl[1:]).replace("<identity noun>", id_noun_pl).replace("<predicate>", pred)
            counter_sents = [sent.replace("<Identity noun>", c_noun[0].upper() + c_noun[1:]).replace("<identity noun>", c_noun).replace("<predicate>", pred) for c_noun in random.sample(counter_nouns_pl, NUM_COUNTERS_TO_SAMPLE)]
            pairs += [[identity, stereo_sent, c_sent] for c_sent in counter_sents]
    
        # write out
        df = pd.concat([df, pd.DataFrame(pairs)])
    
    
    for pred in B[identity + " - Singular"]:
        if pd.isna(pred):
            continue
        pairs = []

        # singular noun forms
        for sent in S["Sing Noun Form"]:
            if pd.isna(sent):
                continue

            id_article = "a"
        
            # pronoun only forms - she/her
            stereo_sent = sent.replace("<predicate>", pred).replace("<Subject>", "She").replace("<subject>", "she").replace("<object>", "her").replace("<identity noun>", id_noun_sing).replace("<article>", id_article).replace("<possessive>", "her").replace("<reflexive>", "herself")
            counter_sents = [sent.replace("<predicate>", pred).replace("<Subject>", "She").replace("<subject>", "she").replace("<object>", "her").replace("<identity noun>", c_noun[0]).replace("<article>", c_noun[1]).replace("<possessive>", "her").replace("<reflexive>", "herself") for c_noun in random.sample(counter_nouns_sing, NUM_COUNTERS_TO_SAMPLE)]
            pairs += [[identity, stereo_sent, c_sent] for c_sent in counter_sents]
        
            # name forms
            for name in random.sample(female_names, NUM_NAMES_TO_SAMPLE):
                stereo_sent = sent.replace("<predicate>", pred).replace("<Subject>", name).replace("<subject>", "she").replace("<object>", name).replace("<identity noun>", id_noun_sing).replace("<article>", id_article).replace("<possessive>", "her").replace("<reflexive>", "herself")
                counter_sents = [sent.replace("<predicate>", pred).replace("<Subject>", name).replace("<subject>", "she").replace("<object>", name).replace("<identity noun>", c_noun[0]).replace("<article>", c_noun[1]).replace("<possessive>", "her").replace("<reflexive>", "herself") for c_noun in random.sample(counter_nouns_sing, NUM_COUNTERS_TO_SAMPLE)]
                pairs += [[identity, stereo_sent, c_sent] for c_sent in counter_sents]
        
        # write out
        df = pd.concat([df, pd.DataFrame(pairs)])
print(df)



          0                                                  1  \
0    Jewish                        Jewish people are punctual.   
1    Jewish                        Jewish people are punctual.   
2    Jewish                    All Jewish people are punctual.   
3    Jewish                    All Jewish people are punctual.   
4    Jewish         We expect that Jewish people are punctual.   
..      ...                                                ...   
79  Mothers  Everyone hates Debra because she is a Jewish m...   
80  Mothers  Everyone hates Leah because she is a Jewish mo...   
81  Mothers  Everyone hates Leah because she is a Jewish mo...   
82  Mothers  Everyone hates Danielle because she is a Jewis...   
83  Mothers  Everyone hates Danielle because she is a Jewis...   

                                                    2  
0                      Christian people are punctual.  
1                        Atheist people are punctual.  
2                    All secular people

In [7]:
df.columns = ["target_ID", "stereo", "counter"]
df = df.drop_duplicates()
df.shape


(68472, 3)

In [8]:
df.to_csv("jewish_templates.csv", index=False)

In [9]:
df.to_pickle("jewish_templates.pkl")

In [10]:
df.target_ID.unique()

array(['Jewish', 'progressive', 'Israeli', 'Ashkenazi', 'Orthodox',
       'intermarried', 'Reform', 'Conservative', 'Sephardic', 'observant',
       'religious', 'converted', 'Women', 'Mothers'], dtype=object)