In [2]:
import pandas as pd

df = pd.read_json('./snli_1.0/snli_1.0_train.jsonl', lines=True)
df = df[['captionID', 'sentence1', 'sentence2']]

In [3]:
file = open('identity_terms.txt', 'r')
id_terms = []
for line in file:
    term = line.strip().lower()
    id_terms.append(term)


In [4]:
prems = df.drop_duplicates(subset=['captionID'])['sentence1'].to_dict()
hyps = df['sentence2'].drop_duplicates().to_dict()

In [5]:
import nltk
from nltk.corpus import stopwords
from collections import Counter, defaultdict
import math

from text_tools import process, count_words

stopwords = set(stopwords.words("english"))

def bias_info(target, corpus, single_count, threshold=10):
    N = len(corpus)
    pair_count = Counter()

    if single_count[target] == 0:
        return None

    pair_pmi = dict()
    pair_location = defaultdict(list) 
    
    for idx, uttr in corpus.items():
        if target in uttr:
            for w in uttr:
                pair_count[w] += 1 
                pair_location[w].append(idx)
    
    for w in pair_count:
        if single_count[w] >= threshold:
            pmi = math.log2(N * pair_count[w]) - math.log2(single_count[target] * single_count[w])
            pair_pmi[w] = pmi 

    return pair_pmi, pair_location


In [6]:
processed_prems = process(prems, stopwords)
single_count_prems = count_words(processed_prems)

processed_hyps = process(hyps, stopwords)
single_count_hyps = count_words(processed_prems)

In [7]:
prems_data = {
    'Identity Terms': id_terms,
    'Num of Prems' : [single_count_prems[term] for term in id_terms]
}

prems_id_count = pd.DataFrame(prems_data)
prems_id_count = prems_id_count[prems_id_count['Num of Prems'] >= 10]
prems_id_count = prems_id_count.sort_values(by='Num of Prems', ascending=False)

print(prems_id_count.head(20))

    Identity Terms  Num of Prems
2              man         38667
0            woman         20358
125          young         12161
35           white         11769
32           black         10932
4             girl          8912
3              men          8900
6              boy          8735
1            women          4926
5            girls          2505
33           asian          2184
7             boys          1862
121            old          1477
14            male          1360
13          female          1276
122        elderly           958
37        american           558
36         african           517
15          mother           353
49          indian           291


In [8]:
hyps_data = {
    'Identity Terms': id_terms,
    'Num of Hyps' : [single_count_hyps[term] for term in id_terms]
}

hyps_id_count = pd.DataFrame(hyps_data)
hyps_id_count = hyps_id_count[hyps_id_count['Num of Hyps'] >= 10]
hyps_id_count = hyps_id_count.sort_values(by='Num of Hyps', ascending=False)

print(hyps_id_count.head(20))

    Identity Terms  Num of Hyps
2              man        38667
0            woman        20358
125          young        12161
35           white        11769
32           black        10932
4             girl         8912
3              men         8900
6              boy         8735
1            women         4926
5            girls         2505
33           asian         2184
7             boys         1862
121            old         1477
14            male         1360
13          female         1276
122        elderly          958
37        american          558
36         african          517
15          mother          353
49          indian          291


In [14]:
select_id_terms = [
    'man', 
    'men',
    'boy',
    'male',
    'woman',
    'women',
    'girl',
    'female',
    'caucasian',
    'african',
    'asian'
]

In [15]:
prems_bias = defaultdict(dict)
for term in select_id_terms:
    pair_pmi, pair_locations = bias_info(term, processed_prems, single_count_prems)
    prems_bias[term]['pmi'] = pair_pmi
    prems_bias[term]['locations'] = pair_locations

hyps_bias = defaultdict(dict)
for term in select_id_terms:
    pair_pmi, pair_locations = bias_info(term, processed_hyps, single_count_hyps)
    hyps_bias[term]['pmi'] = pair_pmi
    hyps_bias[term]['locations'] = pair_locations


In [25]:
output_pairs = prems_bias['asian']['pmi']
output_df = pd.DataFrame.from_dict(output_pairs, orient='index', columns=['PMI']).reset_index()
output_df.columns = ['Terms', 'PMI']
output_df = output_df.sort_values(by='PMI', ascending=False)
print(output_df.head(15))


           Terms       PMI
10         asian  6.113303
403    southeast  5.627876
99       descent  5.275054
267         east  4.528341
315     language  4.265306
1315      pastel  3.997826
1546      likely  3.911669
1803     patrick  3.890911
1334      petals  3.890911
752         pigs  3.791375
194    styrofoam  3.791375
2102  schoolgirl  3.791375
1918     shading  3.791375
398       badges  3.791375
831      fabrics  3.791375


In [26]:
prems_bias['asian']['locations']['schoolgirl']

[496623, 496632]

In [27]:
df.iloc[496632].to_dict()

{'captionID': '4837948080.jpg#0',
 'sentence1': 'An Asian schoolgirl in a typical schoolgirl uniform, dark pullover, plaid skirt, and knee socks with penny loafers.',
 'sentence2': 'A girl is wearing a shirt, skirt, socks, and shoes.'}