In [1]:
import pandas as pd

# Read the SNLI training split (one JSON record per line) and keep only the
# caption id together with the two sentence columns used by the later cells.
df = pd.read_json(
    './snli_1.0/snli_1.0_train.jsonl',
    lines=True,
)[['captionID', 'sentence1', 'sentence2']]

In [2]:
# Load the identity terms, one per line, stripped of surrounding whitespace and
# lower-cased. The `with` block guarantees the file handle is closed as soon as
# the terms have been read.
with open('identity_terms.txt', 'r') as file:
    id_terms = [line.strip().lower() for line in file]


In [3]:
# Premises: keep the first row of every captionID and map its row label to the
# `sentence1` text.
first_row_per_caption = df.drop_duplicates(subset=['captionID'])
prems = first_row_per_caption['sentence1'].to_dict()

# Hypotheses: every distinct `sentence2` text, keyed by the row label of its
# first occurrence.
distinct_sentence2 = df['sentence2'].drop_duplicates()
hyps = distinct_sentence2.to_dict()

In [4]:
import nltk
from nltk.corpus import stopwords
from collections import Counter, defaultdict
import math

from text_tools import process, count_words

# Turn NLTK's English stopword list into a set. Note that the assignment rebinds
# the name `stopwords`, replacing the corpus reader imported above, so this line
# only works when the import has been (re-)run just before it.
english_stopword_list = stopwords.words("english")
stopwords = set(english_stopword_list)

def bias_info(target, corpus, single_count, threshold=10):
    """Score the words that co-occur with ``target`` and record where they occur.

    Args:
        target: Word whose co-occurring words are analysed.
        corpus: Mapping from an utterance id to an utterance. Each utterance is
            tested with ``target in utterance`` and then iterated item by item
            (assumes a sequence of word tokens -- TODO confirm against
            ``text_tools.process``).
        single_count: Mapping from a word to its overall count; indexed with
            ``single_count[word]``.
        threshold: Minimum ``single_count`` a co-occurring word needs in order
            to receive a score.

    Returns:
        ``None`` when ``single_count[target]`` is 0. Otherwise a tuple
        ``(pair_pmi, pair_location)``:

        * ``pair_pmi`` maps each sufficiently frequent co-occurring word ``w`` to
          ``log2(N * pair_count[w]) - log2(single_count[target] * single_count[w])``
          where ``N`` is ``len(corpus)`` and ``pair_count[w]`` is the number of
          times ``w`` was seen inside utterances containing ``target``.
        * ``pair_location`` is a ``defaultdict(list)`` mapping every co-occurring
          word (regardless of ``threshold``) to the ids of those utterances, one
          entry per occurrence.
    """
    corpus_size = len(corpus)
    target_total = single_count[target]
    if target_total == 0:
        return None

    joint_counts = Counter()
    occurrences = defaultdict(list)

    # Every item of an utterance that contains the target is tallied, which
    # includes the target itself and any repeated words.
    containing_target = (
        (utt_id, tokens) for utt_id, tokens in corpus.items() if target in tokens
    )
    for utt_id, tokens in containing_target:
        for token in tokens:
            joint_counts[token] += 1
            occurrences[token].append(utt_id)

    # Words rarer than `threshold` are left without a score.
    scores = {
        token: math.log2(corpus_size * joint)
        - math.log2(target_total * single_count[token])
        for token, joint in joint_counts.items()
        if single_count[token] >= threshold
    }

    return scores, occurrences


In [5]:
# Run the premise sentences through the project helper `process` (imported from
# text_tools) together with the stopword set. Presumably this tokenizes each
# sentence and drops stopwords -- TODO confirm against text_tools.
processed_prems = process(prems, stopwords)
# Presumably a word -> count mapping; later cells index it with a single word.
single_count_prems = count_words(processed_prems)

# Same two steps for the hypothesis sentences.
processed_hyps = process(hyps, stopwords)
single_count_hyps = count_words(processed_hyps)

In [6]:
# Look up every identity term in `single_count_prems`, keep the terms whose
# count is at least 10, and list them from most to least frequent.
prems_data = {
    'Identity Terms': id_terms,
    'Num of Prems': [single_count_prems[word] for word in id_terms],
}

prems_id_count = (
    pd.DataFrame(prems_data)
    .loc[lambda counts: counts['Num of Prems'] >= 10]
    .sort_values(by='Num of Prems', ascending=False)
)

print(prems_id_count.head(20))

    Identity Terms  Num of Prems
2              man         38667
0            woman         20358
125          young         12161
35           white         11769
32           black         10932
4             girl          8912
3              men          8900
6              boy          8735
1            women          4926
5            girls          2505
33           asian          2184
7             boys          1862
121            old          1477
14            male          1360
13          female          1276
122        elderly           958
37        american           558
36         african           517
15          mother           353
49          indian           291


In [7]:
# Look up every identity term in `single_count_hyps`, keep the terms whose
# count is at least 10, and list them from most to least frequent.
hyps_data = {
    'Identity Terms': id_terms,
    'Num of Hyps': [single_count_hyps[word] for word in id_terms],
}

hyps_id_count = (
    pd.DataFrame(hyps_data)
    .loc[lambda counts: counts['Num of Hyps'] >= 10]
    .sort_values(by='Num of Hyps', ascending=False)
)

print(hyps_id_count.head(20))

    Identity Terms  Num of Hyps
2              man        94829
0            woman        50210
3              men        24395
4             girl        22675
6              boy        21951
1            women        14316
125          young        14230
32           black         7538
5            girls         7064
35           white         6520
7             boys         5210
121            old         5085
13          female         2333
33           asian         2244
15          mother         2119
14            male         1797
20             son         1475
16          father         1366
122        elderly         1256
19        daughter         1092


In [8]:
# Identity terms selected for the co-occurrence analysis in the cells below.
select_id_terms = [
    # male-gendered terms
    'man', 'men', 'boy', 'male',
    # female-gendered terms
    'woman', 'women', 'girl', 'female',
    # race / ethnicity terms
    'caucasian', 'african', 'asian',
]

In [9]:
def _collect_bias(terms, corpus, single_count):
    """Run ``bias_info`` for every term and gather the results per term.

    Args:
        terms: Iterable of target words.
        corpus: Corpus mapping passed straight through to ``bias_info``.
        single_count: Word-count mapping passed straight through to ``bias_info``.

    Returns:
        A ``defaultdict(dict)`` mapping each term to
        ``{'pmi': ..., 'locations': ...}``.
    """
    collected = defaultdict(dict)
    for term in terms:
        result = bias_info(term, corpus, single_count)
        if result is None:
            # bias_info returns None for a term whose count is 0; unpacking that
            # would raise TypeError, so store empty containers of matching type.
            result = ({}, defaultdict(list))
        pair_pmi, pair_locations = result
        collected[term]['pmi'] = pair_pmi
        collected[term]['locations'] = pair_locations
    return collected


prems_bias = _collect_bias(select_id_terms, processed_prems, single_count_prems)

hyps_bias = _collect_bias(select_id_terms, processed_hyps, single_count_hyps)


In [23]:
# Show the six words with the highest PMI against 'asian' in the hypotheses.
output_pairs = hyps_bias['asian']['pmi']
output_df = (
    pd.DataFrame.from_dict(output_pairs, orient='index', columns=['PMI'])
    .reset_index()
    # reset_index names the former (unnamed) index column 'index'
    .rename(columns={'index': 'Terms'})
    .sort_values(by='PMI', ascending=False)
)
print(output_df.head(6))


          Terms       PMI
1         asian  7.740944
748   southeast  7.615413
213     descent  6.489405
811     symbols  6.155982
184    heritage  5.933589
1045  styrofoam  5.740944


In [88]:
# Ids of the premises containing 'man' in which 'rearing' also appears
# (presumably the row labels of `df`, since they come from the keys of `prems`).
prems_bias['man']['locations']['rearing']

[154947,
 196233,
 257484,
 262875,
 262887,
 262893,
 390648,
 409659,
 523746,
 543879]

In [89]:
# Inspect one of the rows listed above. The stored locations are index labels
# (they originate from the keys of the `prems` dict), so the row is looked up by
# label with .loc rather than by position with .iloc.
df.loc[262887].to_dict()

{'captionID': '3104831222.jpg#1',
 'sentence1': 'A man is hanging on to a rearing horse with brush in the background.',
 'sentence2': 'The woman brushed the horse.'}

In [90]:
# Every row that shares this captionID, i.e. the same premise paired with each
# of its hypotheses.
same_caption = df['captionID'] == '3104831222.jpg#1'
df.loc[same_caption].to_dict()

{'captionID': {262887: '3104831222.jpg#1',
  262888: '3104831222.jpg#1',
  262889: '3104831222.jpg#1'},
 'sentence1': {262887: 'A man is hanging on to a rearing horse with brush in the background.',
  262888: 'A man is hanging on to a rearing horse with brush in the background.',
  262889: 'A man is hanging on to a rearing horse with brush in the background.'},
 'sentence2': {262887: 'The woman brushed the horse.',
  262888: 'The man hung onto the horse after it became spooked.',
  262889: 'The man hung onto the horse.'}}