In [1]:
import src.utilities.utils as utils 
# Load the three SNLI 1.0 splits (test / train / dev) from their JSONL files.
# NOTE(review): presumably read_jsonl returns a list of dicts, one per line --
# later cells iterate the result and index records by keys such as
# 'gold_label'; confirm against src.utilities.utils.
test_data = utils.read_jsonl('data/snli_1.0/snli_1.0_test.jsonl')
train_data = utils.read_jsonl('data/snli_1.0/snli_1.0_train.jsonl')
dev_data = utils.read_jsonl('data/snli_1.0/snli_1.0_dev.jsonl')

In [2]:
# Digit-string forms of the plural quantities 2 through 10.
nums = list(map(str, range(2, 11)))
# Vague quantifiers; not referenced by the visible cells yet. TODO
special_numericals = ['some', 'many', 'all']
# The helper module's plural number words followed by the digit strings.
number_words = utils.plural_words_for_numbers + nums

# Maps a numeral token (lower-case word or digit string) to its integer value.
# Spelled-out words come first, then the digit strings.
convert_dict_plural = dict(
    zip(
        ['two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten'],
        range(2, 11),
    )
)
convert_dict_plural.update((digits, int(digits)) for digits in nums)

def numerical_word_in_sentence(sentence):
    """Collect the numeral tokens that occur in ``sentence``.

    The sentence is split on whitespace and every token that is a key of
    ``convert_dict_plural`` is kept, in order of appearance (duplicates
    included). The comparison is an exact string match, so tokens that are
    capitalised or have punctuation attached are not picked up.

    Returns:
        list of matching tokens; empty when nothing matches.
    """
    matches = []
    for token in sentence.split():
        if token in convert_dict_plural:
            matches.append(token)
    return matches

def subset_with_numericals(dataset):
    """Keep the records in which at least one sentence contains a numeral token.

    Each record's 'sentence1' and 'sentence2' are scanned with
    ``numerical_word_in_sentence``; the record is kept when either scan finds
    something. Input order is preserved and the records themselves are not
    copied.
    """
    def mentions_numeral(record):
        # Both sentences are always scanned, so a missing key fails loudly.
        first = numerical_word_in_sentence(record['sentence1'])
        second = numerical_word_in_sentence(record['sentence2'])
        return bool(first or second)

    return [record for record in dataset if mentions_numeral(record)]

def subset_with_same_numericals(dataset):
    """Keep the records whose two sentences share the same single numeral token.

    A record qualifies when ``numerical_word_in_sentence`` yields exactly one
    token for 'sentence1' and an identical list for 'sentence2'. Tokens are
    compared as strings, so e.g. '3' and 'three' do not count as the same.
    Input order is preserved.
    """
    def shares_single_numeral(record):
        first = numerical_word_in_sentence(record['sentence1'])
        second = numerical_word_in_sentence(record['sentence2'])
        # TODO: handle sentences that contain more than one numeral token.
        return len(first) == 1 and first == second

    return list(filter(shares_single_numeral, dataset))

In [3]:
def subset_by_relationship_type(dataset, relation):
    """Return the records of ``dataset`` whose 'gold_label' equals ``relation``.

    Args:
        dataset: iterable of dict-like records, each with a 'gold_label' key.
        relation: label value to keep, e.g. 'entailment'.

    Returns:
        A new list holding the matching records in their original order.

    Raises:
        KeyError: if a record has no 'gold_label' key.
    """
    return [data for data in dataset if data['gold_label'] == relation]

In [4]:
# Restrict every split to the pairs whose gold label is 'entailment'.
test_data_entailment = subset_by_relationship_type(test_data, 'entailment')
train_data_entailment = subset_by_relationship_type(train_data, 'entailment')
dev_data_entailment = subset_by_relationship_type(dev_data, 'entailment')
# The contradiction / neutral subsets of the test split are currently disabled:
# test_data_contradiction = subset_by_relationship_type(test_data, 'contradiction')
# test_data_neutral = subset_by_relationship_type(test_data, 'neutral')

In [5]:
# Number of entailment pairs in the dev split.
len(dev_data_entailment)

3329

In [6]:
# Display the entailment pairs selected from test_data (the whole list is shown).
test_data_entailment

[{'annotator_labels': ['entailment',
   'entailment',
   'entailment',
   'neutral',
   'entailment'],
  'captionID': '2677109430.jpg#1',
  'gold_label': 'entailment',
  'pairID': '2677109430.jpg#1r1e',
  'sentence1': 'This church choir sings to the masses as they sing joyous songs from the book at a church.',
  'sentence1_binary_parse': '( ( This ( church choir ) ) ( ( ( sings ( to ( the masses ) ) ) ( as ( they ( ( sing ( joyous songs ) ) ( from ( ( the book ) ( at ( a church ) ) ) ) ) ) ) ) . ) )',
  'sentence1_parse': '(ROOT (S (NP (DT This) (NN church) (NN choir)) (VP (VBZ sings) (PP (TO to) (NP (DT the) (NNS masses))) (SBAR (IN as) (S (NP (PRP they)) (VP (VBP sing) (NP (JJ joyous) (NNS songs)) (PP (IN from) (NP (NP (DT the) (NN book)) (PP (IN at) (NP (DT a) (NN church))))))))) (. .)))',
  'sentence2': 'The church is filled with song.',
  'sentence2_binary_parse': '( ( The church ) ( ( is ( filled ( with song ) ) ) . ) )',
  'sentence2_parse': '(ROOT (S (NP (DT The) (NN church)) (

In [7]:
# Entailment pairs in which at least one of the two sentences contains a
# numeral token (a key of convert_dict_plural), computed for each split.
test_entailment_with_numericals = subset_with_numericals(test_data_entailment)
train_entailment_with_numericals = subset_with_numericals(train_data_entailment)
dev_entailment_with_numericals = subset_with_numericals(dev_data_entailment)

In [8]:
# Display the test-split entailment pairs that mention a numeral token.
test_entailment_with_numericals

[{'annotator_labels': ['neutral',
   'entailment',
   'entailment',
   'entailment',
   'entailment'],
  'captionID': '4460943467.jpg#0',
  'gold_label': 'entailment',
  'pairID': '4460943467.jpg#0r1n',
  'sentence1': '3 young man in hoods standing in the middle of a quiet street facing the camera.',
  'sentence1_binary_parse': '( ( ( ( 3 ( young man ) ) ( in hoods ) ) ( ( standing ( in ( ( the middle ) ( of ( a ( quiet street ) ) ) ) ) ) ( facing ( the camera ) ) ) ) . )',
  'sentence1_parse': '(ROOT (NP (NP (NP (CD 3) (JJ young) (NN man)) (PP (IN in) (NP (NNS hoods)))) (VP (VBG standing) (PP (IN in) (NP (NP (DT the) (NN middle)) (PP (IN of) (NP (DT a) (JJ quiet) (NN street))))) (S (VP (VBG facing) (NP (DT the) (NN camera))))) (. .)))',
  'sentence2': 'Three hood wearing people pose for a picture.',
  'sentence2_binary_parse': '( ( ( Three hood ) ( wearing people ) ) ( ( pose ( for ( a picture ) ) ) . ) )',
  'sentence2_parse': '(ROOT (S (NP (NP (CD Three) (NN hood)) (VP (VBG wearing)

In [9]:
# From the numeral-bearing entailment pairs, keep those where both sentences
# contain the same single numeral token (compared as strings).
test_entailment_with_same_numericals = subset_with_same_numericals(test_entailment_with_numericals)
train_entailment_with_same_numericals = subset_with_same_numericals(train_entailment_with_numericals)
dev_entailment_with_same_numericals = subset_with_same_numericals(dev_entailment_with_numericals)



In [10]:
# Display the test-split pairs whose sentences share the same single numeral token.
test_entailment_with_same_numericals

[{'annotator_labels': ['entailment',
   'entailment',
   'entailment',
   'entailment',
   'entailment'],
  'captionID': '3020218156.jpg#2',
  'gold_label': 'entailment',
  'pairID': '3020218156.jpg#2r1e',
  'sentence1': 'A man standing in front of a building on the phone as two men to the side pain on the side.',
  'sentence1_binary_parse': '( ( ( A man ) ( ( standing ( in ( front ( of ( ( a building ) ( on ( ( the phone ) ( as ( two men ) ) ) ) ) ) ) ) ) ( to ( ( the ( side pain ) ) ( on ( the side ) ) ) ) ) ) . )',
  'sentence1_parse': '(ROOT (NP (NP (DT A) (NN man)) (VP (VBG standing) (PP (IN in) (NP (NP (NN front)) (PP (IN of) (NP (NP (DT a) (NN building)) (PP (IN on) (NP (NP (DT the) (NN phone)) (PP (IN as) (NP (CD two) (NNS men))))))))) (PP (TO to) (NP (NP (DT the) (NN side) (NN pain)) (PP (IN on) (NP (DT the) (NN side)))))) (. .)))',
  'sentence2': 'a guy near a building stands by two other men',
  'sentence2_binary_parse': '( ( ( a guy ) ( near ( a building ) ) ) ( stands ( by

In [11]:
def get_binary_pairs(list_of_dic):
    """Pull the binary-parse strings out of each record.

    Args:
        list_of_dic: iterable of records that each carry the keys
            'sentence1_binary_parse' and 'sentence2_binary_parse'.

    Returns:
        A list of ``(sentence1_binary_parse, sentence2_binary_parse)`` tuples,
        one per record, in input order.
    """
    return [
        (record['sentence1_binary_parse'], record['sentence2_binary_parse'])
        for record in list_of_dic
    ]
    

# binary_entailment_same_example = []
# for each in entailment_with_same_numericals:
#     binary_entailment_same_example.append(
#         (each['sentence1_binary_parse'], each['sentence2_binary_parse'])
#     )

In [12]:
# Reduce each filtered record to its (sentence1, sentence2) binary-parse pair.
test_binary_entailment_same_example = get_binary_pairs(test_entailment_with_same_numericals)
train_binary_entailment_same_example = get_binary_pairs(train_entailment_with_same_numericals)
dev_binary_entailment_same_example = get_binary_pairs(dev_entailment_with_same_numericals)

# Display the test-split pairs.
test_binary_entailment_same_example

[('( ( ( A man ) ( ( standing ( in ( front ( of ( ( a building ) ( on ( ( the phone ) ( as ( two men ) ) ) ) ) ) ) ) ) ( to ( ( the ( side pain ) ) ( on ( the side ) ) ) ) ) ) . )',
  '( ( ( a guy ) ( near ( a building ) ) ) ( stands ( by ( two ( other men ) ) ) ) )'),
 ('( ( ( ( A crowd ) ( of people ) ) ( ( looking up ) ( at ( ( 3 people ) ( on ( ( the edge ) ( of ( ( the roof ) ( of ( a building ) ) ) ) ) ) ) ) ) ) . )',
  "( ( ( The crowd ) ( on ( the ground ) ) ) ( ( is ( watching ( ( 3 people ) ( on ( ( the ( roof 's ) ) edge ) ) ) ) ) . ) )"),
 ('( ( ( Six or ) ( seven people ) ) ( ( are ( standing ( on ( ( ( ( a pier ) ( with ( a table ) ) ) and ) ( ( a pair ) ( of ( glasses ( in ( the foreground ) ) ) ) ) ) ) ) ) . ) )',
  '( ( ( Six or ) ( seven people ) ) ( ( are ( standing ( on ( a pier ) ) ) ) . ) )'),
 ('( ( A boy ) ( ( is ( ( riding down ) ( ( the road ) ( between ( two cows ) ) ) ) ) . ) )',
  '( ( ( ( A boy ) , ) ( ( ( a road ) and ) ( two cows ) ) ) . )'),
 ('( ( The 

In [13]:
# Sanity check: first numeral token found in the sentence1 binary parse of the
# first test pair (the parse is whitespace-separated, so tokens split cleanly).
numerical_word_in_sentence(test_binary_entailment_same_example[0][0])[0]

'two'

In [14]:
def generate_same_to_same_plural(binary_list, label):
    """Build labelled example dicts from numeral-bearing binary-parse pairs.

    For every pair in ``binary_list`` the first numeral token found in the
    pair's first parse is handed, together with the pair, to
    ``utils.same_to_same_plural_number``. Each pair that helper yields becomes
    one dict with ``label`` under 'gold_label', the two parses under
    'sentence1_binary_parse' / 'sentence2_binary_parse', and the two elements
    of ``utils.remove_parenthesis(pair)`` under 'sentence1' / 'sentence2'.

    Args:
        binary_list: list of (sentence1_binary_parse, sentence2_binary_parse)
            pairs.
        label: value stored as 'gold_label' on every generated example.

    Returns:
        A flat list of the generated example dicts.

    Raises:
        IndexError: if the first parse of a pair contains no numeral token.
    """
    examples = []
    for parse_pair in binary_list:
        numeral = numerical_word_in_sentence(parse_pair[0])[0]
        for generated in utils.same_to_same_plural_number(parse_pair, numeral):
            plain = utils.remove_parenthesis(generated)
            examples.append({
                'gold_label': label,
                'sentence1': plain[0],
                'sentence2': plain[1],
                'sentence1_binary_parse': generated[0],
                'sentence2_binary_parse': generated[1],
            })
    return examples

In [15]:
# Generate the new example dicts for each split; every one is labelled 'entailment'.
test_entailment_same_to_same_plural = generate_same_to_same_plural(test_binary_entailment_same_example, 'entailment')
train_entailment_same_to_same_plural = generate_same_to_same_plural(train_binary_entailment_same_example, 'entailment')
dev_entailment_same_to_same_plural = generate_same_to_same_plural(dev_binary_entailment_same_example, 'entailment')


In [16]:
# Number of generated examples for the test split.
len(test_entailment_same_to_same_plural)

896

In [17]:
# Number of generated examples for the train split.
len(train_entailment_same_to_same_plural)

48480

In [18]:
# Number of generated examples for the dev split.
len(dev_entailment_same_to_same_plural)

1056

In [19]:
import json

# Write each generated split to its own JSON file in the current working
# directory, in the order test, train, dev.
for out_path, examples in (
    ('test_entailment_same_to_same_plural.json', test_entailment_same_to_same_plural),
    ('train_entailment_same_to_same_plural.json', train_entailment_same_to_same_plural),
    ('dev_entailment_same_to_same_plural.json', dev_entailment_same_to_same_plural),
):
    with open(out_path, 'w') as outfile:
        json.dump(examples, outfile)