The purpose is to find entailment pairs that involve numerical reasoning (especially addition), and also to classify them into the different kinds of cases we need to be careful about.

When we change the numbers in the discovered pairs, we need to know how the labels should change. For example, if we change an entailment pair with matching numbers so that the hypothesis contains a larger number, then the pair should no longer be an entailment. In which cases should it become neutral, and in which cases should it become a contradiction? Are there any cases in which it remains an entailment even after the numbers are changed?

In [None]:
import src.utilities.utils as utils 
# Load the SNLI 1.0 test split from its JSONL file via the project helper.
# NOTE(review): the cells below iterate over test_data and index each item with
# 'sentence1', 'sentence2' and 'gold_label', so it is presumably a sequence of
# dicts -- confirm against utils.read_jsonl.
test_data = utils.read_jsonl('data/snli_1.0/snli_1.0_test.jsonl')

In [None]:
def convert_list(lst, convert_dict):
    """Translate every number word in ``lst`` through ``convert_dict``.

    Args:
        lst: Number words to translate (e.g. "one", "1").
        convert_dict: Mapping from a number word to the value that represents
            it, so that "one" and "1" both map to the same entry.

    Returns:
        A new list holding the looked-up value for each word, in input order.

    Raises:
        KeyError: If a word in ``lst`` is not a key of ``convert_dict``.
    """
    # TODO: utils.convert_dict only contains numbers between 1-10.
    #       If more number words are used, then a new dictionary that has all
    #       of those words must be used.
    # TODO: Do the exploration with more number words.
    # TODO: What number words are included in the SNLI training set?
    return [convert_dict[word] for word in lst]

def exists_intersection(found_first_list, found_second_list):
    """Tell whether the two number-word lists have a number in common.

    Both lists are first normalised with ``convert_list`` and
    ``utils.convert_dict`` so that different words referring to the same
    number (e.g. "one" and "1") compare equal.

    Args:
        found_first_list: Number words found in the first sentence.
        found_second_list: Number words found in the second sentence.

    Returns:
        True if at least one converted value of the first list also occurs in
        the converted second list, otherwise False.
    """
    # Convert both sides up front, before any comparison takes place.
    first_values = convert_list(found_first_list, utils.convert_dict)
    second_values = convert_list(found_second_list, utils.convert_dict)
    for value in first_values:
        if value in second_values:
            return True
    return False

# TODO: change the following lists and add some more for uncertain quantifiers and other conditions
# Tokens treated as number mentions: the articles "a"/"an", the words
# "one".."ten", and the digit strings "1".."10" (in that order).
words_for_numbers = (
    ['a', 'an']
    + ['one', 'two', 'three', 'four', 'five',
       'six', 'seven', 'eight', 'nine', 'ten']
    + [str(digit) for digit in range(1, 11)]
)
# Words whose presence marks a sentence as having an uncertain quantity.
uncertain = [
    'group',
    'some',
    'most',
]

In [None]:
# Scratch expression: builds a set from a list literal. The result is not
# assigned to a name, so nothing else in this view depends on it.
set([1,2,3])

In [None]:
def get_number_words_from_sentences(each, number_words=None, uncertain_words=None):
    """Find the number words that occur in both sentences of a pair.

    Each sentence is lower-cased, split on whitespace, and every token has
    leading/trailing punctuation stripped, so that "two." or "group," are
    recognised as "two" and "group". Punctuation inside a token is kept, so a
    hyphenated token such as "two-year-old" does not count as "two".

    Args:
        each: Mapping with the keys 'sentence1' and 'sentence2' (strings).
        number_words: Words to look for. Defaults to the module-level
            ``words_for_numbers`` list.
        uncertain_words: Words that disqualify a sentence. Defaults to the
            module-level ``uncertain`` list.

    Returns:
        A tuple ``(found_first, found_second, found_first_list,
        found_second_list)``. The lists hold the number words found in each
        sentence, in the order of ``number_words``. Each flag is True when its
        sentence contains at least one number word and no uncertain word; the
        lists are still populated when a flag was reset by an uncertain word.
    """
    import string  # local import keeps this block self-contained

    # Resolve the defaults at call time so the module-level lists can be
    # edited (see the TODO next to them) without redefining this function.
    if number_words is None:
        number_words = words_for_numbers
    if uncertain_words is None:
        uncertain_words = uncertain

    def _tokens(sentence):
        # Strip punctuation only at the edges of each whitespace token.
        stripped = (tok.strip(string.punctuation) for tok in sentence.lower().split())
        return {tok for tok in stripped if tok}

    sentence1_words = _tokens(each['sentence1'])
    sentence2_words = _tokens(each['sentence2'])

    found_first_list = [word for word in number_words if word in sentence1_words]
    found_second_list = [word for word in number_words if word in sentence2_words]

    # Exclude all pairs that contain uncertain words (TODO: Is this valid?)
    found_first = bool(found_first_list) and not any(
        word in sentence1_words for word in uncertain_words
    )
    found_second = bool(found_second_list) and not any(
        word in sentence2_words for word in uncertain_words
    )
    return found_first, found_second, found_first_list, found_second_list

In [None]:
# FIND potential matching number without addition from entailment pairs
#
# Walks over test_data and prints every entailment pair whose two sentences
# mention number words that refer to the same number, followed by a running
# count of the pairs printed so far.

count = 0
for each in test_data:
    (found_first, found_second,
     found_first_list, found_second_list) = get_number_words_from_sentences(each)

    # TODO: refine this rule to get more data.
    # The guards run in the same order as a single chained `and` would.
    if not (found_first and found_second):
        continue
    if not exists_intersection(found_first_list, found_second_list):
        continue
    if each['gold_label'] != 'entailment':
        continue

    print(found_first_list, found_second_list, each['gold_label'])
    count += 1
    print(each['sentence1'])
    print(each['sentence2'])
    print(count)

In [None]:
# FIND potential addition from entailment pairs
#
# Walks over test_data and prints every entailment pair whose two sentences
# both mention number words but share no common number, followed by a running
# count of the pairs printed so far.

count = 0
for each in test_data:
    (found_first, found_second,
     found_first_list, found_second_list) = get_number_words_from_sentences(each)

    # TODO: refine this rule. This is excluding pairs that should be included.
    #       For example, there could be addition pairs with some common number word.
    # The guards run in the same order as a single chained `and` would.
    if not (found_first and found_second):
        continue
    if exists_intersection(found_first_list, found_second_list):
        continue
    if each['gold_label'] != 'entailment':
        continue

    print(found_first_list, found_second_list, each['gold_label'])
    count += 1
    print(each['sentence1'])
    print(each['sentence2'])
    print(count)