# Developing the dependency-based systems

## Retrieving dependencies and tokens
Dependencies and tokens are added to the inference dicts for faster looping.

In [1]:
def append_dict(nlp, inference_dict):
    """
    Enriches every pair of the inference_dict in place with four extra keys:
    't_tokens' / 'h_tokens' (lowercased token texts) and 't_deps' / 'h_deps'
    ((lemma, relation, head lemma) triples) for the text and the hypothesis.

    :param nlp: SpaCy pipeline
    :param inference_dict: an inference dict
    :return: the same inference dict, now appended
    """
    def lowered_tokens(doc):
        # Tokens are collected sentence by sentence
        return [tok.text.lower() for sentence in doc.sents for tok in sentence]

    def dependency_triples(doc):
        return [(tok.lemma_, tok.dep_, tok.head.lemma_) for tok in doc]

    for pair in inference_dict.values():
        text_doc = nlp(pair['t'])
        hypothesis_doc = nlp(pair['h'])

        pair['t_tokens'] = lowered_tokens(text_doc)
        pair['h_tokens'] = lowered_tokens(hypothesis_doc)
        pair['t_deps'] = dependency_triples(text_doc)
        pair['h_deps'] = dependency_triples(hypothesis_doc)

    return inference_dict

In [2]:
import spacy
import pickle


def _load_pickle(path):
    """Loads one pickled inference dict and closes the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load trusted, locally produced files.
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


def _merge_dicts(*inference_dicts):
    """Merges inference dicts under fresh consecutive keys (1, 2, ...) so duplicate keys cannot clash."""
    merged = dict()
    for inference_dict in inference_dicts:
        for feats in inference_dict.values():
            merged[len(merged) + 1] = feats
    return merged


nlp = spacy.load('nl_core_news_lg')

dev_dict_sicknl = _load_pickle("../data/sicknl/dev.p")
train_dict_sicknl = _load_pickle("../data/sicknl/train.p")
test_dict_sicknl = _load_pickle("../data/sicknl/test.p")
dev_dict_rte3 = _load_pickle("../data/rte3/dev.p")
train_dict_rte3 = _load_pickle("../data/rte3/train.p")
test_dict_rte3 = _load_pickle("../data/rte3/test.p")

# Add tokens and dependencies to every pair (append_dict updates the entries in place)
dev_dict_sicknl = append_dict(nlp, dev_dict_sicknl)
train_dict_sicknl = append_dict(nlp, train_dict_sicknl)
test_dict_sicknl = append_dict(nlp, test_dict_sicknl)
dev_dict_rte3 = append_dict(nlp, dev_dict_rte3)
train_dict_rte3 = append_dict(nlp, train_dict_rte3)
test_dict_rte3 = append_dict(nlp, test_dict_rte3)

# Handle duplicate keys while merging SICK-NL and RTE-3.
# The merged dicts hold the very same entry objects as the per-corpus dicts, which
# were appended above, so the pairs do not have to be parsed a second time.
dev_dict_merged = _merge_dicts(dev_dict_sicknl, dev_dict_rte3)
train_dict_merged = _merge_dicts(train_dict_sicknl, train_dict_rte3)
test_dict_merged = _merge_dicts(test_dict_sicknl, test_dict_rte3)

## Building three development algorithms
One for developing manual rules, one for developing automated rules and one for preventing overfitting.

### Defining possible overlap paths
Matching rules consist of two comparisons: both between 1) the lemmas and 2) the relations. First, 6 functions are built to define 6 possible paths:

In [3]:
"""
6 paths to compare words. Note that a match is positive if a word in the hypothesis is
a substring of the word in the text (e.g. volleybal = bal).

:param dep_t: one of the dependency triplets in the text (tuple)
:param dep_h: one of the dependency triplets in the hypothesis (tuple)
:return: True if it is a match, False otherwise
"""
def linear_full_overlap(dep_t, dep_h):
    """Tells whether the hypothesis dependent and head are substrings of the text dependent and head."""
    if dep_h[0] not in dep_t[0]:
        return False
    return dep_h[2] in dep_t[2]


def linear_head_overlap(dep_t, dep_h):
    """Tells whether the hypothesis head is a substring of the text head."""
    hypothesis_head = dep_h[2]
    text_head = dep_t[2]
    return hypothesis_head in text_head


def linear_dependent_overlap(dep_t, dep_h):
    """Tells whether the hypothesis dependent is a substring of the text dependent."""
    hypothesis_dependent = dep_h[0]
    text_dependent = dep_t[0]
    return hypothesis_dependent in text_dependent


def cross_full_overlap(dep_t, dep_h):
    """Tells whether the hypothesis head is a substring of the text dependent
    and the hypothesis dependent is a substring of the text head."""
    if dep_h[2] not in dep_t[0]:
        return False
    return dep_h[0] in dep_t[2]


def cross_partial_overlap_1(dep_t, dep_h):
    """Tells whether the hypothesis head is a substring of the text dependent."""
    hypothesis_head = dep_h[2]
    text_dependent = dep_t[0]
    return hypothesis_head in text_dependent


def cross_partial_overlap_2(dep_t, dep_h):
    """Tells whether the hypothesis dependent is a substring of the text head."""
    hypothesis_dependent = dep_h[0]
    text_head = dep_t[2]
    return hypothesis_dependent in text_head

### Applying rules
Subsequently, a function that combines both comparisons is built. The relations to compare are yet to be defined.

In [4]:
def apply_rule(t_deps, dep_h, rel_rels, overlap_func):
    """
    Checks whether the relations match and the words match according to the specified overlap path.

    The hypothesis relation has to occur in one of the two relation lists, while the relation of
    the text triple has to occur in the other one. Both directions are considered, so the outcome
    does not depend on the order of the two lists, even when they share relations.

    :param t_deps: a list of all dependency triples (tuples) in the text
    :param dep_h: the current (i.e. to be checked) dependency triple (tuple) in the hypothesis
    :param rel_rels: order independent list of two lists containing relevant relations
                    (e.g. [['nmod'], ['obj', 'nsubj']])
    :param overlap_func: func of the corresponding overlap pattern
    :return: True if a match is found, False otherwise (never None)
    """
    h_rel = dep_h[1]
    h_in_first = h_rel in rel_rels[0]
    h_in_second = h_rel in rel_rels[1]

    # The rule does not cover the relation of the hypothesis triple at all
    if not (h_in_first or h_in_second):
        return False

    for dep_t in t_deps:
        t_rel = dep_t[1]
        # one direction, e.g. H(Y, 'nmod', X) and T(X, 'obj', Y), or
        # the opposite direction, e.g. H(X, 'obj', Y) and T(Y, 'nmod', X)
        relations_pair_up = (h_in_first and t_rel in rel_rels[1]) or \
                            (h_in_second and t_rel in rel_rels[0])
        if relations_pair_up and overlap_func(dep_t, dep_h):
            return True

    # Explicit False: falling off the end used to yield None here
    return False

### Preprocessing triplets
Several preprocessing steps have been tested that could increase accuracy for both corpora. Only the removal of insignificant relations has been retained.

In [5]:
def process_deps(entry):
    """
    Drops the dependency triples whose relation lowers accuracy from an appended pair.

    :param entry: value in the inference dict
    :return: the remaining triples of the text and of the hypothesis (two lists)
    """
    # Relations that carry no signal for the rules
    insignificant = ('ROOT', 'aux:pass', 'aux', 'punct')

    def keep_significant(deps):
        kept = []
        for dep in deps:
            if dep[1] in insignificant:
                continue
            kept.append(dep)
        return kept

    return keep_significant(entry['t_deps']), keep_significant(entry['h_deps'])

### Building a development function for manual rules

In [6]:
import itertools
import numpy as np
from sklearn.metrics import accuracy_score
import sys


def dev_base_rules(rules, dev_dict):
    """
    Varies the score threshold to obtain the optimal accuracy for a given set of rules.

    The score of a pair is the summed weight of the rules matched by its hypothesis triples,
    divided by the number of hypothesis triples. Pairs scoring at least the threshold are
    predicted "YES", all others "NO".

    :param rules: list of rules, where rule[0] is an order independent list of two lists containing
                  equivalent relations (e.g. [['nmod'], ['obj', 'nsubj']]), rule[1] is the func of the
                  corresponding overlap pattern and rule[2] is the weight added on a match
    :param dev_dict: appended inference dict
    :return: the optimal threshold and its corresponding relative accuracy and number of correct pairs
    """
    scores, gold = [], []
    for entry in dev_dict.values():
        t_deps, h_deps = process_deps(entry)
        t_tokens, h_tokens = entry['t_tokens'], entry['h_tokens']

        # These checks are the same for every triple of the pair, so they are done once
        t_negated = 'geen' in t_tokens or 'niet' in t_tokens
        h_negated = 'geen' in h_tokens or 'niet' in h_tokens
        has_aan_het = 'aan het' in ' '.join(t_tokens) or 'aan het' in ' '.join(h_tokens)

        score = 0
        # Custom rule 4 (negation): a negation on only one side keeps the score at 0
        if t_negated == h_negated:
            for dep_h in h_deps:
                # Apply custom rule 3 (aan het ..)
                if has_aan_het and apply_rule(t_deps, dep_h, [['nmod'], ['obj']], cross_full_overlap):
                    score += 1
                    continue

                # Apply manual rules (only the first matching rule counts)
                for rule in rules:
                    if apply_rule(t_deps, dep_h, rule[0], rule[1]):
                        score += rule[2]
                        break

        # A hypothesis without significant triples scores 0 instead of dividing by zero
        scores.append(score / len(h_deps) if h_deps else 0.0)
        gold.append(entry['entailment_label'])

    rounded_scores = [round(score, 3) for score in scores]

    # Get optimal threshold; the first threshold is always accepted, so the
    # returned names are bound even when the accuracy never exceeds 0
    optimal_threshold_sys, max_accuracy_sys, max_correct_sys = None, 0, 0
    for threshold in np.arange(0.00, 1.02, 0.01):
        threshold = round(threshold, 2)
        pred = ["YES" if score >= threshold else "NO" for score in rounded_scores]

        accuracy_threshold = round(accuracy_score(gold, pred), 4)
        if optimal_threshold_sys is None or accuracy_threshold > max_accuracy_sys:
            optimal_threshold_sys = threshold
            max_accuracy_sys = accuracy_threshold
            max_correct_sys = accuracy_score(gold, pred, normalize=False)

    return optimal_threshold_sys, max_accuracy_sys, max_correct_sys

In [7]:
# All relevant relations; each one is paired with itself for rule 1
relations = [
    'obl', 'acl:relcl', 'expl:pv', 'advmod', 'parataxis', 'conj', 'advcl', 'appos',
    'ccomp', 'nummod', 'obl:agent', 'nmod', 'fixed', 'compound:prt', 'nsubj:pass',
    'xcomp', 'csubj', 'acl', 'nmod:poss', 'nsubj', 'obj', 'cc', 'case', 'cop',
    'iobj', 'mark', 'det', 'amod', 'flat',
]

# Rule 1 (perfect overlap): same relation on both sides, head and dependent both match
perfect_overlap_rules = [[[[rel], [rel]], linear_full_overlap, 1.0] for rel in relations]

# Rule 2 (passive)
passive_rules = [
    [[['nsubj:pass'], ['obj', 'obl']], linear_full_overlap, 1.0],
    [[['nsubj:pass'], ['nmod']], cross_full_overlap, 1.0],
    [[['obl:agent'], ['nsubj']], linear_full_overlap, 1.0],
]

# Rule 3 (aan het ..) and rule 4 (negation) are applied inside the functions
manual_rules = perfect_overlap_rules + passive_rules

threshold_sicknl, accuracy_sicknl, correct_sicknl = dev_base_rules(
    manual_rules, dev_dict_sicknl)
threshold_rte3, accuracy_rte3, correct_rte3 = dev_base_rules(
    manual_rules, dev_dict_rte3)
threshold_merged, accuracy_merged, correct_merged = dev_base_rules(
    manual_rules, dev_dict_merged)

print('Highest accuracy SICK-NL:', accuracy_sicknl,
      'given score threshold:', threshold_sicknl)
print('Highest accuracy RTE-3:', accuracy_rte3,
      'given score threshold:', threshold_rte3)
print('Highest accuracy SICK-NL ∪ RTE-3:', accuracy_merged,
      'given score threshold:', threshold_merged)

Highest accuracy SICK-NL: 0.7791 given score threshold: 0.32
Highest accuracy RTE-3: 0.6094 given score threshold: 0.17
Highest accuracy SICK-NL ∪ RTE-3: 0.7413 given score threshold: 0.28


### Building a development function for automated rules

In [8]:
def dev_addit_rules(relations, verified_rules, base_accuracy, base_correct, dev_dict, base_threshold=None):
    """
    Varies the score threshold to obtain the optimal accuracy for a given set of rules that consist of
    1) the manual developed rules
    2) the already proven automated rules (if any), and
    3) the new automated rule to be tested.

    For every pair of relations, each overlap function and each weight is tried. The best
    combination is added to the rules if it beats the best accuracy obtained so far.

    :param relations: considered dependency relations
    :param verified_rules: manually selected rules; this list is extended in place with every
                           accepted rule [relation pair, overlap func, weight, gained correct pairs]
    :param base_accuracy: relative accuracy of the manual rules on dev (on which to improve on)
    :param base_correct: absolute accuracy of the manual rules on dev (on which to improve on)
    :param dev_dict: appended inference dev dict
    :param base_threshold: score threshold belonging to base_accuracy; returned unchanged if no
                           candidate rule improves the accuracy (defaults to None)
    :return: the appended set of rules, new optimal threshold and relative accuracy
    """

    # Make all possible relation pairs for each overlap function
    relation_pairs = [[i, i] for i in relations] + list(map(list, itertools.combinations(relations, 2)))
    overlap_funcs = (linear_head_overlap, linear_dependent_overlap, cross_partial_overlap_1,
                     cross_partial_overlap_2, linear_full_overlap, cross_full_overlap)
    weights = [round(weight, 1) for weight in np.arange(0.1, 1.1, 0.1)]
    # Increase speed by using a smaller threshold interval
    thresholds = [round(threshold, 2) for threshold in np.arange(0.2, 0.7, 0.01)]
    total_candidates = len(relation_pairs) * len(overlap_funcs) * len(weights)

    # Everything that only depends on the pair itself is computed once instead of per candidate
    prepared, gold = [], []
    for entry in dev_dict.values():
        t_deps, h_deps = process_deps(entry)
        t_tokens, h_tokens = entry['t_tokens'], entry['h_tokens']
        t_negated = 'geen' in t_tokens or 'niet' in t_tokens
        h_negated = 'geen' in h_tokens or 'niet' in h_tokens
        has_aan_het = 'aan het' in ' '.join(t_tokens) or 'aan het' in ' '.join(h_tokens)
        prepared.append((t_deps, h_deps, t_negated != h_negated, has_aan_het))
        gold.append(entry['entailment_label'])

    progress = 0
    max_corr_sys = base_correct
    max_accuracy_sys = base_accuracy
    # Bound up front so the return below cannot fail when no candidate is accepted
    optimal_threshold_sys = base_threshold
    for relation_pair in relation_pairs:
        rule = [[relation_pair[0]], [relation_pair[1]]]
        max_accuracy_rule = 0
        for overlap_func in overlap_funcs:
            for weight in weights:
                progress += 1
                scores = []
                for t_deps, h_deps, negation_mismatch, has_aan_het in prepared:
                    score = 0
                    # Custom rule 4 (negation): a negation on only one side keeps the score at 0
                    if not negation_mismatch:
                        for dep_h in h_deps:
                            # Apply custom rule 3 (aan het ..)
                            if has_aan_het and apply_rule(t_deps, dep_h, [['nmod'], ['obj']], cross_full_overlap):
                                score += 1
                                continue

                            # Apply found rules (only the first matching rule counts)
                            for found_rule in verified_rules:
                                if apply_rule(t_deps, dep_h, found_rule[0], found_rule[1]):
                                    score += found_rule[2]
                                    break

                            # Add current rule
                            # NOTE(review): the candidate is added on top of a matching found rule,
                            # whereas an accepted rule is later applied first-match-only at the end
                            # of the rule list; confirm that this difference is intended.
                            if apply_rule(t_deps, dep_h, rule, overlap_func):
                                score += weight

                    # A hypothesis without significant triples scores 0 instead of dividing by zero
                    scores.append(score / len(h_deps) if h_deps else 0.0)

                rounded_scores = [round(score, 3) for score in scores]

                # Enter dev loop
                for threshold in thresholds:
                    # Predict labels
                    pred = ["YES" if score >= threshold else "NO" for score in rounded_scores]

                    # Remember the best variant of the current relation pair
                    accuracy_rule = round(accuracy_score(gold, pred), 4)
                    if accuracy_rule >= max_accuracy_rule:
                        optimal_weight = weight
                        optimal_threshold_rule = threshold
                        max_accuracy_rule = accuracy_rule
                        max_correct_rule = accuracy_score(gold, pred, normalize=False)
                        improving_rule = [rule] + [overlap_func] + [optimal_weight]

                # Print progress
                if progress % 1000 == 0:
                    print('Completed rule', progress, 'of', total_candidates, file=sys.stderr)

        # Check if rule increases accuracy
        if max_accuracy_rule > max_accuracy_sys:
            verified_rules.append(improving_rule)
            max_accuracy_sys = max_accuracy_rule
            optimal_threshold_sys = optimal_threshold_rule
            print("Rule {} {} {} obtained {} addditional correct pair(s) (acc: {}, T: {})".format(
                      improving_rule[0], improving_rule[1].__name__, improving_rule[2],
                      max_correct_rule - max_corr_sys, max_accuracy_sys, optimal_threshold_sys),
                  file=sys.stderr)
            # Store the gain with the rule; it is used later on to filter rules by their gain
            verified_rules[-1].append(max_correct_rule - max_corr_sys)
            max_corr_sys = max_correct_rule

    return verified_rules, optimal_threshold_sys, max_accuracy_sys

### Developing on SICK-NL, RTE-3 and the merged dataset

In [9]:
# Extend a shallow copy of the manual rules with automated rules found on the SICK-NL dev set
rules_sicknl, threshold_sicknl, accuracy_sicknl = dev_addit_rules(
    relations,
    list(manual_rules),
    accuracy_sicknl,
    correct_sicknl,
    dev_dict_sicknl,
)

Rule [['obl'], ['obl']] cross_partial_overlap_2 1.0 obtained 2 addditional correct pair(s) (acc: 0.7797, T: 0.33)
Rule [['advmod'], ['advmod']] linear_dependent_overlap 1.0 obtained 1 addditional correct pair(s) (acc: 0.78, T: 0.33)
Rule [['conj'], ['conj']] cross_partial_overlap_2 1.0 obtained 3 addditional correct pair(s) (acc: 0.7809, T: 0.33)
Rule [['nummod'], ['nummod']] linear_dependent_overlap 0.9 obtained 3 addditional correct pair(s) (acc: 0.7817, T: 0.33)
Completed rule 1000 of 26100
Rule [['obj'], ['obj']] linear_dependent_overlap 0.8 obtained 3 addditional correct pair(s) (acc: 0.7826, T: 0.37)
Completed rule 2000 of 26100
Rule [['obl'], ['obj']] linear_dependent_overlap 0.9 obtained 3 addditional correct pair(s) (acc: 0.7835, T: 0.37)
Completed rule 3000 of 26100
Rule [['obl'], ['case']] linear_head_overlap 1.0 obtained 10 addditional correct pair(s) (acc: 0.7865, T: 0.37)
Rule [['obl'], ['cop']] cross_partial_overlap_2 1.0 obtained 3 addditional correct pair(s) (acc: 0.78

In [10]:
# Extend a shallow copy of the manual rules with automated rules found on the RTE-3 dev set
rules_rte3, threshold_rte3, accuracy_rte3 = dev_addit_rules(
    relations,
    list(manual_rules),
    accuracy_rte3,
    correct_rte3,
    dev_dict_rte3,
)

Rule [['conj'], ['conj']] cross_full_overlap 1.0 obtained 1 addditional correct pair(s) (acc: 0.6105, T: 0.2)
Rule [['advcl'], ['advcl']] linear_dependent_overlap 1.0 obtained 1 addditional correct pair(s) (acc: 0.6116, T: 0.2)
Rule [['appos'], ['appos']] linear_dependent_overlap 1.0 obtained 1 addditional correct pair(s) (acc: 0.6127, T: 0.2)
Rule [['nummod'], ['nummod']] linear_dependent_overlap 0.9 obtained 2 addditional correct pair(s) (acc: 0.6148, T: 0.22)
Rule [['nmod'], ['nmod']] linear_dependent_overlap 0.1 obtained 1 addditional correct pair(s) (acc: 0.6159, T: 0.23)
Rule [['nsubj:pass'], ['nsubj:pass']] linear_dependent_overlap 1.0 obtained 1 addditional correct pair(s) (acc: 0.617, T: 0.2)
Completed rule 1000 of 26100
Rule [['obj'], ['obj']] linear_dependent_overlap 1.0 obtained 2 addditional correct pair(s) (acc: 0.6191, T: 0.23)
Rule [['case'], ['case']] linear_head_overlap 0.7 obtained 9 addditional correct pair(s) (acc: 0.6288, T: 0.27)
Rule [['flat'], ['flat']] linear_

In [11]:
# Extend a shallow copy of the manual rules with automated rules found on the merged dev set
rules_merged, threshold_merged, accuracy_merged = dev_addit_rules(
    relations,
    list(manual_rules),
    accuracy_merged,
    correct_merged,
    dev_dict_merged,
)

Rule [['obl'], ['obl']] cross_partial_overlap_1 1.0 obtained 2 addditional correct pair(s) (acc: 0.7418, T: 0.28)
Rule [['acl:relcl'], ['acl:relcl']] linear_head_overlap 1.0 obtained 1 addditional correct pair(s) (acc: 0.742, T: 0.28)
Rule [['conj'], ['conj']] cross_partial_overlap_1 0.7 obtained 3 addditional correct pair(s) (acc: 0.7427, T: 0.28)
Rule [['advcl'], ['advcl']] linear_dependent_overlap 1.0 obtained 1 addditional correct pair(s) (acc: 0.7429, T: 0.28)
Rule [['appos'], ['appos']] linear_full_overlap 0.7 obtained 1 addditional correct pair(s) (acc: 0.7432, T: 0.28)
Rule [['nummod'], ['nummod']] linear_dependent_overlap 0.5 obtained 2 addditional correct pair(s) (acc: 0.7436, T: 0.28)
Completed rule 1000 of 26100
Rule [['nmod:poss'], ['nmod:poss']] linear_full_overlap 0.9 obtained 1 addditional correct pair(s) (acc: 0.7439, T: 0.28)
Rule [['flat'], ['flat']] linear_dependent_overlap 0.9 obtained 7 addditional correct pair(s) (acc: 0.7455, T: 0.32)
Completed rule 2000 of 2610

In [13]:
# Report the dev accuracy and score threshold per corpus
for _label, _accuracy, _threshold in (
    ('Highest accuracy SICK-NL:', accuracy_sicknl, threshold_sicknl),
    ('Highest accuracy RTE-3:', accuracy_rte3, threshold_rte3),
    ('Highest accuracy SICK-NL ∪ RTE-3:', accuracy_merged, threshold_merged),
):
    print(_label, _accuracy, 'given score threshold:', _threshold)

print()

# Report how many rules were added on top of the manual rules
for _label, _rules in (
    ('Number of preliminary additional rules for SICK-NL:', rules_sicknl),
    ('Number of preliminary additional rules for RTE-3:', rules_rte3),
    ('Number of preliminary additional rules for SICK-NL ∪ RTE-3:', rules_merged),
):
    print(_label, len(_rules) - len(manual_rules))

Highest accuracy SICK-NL: 0.8054 given score threshold: 0.37
Highest accuracy RTE-3: 0.6545 given score threshold: 0.28
Highest accuracy SICK-NL ∪ RTE-3: 0.7557 given score threshold: 0.32

Number of preliminary additional rules for SICK-NL: 37
Number of preliminary additional rules for RTE-3: 19
Number of preliminary additional rules for SICK-NL ∪ RTE-3: 23


### Finding and applying rule threshold

In [14]:
def find_rule_threshold(rules, score_threshold, train_dict, rule_thresholds=range(1, 10)):
    """
    Varies the rule threshold from which to include rules to obtain the optimal accuracy on train
    to prevent overfitting.

    Rules with four elements carry their gain in rule[3] and are only applied if that gain is at
    least the rule threshold; rules with any other length are always applied.

    :param rules: manually selected appended by automatically selected rules
    :param score_threshold: optimal score threshold on dev
    :param train_dict: appended inference train dict
    :param rule_thresholds: the rule thresholds to try (defaults to 1 up to and including 9)
    :return: the optimal threshold for the rules and the relative accuracy
    """
    score_threshold = round(score_threshold, 2)

    # Everything that only depends on the pair itself is computed once instead of per threshold
    prepared, gold = [], []
    for entry in train_dict.values():
        t_deps, h_deps = process_deps(entry)
        t_tokens, h_tokens = entry['t_tokens'], entry['h_tokens']
        t_negated = 'geen' in t_tokens or 'niet' in t_tokens
        h_negated = 'geen' in h_tokens or 'niet' in h_tokens
        has_aan_het = 'aan het' in ' '.join(t_tokens) or 'aan het' in ' '.join(h_tokens)
        prepared.append((t_deps, h_deps, t_negated != h_negated, has_aan_het))
        gold.append(entry['entailment_label'])

    # The first threshold is always accepted, so the returned names are bound
    # even when the accuracy never exceeds 0
    optimal_threshold_rule, max_accuracy_sys = None, 0
    for rule_threshold in rule_thresholds:
        # All applicable rules for this threshold, in their original order
        active_rules = [rule for rule in rules if len(rule) != 4 or rule[3] >= rule_threshold]

        pred = []
        for t_deps, h_deps, negation_mismatch, has_aan_het in prepared:
            score = 0
            # Custom rule 4 (negation): a negation on only one side keeps the score at 0
            if not negation_mismatch:
                for dep_h in h_deps:
                    # Apply custom rule 3 (aan het ..)
                    if has_aan_het and apply_rule(t_deps, dep_h, [['nmod'], ['obj']], cross_full_overlap):
                        score += 1
                        continue

                    # Apply all applicable rules (only the first matching rule counts)
                    for rule in active_rules:
                        if apply_rule(t_deps, dep_h, rule[0], rule[1]):
                            score += rule[2]
                            break

            # A hypothesis without significant triples scores 0 instead of dividing by zero
            score = score / len(h_deps) if h_deps else 0.0
            pred.append("YES" if round(score, 3) >= score_threshold else "NO")

        # Check if threshold increases accuracy
        accuracy_threshold_rule = round(accuracy_score(gold, pred), 4)
        if optimal_threshold_rule is None or accuracy_threshold_rule > max_accuracy_sys:
            optimal_threshold_rule = rule_threshold
            max_accuracy_sys = accuracy_threshold_rule

    return optimal_threshold_rule, max_accuracy_sys

In [15]:
# Determine on train, per corpus, from which gain the automated rules are kept
sicknl_threshold_rules, sicknl_accuracy_train = find_rule_threshold(
    rules_sicknl, threshold_sicknl, train_dict_sicknl)
rte3_threshold_rules, rte3_accuracy_train = find_rule_threshold(
    rules_rte3, threshold_rte3, train_dict_rte3)
merged_threshold_rules, merged_accuracy_train = find_rule_threshold(
    rules_merged, threshold_merged, train_dict_merged)

for _label, _accuracy in (
    ('Highest accuracy SICK-NL:', sicknl_accuracy_train),
    ('Highest accuracy RTE-3:', rte3_accuracy_train),
    ('Highest accuracy SICK-NL ∪ RTE-3:', merged_accuracy_train),
):
    print(_label, _accuracy)

Highest accuracy SICK-NL: 0.7846
Highest accuracy RTE-3: 0.6506
Highest accuracy SICK-NL ∪ RTE-3: 0.7514


In [16]:
# Keep every rule without a stored gain (i.e. not of length 4) and every rule whose gain
# reaches the rule threshold; the gain itself is stripped from the kept rules
updated_rules_sicknl = [
    rule[:3] for rule in rules_sicknl
    if len(rule) != 4 or rule[3] >= sicknl_threshold_rules
]

updated_rules_rte3 = [
    rule[:3] for rule in rules_rte3
    if len(rule) != 4 or rule[3] >= rte3_threshold_rules
]

updated_rules_merged = [
    rule[:3] for rule in rules_merged
    if len(rule) != 4 or rule[3] >= merged_threshold_rules
]

In [17]:
# Save backup
# The files are opened in a with-block so they are flushed and closed right after writing.
# NOTE: the rules contain the overlap functions, which pickle stores by name only; the
# functions have to be defined under the same names wherever the rules are loaded again.
for _rules, _path in (
    (updated_rules_sicknl, "../data/rules/sicknl.p"),
    (updated_rules_rte3, "../data/rules/rte3.p"),
    (updated_rules_merged, "../data/rules/merged.p"),
):
    with open(_path, "wb") as _rules_file:
        pickle.dump(_rules, _rules_file)

In [18]:
# Report how many automated rules survived the rule threshold per corpus
for _label, _rules, _rule_threshold in (
    ('Number of definitive additional rules for SICK-NL:', updated_rules_sicknl, sicknl_threshold_rules),
    ('Number of definitive additional rules for RTE-3:', updated_rules_rte3, rte3_threshold_rules),
    ('Number of definitive additional rules for SICK-NL ∪ RTE-3:', updated_rules_merged, merged_threshold_rules),
):
    print(_label, len(_rules) - len(manual_rules), 'given rule threshold:', _rule_threshold)

Number of definitive additional rules for SICK-NL: 37 given rule threshold: 1
Number of definitive additional rules for RTE-3: 6 given rule threshold: 3
Number of definitive additional rules for SICK-NL ∪ RTE-3: 23 given rule threshold: 1


### Evaluating

In [19]:
from sklearn.metrics import classification_report

def eval_system(rules, threshold, test_dict, extended=False):
    """
    Reports accuracy of a given threshold for the system on a test set.

    The score of a pair is the summed weight of the rules matched by its hypothesis triples,
    divided by the number of hypothesis triples. Pairs scoring at least the threshold are
    predicted "YES", all others "NO".

    :param rules: list of rules, where rule[0] is an order independent list of two lists containing
                  equivalent relations (e.g. [['nmod'], ['obj', 'nsubj']]), rule[1] is the func of the
                  corresponding overlap pattern and rule[2] is the weight added on a match
    :param threshold: threshold from which pairs are labeled as entailment
    :param test_dict: inference dict to evaluate on
    :param extended: prints accuracy if False, full report if True
    :return: None; the result is only printed
    """
    scores, gold = [], []
    for entry in test_dict.values():
        t_deps, h_deps = process_deps(entry)
        t_tokens, h_tokens = entry['t_tokens'], entry['h_tokens']

        # These checks are the same for every triple of the pair, so they are done once
        t_negated = 'geen' in t_tokens or 'niet' in t_tokens
        h_negated = 'geen' in h_tokens or 'niet' in h_tokens
        has_aan_het = 'aan het' in ' '.join(t_tokens) or 'aan het' in ' '.join(h_tokens)

        score = 0
        # Custom rule 4 (negation): a negation on only one side keeps the score at 0
        if t_negated == h_negated:
            for dep_h in h_deps:
                # Apply custom rule 3 (aan het ..)
                if has_aan_het and apply_rule(t_deps, dep_h, [['nmod'], ['obj']], cross_full_overlap):
                    score += 1
                    continue

                # Apply the given rules (only the first matching rule counts)
                for rule in rules:
                    if apply_rule(t_deps, dep_h, rule[0], rule[1]):
                        score += rule[2]
                        break

        # A hypothesis without significant triples scores 0 instead of dividing by zero
        scores.append(score / len(h_deps) if h_deps else 0.0)
        gold.append(entry['entailment_label'])

    # Evaluate
    pred = ["YES" if round(score, 3) >= threshold else "NO" for score in scores]

    if not extended:
        print("accuracy: ", round(accuracy_score(gold, pred), 3))
    else:
        print(classification_report(gold, pred, digits=3))

In [20]:
# Evaluate the SICK-NL system on all three test sets (full report for its own corpus)
eval_system(rules=updated_rules_sicknl, threshold=threshold_sicknl,
            test_dict=test_dict_sicknl, extended=True)
eval_system(rules=updated_rules_sicknl, threshold=threshold_sicknl,
            test_dict=test_dict_rte3)
eval_system(rules=updated_rules_sicknl, threshold=threshold_sicknl,
            test_dict=test_dict_merged)

              precision    recall  f1-score   support

          NO      0.786     0.764     0.775       564
         YES      0.771     0.793     0.781       564

    accuracy                          0.778      1128
   macro avg      0.779     0.778     0.778      1128
weighted avg      0.779     0.778     0.778      1128

accuracy:  0.58
accuracy:  0.735


In [21]:
# Evaluate the RTE-3 system on all three test sets (full report for its own corpus)
eval_system(rules=updated_rules_rte3, threshold=threshold_rte3,
            test_dict=test_dict_sicknl)
print()
eval_system(rules=updated_rules_rte3, threshold=threshold_rte3,
            test_dict=test_dict_rte3, extended=True)
eval_system(rules=updated_rules_rte3, threshold=threshold_rte3,
            test_dict=test_dict_merged)

accuracy:  0.754

              precision    recall  f1-score   support

          NO      0.580     0.673     0.623       156
         YES      0.611     0.513     0.557       156

    accuracy                          0.593       312
   macro avg      0.595     0.593     0.590       312
weighted avg      0.595     0.593     0.590       312

accuracy:  0.719


In [22]:
# Evaluate the merged system on all three test sets (full report for its own corpus)
eval_system(rules=updated_rules_merged, threshold=threshold_merged,
            test_dict=test_dict_sicknl)
eval_system(rules=updated_rules_merged, threshold=threshold_merged,
            test_dict=test_dict_rte3)
print()
eval_system(rules=updated_rules_merged, threshold=threshold_merged,
            test_dict=test_dict_merged, extended=True)

accuracy:  0.762
accuracy:  0.574

              precision    recall  f1-score   support

          NO      0.727     0.710     0.718       720
         YES      0.716     0.733     0.725       720

    accuracy                          0.722      1440
   macro avg      0.722     0.722     0.721      1440
weighted avg      0.722     0.722     0.721      1440

