# Developing the cosine-based systems

In [1]:
from sklearn import preprocessing


def predict(inference_dict, threshold):
    """
    Predicts entailment labels from the cosine similarity of bag-of-words vectors.

    For every pair, the tokens of the text 't' and the hypothesis 'h' are counted
    over their joint vocabulary; the pair is labeled "YES" when the cosine
    similarity of the two count vectors (rounded to 3 decimals) reaches
    `threshold`, and "NO" otherwise. The similarity is forced to 0 when exactly
    one side of the pair contains a negation ('geen' or 'niet').

    :param inference_dict: inference dict to develop or test on; every value must
        hold the strings 't' and 'h' and the gold label 'entailment_label'
    :param threshold: threshold at or above which pairs are labeled as entailment
    :return: list of predicted labels and list of gold labels
    """
    # NOTE: `cosine_similarity` is imported in a later cell of this notebook;
    # that cell has to be executed before this function is called.
    negations = ('geen', 'niet')

    pred, gold = [], []
    for feats in inference_dict.values():
        t_tokens = feats['t'].split()
        h_tokens = feats['h'].split()

        # Convert e.g. 'volleybal' in t to 'bal' in t if 'bal' in h.
        # Any substring match counts here, not only a shared word ending.
        for h_token in h_tokens:
            for i, t_token in enumerate(t_tokens):
                if h_token != t_token and h_token in t_token:
                    t_tokens[i] = h_token

        # Create BoW over the joint vocabulary of the pair
        th_tokens = set(t_tokens + h_tokens)
        if th_tokens:
            t_vector = [t_tokens.count(th_token) for th_token in th_tokens]
            h_vector = [h_tokens.count(th_token) for th_token in th_tokens]
            t_vector = preprocessing.normalize([t_vector], norm='l2')
            h_vector = preprocessing.normalize([h_vector], norm='l2')

            # Calculate cosine similarity
            cosim = round(cosine_similarity(t_vector, h_vector).item(), 3)
        else:
            # Both sides are empty: there are zero features, which scikit-learn
            # rejects. Treat the pair like any other pair with an all-zero
            # vector, i.e. similarity 0.
            cosim = 0.0

        # A negation on exactly one side overrules the lexical overlap
        t_negated = any(negation in t_tokens for negation in negations)
        h_negated = any(negation in h_tokens for negation in negations)
        if t_negated != h_negated:
            cosim = 0

        # Make prediction
        if cosim >= threshold:
            pred.append("YES")
        else:
            pred.append("NO")
        gold.append(feats['entailment_label'])

    return pred, gold

In [2]:
import sys
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


def dev(dev_dict):
    """
    Reports accuracy on different thresholds for the cosine similarity on a dev set.

    Thresholds from 0.00 up to 1.01 are tried in steps of 0.01; one line per
    threshold (threshold, accuracy, macro-averaged F1) is written to stderr.

    :param dev_dict: inference dict to develop on
    :return: threshold where accuracy is highest and the corresponding accuracy;
        ties in accuracy are broken by the higher macro-averaged F1, remaining
        ties by the lowest threshold
    """

    print("{:<15}{:<15}{}".format('threshold', 'accuracy', 'F1 (macro-averaged)'), file=sys.stderr)
    # Start below every attainable score so that the first threshold always
    # becomes the initial optimum. This keeps `max_f1` and `optimal_threshold`
    # bound even when the first accuracy is 0.
    max_accuracy = -1.0
    max_f1 = -1.0
    optimal_threshold = None
    # Enter dev loop
    for threshold in np.arange(0.00, 1.02, 0.01):
        # Strip the floating-point noise that accumulates in the float range
        threshold = round(threshold, 2)

        pred, gold = predict(dev_dict, threshold)

        accuracy = round(accuracy_score(gold, pred), 3)
        f1 = round(f1_score(gold, pred, average='macro'), 3)
        if accuracy > max_accuracy or (accuracy == max_accuracy and f1 > max_f1):
            max_accuracy = accuracy
            max_f1 = f1
            optimal_threshold = threshold

        print("{:<15}{:<15}{}".format(threshold, accuracy, f1), file=sys.stderr)

    return optimal_threshold, max_accuracy

In [3]:
from sklearn.metrics import classification_report

def evaluate(threshold, test_dict, extended=False):
    """
    Prints evaluation metrics of the cosine-similarity system on an inference test set.

    The number of positive and negative predictions is always printed first.

    :param threshold: threshold at or above which pairs are labeled as entailment
    :param test_dict: dict to evaluate on
    :param extended: prints accuracy and macro-averaged F1 if False,
        the full classification report if True
    """

    predicted, gold_labels = predict(test_dict, threshold)

    # Every label other than 'YES' counts as a negative prediction
    n_positive = sum(1 for label in predicted if label == 'YES')
    n_negative = len(predicted) - n_positive
    print("# of pairs predicted positive and negative respectively: ", n_positive, ",", n_negative)

    if extended:
        print(classification_report(gold_labels, predicted, digits=3))
        return

    print("{:<15}{}".format('accuracy', 'F1 (macro-averaged)'))
    accuracy = round(accuracy_score(gold_labels, predicted), 3)
    macro_f1 = round(f1_score(gold_labels, predicted, average='macro'), 3)
    print("{:<15}{}".format(accuracy, macro_f1))

## Preprocessing SICK-NL and RTE-3
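Both datasets are preprocessed with the `preprocess` function below: punctuation and stop words are removed (the negations *geen* and *niet* are kept) and the remaining tokens are lemmatised.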

In [4]:
def preprocess(doc, nlp):
    """
    Strips punctuation and stop words from a document and lemmatises what is left.

    The negations "geen" and "niet" are first taken out of the pipeline's default
    stop-word set so that they survive the filtering; this change to
    `nlp.Defaults.stop_words` persists after the call.

    :param doc: document string
    :param nlp: SpaCy pipeline
    :return: the preprocessed document (kept lemmas joined by single spaces)
    """

    nlp.Defaults.stop_words -= {"geen", "niet"}
    stop_words = nlp.Defaults.stop_words

    # Drop punctuation first, then filter on the lemma rather than the surface form
    lemmas = (token.lemma_ for token in nlp(doc) if not token.is_punct)
    return " ".join(lemma for lemma in lemmas if lemma not in stop_words)

In [5]:
import pickle
import spacy

# spaCy pipeline 'nl_core_news_lg'; it is handed to `preprocess` to tokenise and lemmatise every sentence
nlp = spacy.load('nl_core_news_lg')

### Preprocessing SICK-NL

In [6]:
# Load the pickled SICK-NL splits. The files are opened in a `with` block so the
# handles are closed again.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open("../data/sicknl/dev.p", "rb") as dev_file:
    dev_dict_sicknl = pickle.load(dev_file)
with open("../data/sicknl/test.p", "rb") as test_file:
    test_dict_sicknl = pickle.load(test_file)

# Replace text and hypothesis of every pair by their preprocessed form (in place)
for sicknl_dict in (dev_dict_sicknl, test_dict_sicknl):
    for pair_id, feats in sicknl_dict.items():
        feats['t'] = preprocess(feats['t'], nlp)
        feats['h'] = preprocess(feats['h'], nlp)

### Preprocessing RTE-3

In [7]:
# Load the pickled RTE-3 splits. The files are opened in a `with` block so the
# handles are closed again.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open("../data/rte3/dev.p", "rb") as dev_file:
    dev_dict_rte3 = pickle.load(dev_file)
with open("../data/rte3/test.p", "rb") as test_file:
    test_dict_rte3 = pickle.load(test_file)

# Replace text and hypothesis of every pair by their preprocessed form (in place)
for rte3_dict in (dev_dict_rte3, test_dict_rte3):
    for pair_id, feats in rte3_dict.items():
        feats['t'] = preprocess(feats['t'], nlp)
        feats['h'] = preprocess(feats['h'], nlp)

### Merging SICK-NL and RTE-3

In [8]:
# The two datasets may use the same pair ids, so the merged dev set is re-keyed
# with consecutive ids starting at 1: RTE-3 pairs first, then SICK-NL pairs.
dev_dict_merged = {}
for feats in list(dev_dict_rte3.values()) + list(dev_dict_sicknl.values()):
    dev_dict_merged[len(dev_dict_merged) + 1] = feats

# Same re-keying for the merged test set
test_dict_merged = {}
for feats in list(test_dict_rte3.values()) + list(test_dict_sicknl.values()):
    test_dict_merged[len(test_dict_merged) + 1] = feats

# Number of pairs in the merged test set
length = len(test_dict_merged)

## Developing on SICK-NL

In [9]:
# Tune the threshold on the SICK-NL dev set. `threshold` is reused by the
# evaluation cells that follow, so this cell must be run before them.
threshold, accuracy = dev(dev_dict_sicknl)

threshold      accuracy       F1 (macro-averaged)
0.0            0.5            0.333
0.01           0.626          0.571
0.02           0.626          0.571
0.03           0.626          0.571
0.04           0.626          0.571
0.05           0.626          0.571
0.06           0.626          0.571
0.07           0.626          0.571
0.08           0.626          0.571
0.09           0.626          0.571
0.1            0.626          0.571
0.11           0.627          0.572
0.12           0.628          0.573
0.13           0.629          0.575
0.14           0.629          0.575
0.15           0.632          0.58
0.16           0.634          0.584
0.17           0.639          0.591
0.18           0.641          0.594
0.19           0.651          0.608
0.2            0.651          0.608
0.21           0.66           0.621
0.22           0.66           0.622
0.23           0.671          0.638
0.24           0.675          0.643
0.25           0.675          0.644
0.26           

In [10]:
# Show the threshold selected on the SICK-NL dev set and the dev accuracy it reached
print('Optimal threshold: ', threshold)
print('Highest accuracy: ', accuracy)

Optimal threshold:  0.63
Highest accuracy:  0.785


### Evaluating on SICK-NL

In [11]:
# In-domain test: threshold tuned on SICK-NL dev, full per-class report
evaluate(threshold, test_dict_sicknl, extended=True)

# of pairs predicted positive and negative respectively:  549 , 579
              precision    recall  f1-score   support

          NO      0.756     0.777     0.766       564
         YES      0.770     0.750     0.760       564

    accuracy                          0.763      1128
   macro avg      0.763     0.763     0.763      1128
weighted avg      0.763     0.763     0.763      1128



### Evaluating on RTE-3

In [12]:
# Cross-domain test: threshold tuned on SICK-NL dev, applied to the RTE-3 test set
evaluate(threshold, test_dict_rte3)

# of pairs predicted positive and negative respectively:  21 , 291
accuracy       F1 (macro-averaged)
0.542          0.436


### Evaluating on SICK-NL ∪ RTE-3

In [13]:
# Threshold tuned on SICK-NL dev, applied to the merged SICK-NL + RTE-3 test set
evaluate(threshold, test_dict_merged)

# of pairs predicted positive and negative respectively:  570 , 870
accuracy       F1 (macro-averaged)
0.715          0.712


## Developing on RTE-3

In [14]:
# Tune the threshold on the RTE-3 dev set. This overwrites `threshold` and
# `accuracy` from the previous section; the evaluation cells that follow use the new values.
threshold, accuracy = dev(dev_dict_rte3)

threshold      accuracy       F1 (macro-averaged)
0.0            0.5            0.333
0.01           0.53           0.452
0.02           0.53           0.452
0.03           0.53           0.452
0.04           0.53           0.452
0.05           0.53           0.452
0.06           0.53           0.452
0.07           0.531          0.455
0.08           0.534          0.46
0.09           0.541          0.47
0.1            0.541          0.473
0.11           0.553          0.493
0.12           0.562          0.508
0.13           0.565          0.516
0.14           0.57           0.525
0.15           0.573          0.531
0.16           0.579          0.54
0.17           0.586          0.55
0.18           0.588          0.557
0.19           0.588          0.559
0.2            0.595          0.569
0.21           0.6            0.577
0.22           0.607          0.588
0.23           0.622          0.607
0.24           0.627          0.613
0.25           0.636          0.625
0.26           0.6

In [15]:
# Show the threshold selected on the RTE-3 dev set and the dev accuracy it reached
print('Optimal threshold: ', threshold)
print('Highest accuracy: ', accuracy)

Optimal threshold:  0.36
Highest accuracy:  0.637


### Evaluating on SICK-NL

In [16]:
# Cross-domain test: threshold tuned on RTE-3 dev, applied to the SICK-NL test set
evaluate(threshold, test_dict_sicknl)

# of pairs predicted positive and negative respectively:  804 , 324
accuracy       F1 (macro-averaged)
0.725          0.712


### Evaluating on RTE-3

In [17]:
# In-domain test: threshold tuned on RTE-3 dev, full per-class report
evaluate(threshold, test_dict_rte3, extended=True)

# of pairs predicted positive and negative respectively:  138 , 174
              precision    recall  f1-score   support

          NO      0.615     0.686     0.648       156
         YES      0.645     0.571     0.605       156

    accuracy                          0.628       312
   macro avg      0.630     0.628     0.627       312
weighted avg      0.630     0.628     0.627       312



### Evaluating on SICK-NL ∪ RTE-3

In [18]:
# Threshold tuned on RTE-3 dev, applied to the merged SICK-NL + RTE-3 test set
evaluate(threshold, test_dict_merged)

# of pairs predicted positive and negative respectively:  942 , 498
accuracy       F1 (macro-averaged)
0.704          0.697


## Developing on SICK-NL ∪ RTE-3

In [19]:
# Tune the threshold on the merged SICK-NL + RTE-3 dev set. This overwrites `threshold`
# and `accuracy` from the previous section; the evaluation cells that follow use the new values.
threshold, accuracy = dev(dev_dict_merged)

threshold      accuracy       F1 (macro-averaged)
0.0            0.5            0.333
0.01           0.605          0.545
0.02           0.605          0.545
0.03           0.605          0.545
0.04           0.605          0.545
0.05           0.605          0.545
0.06           0.605          0.545
0.07           0.606          0.546
0.08           0.606          0.547
0.09           0.608          0.549
0.1            0.608          0.55
0.11           0.611          0.555
0.12           0.614          0.559
0.13           0.615          0.562
0.14           0.616          0.564
0.15           0.619          0.57
0.16           0.623          0.574
0.17           0.628          0.582
0.18           0.63           0.586
0.19           0.638          0.598
0.2            0.639          0.6
0.21           0.647          0.612
0.22           0.649          0.615
0.23           0.66           0.632
0.24           0.664          0.637
0.25           0.667          0.641
0.26           0.6

In [20]:
# Show the threshold selected on the merged dev set and the dev accuracy it reached
print('Optimal threshold: ', threshold)
print('Highest accuracy: ', accuracy)

Optimal threshold:  0.56
Highest accuracy:  0.732


### Evaluating on SICK-NL

In [21]:
# Threshold tuned on the merged dev set, applied to the SICK-NL test set only
evaluate(threshold, test_dict_sicknl)

# of pairs predicted positive and negative respectively:  628 , 500
accuracy       F1 (macro-averaged)
0.775          0.774


### Evaluating on RTE-3

In [22]:
# Threshold tuned on the merged dev set, applied to the RTE-3 test set only
evaluate(threshold, test_dict_rte3)

# of pairs predicted positive and negative respectively:  32 , 280
accuracy       F1 (macro-averaged)
0.532          0.444


### Evaluating on SICK-NL ∪ RTE-3

In [23]:
# In-domain test: threshold tuned on the merged dev set, full per-class report
evaluate(threshold, test_dict_merged, extended=True)

# of pairs predicted positive and negative respectively:  660 , 780
              precision    recall  f1-score   support

          NO      0.705     0.764     0.733       720
         YES      0.742     0.681     0.710       720

    accuracy                          0.722      1440
   macro avg      0.724     0.722     0.722      1440
weighted avg      0.724     0.722     0.722      1440

