# Preprocessing the SICK-NL corpus
This script preprocesses SICK-NL and serialises the result as dev, train and test splits.

## Inspecting the SICK-NL dataset

In [1]:
import csv

def get_original_distribution(sicknl_tsv):
    """
    Retrieves the distribution of entailment labels over 3 splits

    The label is read from the 4th column and the split from the last column.
    Any label other than ENTAILMENT or NEUTRAL is counted as contradiction,
    and any split other than TRAIN or TRIAL is counted as test. An empty file
    yields all zeros; blank lines are ignored.

    :param sicknl_tsv: path to the original SICK-NL tsv
    :return: the distribution of entailment labels over 3 splits, as a 9-tuple
        ordered (entailment, neutral, contradiction) for train, then trial,
        then test
    """
    labels = ("ENTAILMENT", "NEUTRAL", "CONTRADICTION")
    splits = ("TRAIN", "TRIAL", "TEST")
    counts = {(label, split): 0 for split in splits for label in labels}

    # Decode explicitly as UTF-8 instead of relying on the platform locale;
    # newline="" is what the csv module recommends for file objects.
    with open(sicknl_tsv, "r", encoding="utf-8", newline="") as infile:
        reader = csv.reader(infile, delimiter="\t")
        next(reader, None)  # skip the header without failing on an empty file
        for row in reader:
            if not row:  # csv.reader yields [] for blank lines
                continue
            # Unknown values fall into the last bucket, as in the if/elif/else chain
            label = row[3] if row[3] in labels[:2] else labels[2]
            split = row[-1] if row[-1] in splits[:2] else splits[2]
            counts[label, split] += 1

    return tuple(counts[label, split] for split in splits for label in labels)

In [2]:
# Counts are ordered (entailment, neutral, contradiction) for train, then trial, then test
get_original_distribution('sicknl.tsv')

(1274, 2524, 641, 143, 281, 71, 1404, 2790, 712)

A two-way task is pursued. SICK-NL contains more cases where entailment does not hold (7,019; labelled NEUTRAL or CONTRADICTION) than cases where entailment does hold (2,821; labelled ENTAILMENT). Only sentence pairs that have been labelled as neutral bidirectionally will be preserved as negative cases, in an attempt to balance the entailment labels.

In [3]:
def get_original_distribution(sicknl_tsv):
    """
    Retrieves the distribution of cases where entailment does and 
    does not hold from the original SICK-NL tsv.

    A row counts as "yes" when its 4th column is ENTAILMENT. Otherwise it
    counts as "no" only when the 6th and 7th columns are A_neutral_B and
    B_neutral_A respectively; all remaining rows are ignored. An empty file
    yields (0, 0); blank lines are ignored.

    :param sicknl_tsv: path to the original SICK-NL tsv
    :return: the number of cases where entailment holds (yes) and does not hold (no)
    """
    yes = no = 0
    # Decode explicitly as UTF-8 instead of relying on the platform locale;
    # newline="" is what the csv module recommends for file objects.
    with open(sicknl_tsv, "r", encoding="utf-8", newline="") as infile:
        reader = csv.reader(infile, delimiter="\t")
        next(reader, None)  # skip the header without failing on an empty file
        for row in reader:
            if not row:  # csv.reader yields [] for blank lines
                continue
            if row[3] == "ENTAILMENT":
                yes += 1
            elif row[5] == "A_neutral_B" and row[6] == "B_neutral_A":
                no += 1

    return yes, no

In [4]:
# (yes, no): ENTAILMENT pairs versus pairs labelled neutral in both directions
get_original_distribution('sicknl.tsv')

(2821, 4992)

However, this is not sufficient to balance the data. Additional steps will be needed.

## Converting the tsv to a Python dictionary

In [5]:
from collections import defaultdict


def convert_to_dict(sicknl_tsv):
    """
    Converts the original SICK-NL tsv to a Python dictionary, converting 
    the entailment label to a binary label (YES for ENTAILMENT and NO for 
    pairs that have been labelled NEUTRAL bidirectionally).

    The result is keyed by the first column of each row; the value maps the
    remaining header names to the row's values. The second and third header
    names are replaced by 't' and 'h'. Rows that are neither ENTAILMENT nor
    neutral in both directions are dropped.
    
    :param sicknl_tsv: path to the original SICK-NL tsv
    :return: the SICK-NL dict
    """

    sicknl = defaultdict(dict)
    # Read the file the caller asked for (not a hard-coded path) and decode it
    # explicitly as UTF-8 instead of relying on the platform locale.
    with open(sicknl_tsv, "r", encoding="utf-8", newline="") as infile:
        reader = csv.reader(infile, delimiter="\t")
        header = next(reader, None)
        if header is None:  # empty file: nothing to convert
            return sicknl
        # Make header consistent with RTE-3
        header[1] = 't'
        header[2] = 'h'
        for row in reader:
            if not row:  # csv.reader yields [] for blank lines
                continue
            # Convert from three-way to two-way
            if row[3] == "ENTAILMENT":
                row[3] = "YES"
            elif row[5] == "A_neutral_B" and row[6] == "B_neutral_A":
                row[3] = "NO"  # Only consider sentences agreed bidirectionally as being neutral
            else:
                continue

            # Convert dataset to dict: every column after the key column,
            # stored under its header name
            sicknl[row[0]].update(zip(header[1:], row[1:]))

    return sicknl

In [6]:
# Mapping from the first column of each kept row to its remaining columns, with YES/NO labels
sicknl = convert_to_dict('sicknl.tsv')

## Randomising the dataset to randomly skip sentence pairs

In [7]:
import random

def shuffle_sicknl(sicknl_dict):
    """
    Returns a copy of the SICK-NL dict whose entries are in a random order, 
    so that the pairs skipped later on for balancing are picked at random.

    The random generator is seeded with 1, which makes the resulting 
    order reproducible.
    
    :param sicknl_dict: dict of SICK-NL
    :return: the shuffled SICK-NL dict (a plain dict)
    """

    pair_ids = [*sicknl_dict]
    rng = random.Random(1)
    rng.shuffle(pair_ids)

    shuffled = {}
    for pair_id in pair_ids:
        shuffled[pair_id] = sicknl_dict[pair_id]
    return shuffled

In [8]:
# shuffle_sicknl uses a fixed seed, so this order is the same on every run
sicknl_shuffled = shuffle_sicknl(sicknl)

## Splitting the data into 3 sets

In [9]:
from math import ceil, floor

def make_splits(dev_split, train_split, sicknl_dict=None):
    """
    Divides the SICK-NL dict into a dev, a train and a test set where 
    each split consists of a 50-50 ratio between positive and negative pairs.

    Pairs are assigned in iteration order: dev is filled first, then train, 
    and the rest goes to test. Every positive (YES) pair is kept. Negative 
    (NO) pairs are capped per split at the number of positives in that split, 
    and surplus negatives are dropped.
    
    :param dev_split: proportion of data to be assigned to the dev set, 
        as a percentage (e.g. 60)
    :param train_split: proportion of data to be assigned to the train set, 
        as a percentage (e.g. 20)
    :param sicknl_dict: the (shuffled) SICK-NL dict to split; defaults to the 
        notebook-level ``sicknl_shuffled``
    :return: the SICK-NL dev, train and test split
    """

    if sicknl_dict is None:
        # Backward-compatible default: the dict built earlier in the notebook
        sicknl_dict = sicknl_shuffled

    # Count the labels in the dict being split, so the quotas always match 
    # the data that is actually distributed
    count_yes = sum(1 for feats in sicknl_dict.values()
                    if feats['entailment_label'] == "YES")
    count_no = len(sicknl_dict) - count_yes

    # Positives: dev rounds up, train rounds down, test receives the rest
    dev_quota_yes = ceil(count_yes / 100 * dev_split)
    train_quota_yes = floor(count_yes / 100 * train_split)
    test_quota_yes = max(0, count_yes - dev_quota_yes - train_quota_yes)

    # Negatives: never more than the positives of the same split (and never 
    # more than the split's own share of the negatives), so that each split 
    # stays balanced even when a quota is a whole number
    dev_quota_no = min(dev_quota_yes, ceil(count_no / 100 * dev_split))
    train_quota_no = min(train_quota_yes, floor(count_no / 100 * train_split))
    test_quota_no = test_quota_yes

    dev_count_yes = train_count_yes = 0
    dev_count_no = train_count_no = test_count_no = 0
    dev = defaultdict(dict)
    train = defaultdict(dict)
    test = defaultdict(dict)
    for pair_id, feats in sicknl_dict.items():
        if feats['entailment_label'] == "YES":
            if dev_count_yes < dev_quota_yes:
                dev[pair_id] = feats
                dev_count_yes += 1
            elif train_count_yes < train_quota_yes:
                train[pair_id] = feats
                train_count_yes += 1
            else:
                test[pair_id] = feats
        else:  # NO
            if dev_count_no < dev_quota_no:
                dev[pair_id] = feats
                dev_count_no += 1
            elif train_count_no < train_quota_no:
                train[pair_id] = feats
                train_count_no += 1
            # Skip last negative sentence pairs to balance
            elif test_count_no < test_quota_no:
                test[pair_id] = feats
                test_count_no += 1

    return dev, train, test

In [10]:
# dev_split=60 and train_split=20 (percentages); whatever remains goes to the test set
dev, train, test = make_splits(60, 20)

## Verifying the splits

In [11]:
def get_new_distribution(sicknl_split):
    """
    Counts the positive and negative sentence pairs in a single split.

    Every entry whose 'entailment_label' is "YES" counts as positive; 
    every other entry counts as negative.

    :param sicknl_split: a SICK-NL inference dict
    :return: the number of cases where entailment holds and does not hold
    """
    labels = [feats['entailment_label'] for feats in sicknl_split.values()]
    positives = labels.count("YES")
    negatives = len(labels) - positives

    return positives, negatives

In [12]:
# Report the (yes, no) counts of each split to confirm that they are balanced
print(f"dev: {get_new_distribution(dev)}")
print(f"train: {get_new_distribution(train)}")
print(f"test: {get_new_distribution(test)}")

dev: (1693, 1693)
train: (564, 564)
test: (564, 564)


## Getting average sentence length (in tokens)

In [13]:
import spacy


# spaCy pipeline used to split the sentences into tokens
nlp = spacy.load('nl_core_news_lg')
token_c_t = 0
token_c_h = 0
pair_c = 0
for entry in {**dev, **train, **test}.values():
    # Count the non-punctuation tokens of the 't' and the 'h' sentence of each pair
    token_c_t += sum(1 for sent in nlp(entry['t']).sents for token in sent if not token.is_punct)
    token_c_h += sum(1 for sent in nlp(entry['h']).sents for token in sent if not token.is_punct)
    pair_c += 1

print(f"Average t length: {round(token_c_t / pair_c, 1)} tokens")
print(f"Average h length: {round(token_c_h / pair_c, 1)} tokens")

Average t length: 9.1 tokens
Average h length: 8.8 tokens


## Serialising the splits

In [14]:
import pickle

# Store each split as a plain dict (not a defaultdict). The context managers
# make sure every file is flushed and closed once it has been written.
with open("dev.p", "wb") as dev_file:
    pickle.dump(dict(dev), dev_file)
with open("train.p", "wb") as train_file:
    pickle.dump(dict(train), train_file)
with open("test.p", "wb") as test_file:
    pickle.dump(dict(test), test_file)