# Preprocessing the Dutch RTE-3 corpus
This script preprocesses RTE-3 and serialises the result as a dev, train and test split.

## Inspecting the RTE-3 dataset

In [1]:
def get_original_distribution(rte3_txt, split):
    """
    Counts the entailment and length labels per NLP task for one split of the RTE-3 txt.

    :param rte3_txt: path to the translated RTE-3 txt
    :param split: the split to count, compared with the second field of each header line ("dev" or "test")
    :return: dict mapping each task (IE, QA, IR, SUM) to its 'yes', 'no', 'short' and 'long' counts
    """
    tasks = {name: {'yes': 0, 'no': 0, 'short': 0, 'long': 0}
             for name in ('IE', 'QA', 'IR', 'SUM')}
    with open(rte3_txt, "r") as rte3:
        for line_no, line in enumerate(rte3):
            # Every entry spans 4 lines; only the first (header) line carries the labels.
            if line_no % 4:
                continue
            feats = line.split()
            if feats[1] != split:
                continue
            counts = tasks[feats[4]]
            # Anything other than "YES" counts as 'no', anything other than 'short' as 'long'.
            counts['yes' if feats[2] == "YES" else 'no'] += 1
            counts['short' if feats[5] == 'short' else 'long'] += 1

    return tasks

In [2]:
# Total the entailment labels of the original dev split over the four tasks.
distribution = get_original_distribution('rte3.txt', 'dev')
original_dev_yes = sum(counts['yes'] for counts in distribution.values())
original_dev_no = sum(counts['no'] for counts in distribution.values())
print(original_dev_yes, original_dev_no)

# Same totals for the original test split.
distribution = get_original_distribution('rte3.txt', 'test')
original_test_yes = sum(counts['yes'] for counts in distribution.values())
original_test_no = sum(counts['no'] for counts in distribution.values())
print(original_test_yes, original_test_no)

# Number of negative pairs over both original splits.
orig_count_no = original_dev_no + original_test_no

412 388
410 390


RTE-3 contains more cases where entailment holds (822) than cases where entailment does not hold (778). To balance the entailment labels, the surplus of 44 positive pairs is removed: 11 entailment cases per task.

## Converting the txt to a Python dictionary

In [3]:
from collections import defaultdict


def convert_to_dict(rte3_txt):
    """
    Converts the translated RTE-3 txt to a Python dictionary.

    The file is read in chunks of 4 lines: a header line with whitespace-separated
    features, the line stored as 'h', a line that is skipped, and the line stored as 't'.

    :param rte3_txt: path to the translated RTE-3 txt
    :return: defaultdict mapping each pair key to a dict with the keys
             'set', 'entailment_label', 'task', 'length', 'h' and 't'
    """

    rte3 = defaultdict(dict)
    # Build dict with all entries.
    with open(rte3_txt, "r") as infile:
        chunk_i = 1
        for line in infile:
            # Process RTE-3 in chunks of 4 lines
            if chunk_i == 1:
                feats = line.split()
                # The key is artificially created: an id seen for the first time is kept
                # as a string, a repeated id becomes the integer id + 1 so that it does
                # not overwrite the earlier entry.
                if feats[0] not in rte3:
                    pair_id = feats[0]
                else:
                    pair_id = int(feats[0]) + 1
                rte3[pair_id]['set'] = feats[1]
                rte3[pair_id]['entailment_label'] = feats[2]
                rte3[pair_id]['task'] = feats[4]
                rte3[pair_id]['length'] = feats[5]
            elif chunk_i == 2:
                rte3[pair_id]['h'] = line.rstrip()
            elif chunk_i == 3:
                # The third line of a chunk is not stored.
                pass
            else:  # chunk_i == 4
                rte3[pair_id]['t'] = line.rstrip()
                chunk_i = 0
            chunk_i += 1

    return rte3

In [4]:
# Parse the translated RTE-3 txt into a dict of sentence pairs.
rte3 = convert_to_dict('rte3.txt')

## Randomising the dataset to randomly remove sentence pairs

In [5]:
import random

def shuffle_rte3(rte3_dict):
    """
    Returns a copy of the RTE-3 dict whose entries are in a pseudo-random order,
    so that sentence pairs can be skipped at random when balancing the dataset.

    :param rte3_dict: dict of RTE-3
    :return: new dict with the same entries in shuffled order
    """

    # A fixed seed keeps the order identical between runs.
    rng = random.Random(1)
    order = [*rte3_dict.keys()]
    rng.shuffle(order)

    shuffled = {}
    for key in order:
        shuffled[key] = rte3_dict[key]
    return shuffled

In [6]:
# Put the sentence pairs in a pseudo-random order (the shuffle uses a fixed seed).
rte3_shuffled = shuffle_rte3(rte3)

## Balancing the dataset

In [7]:
def balance_dataset(rte3_shuffled):
    """
    Removes 11 cases where entailment holds for each NLP task.

    The keys are shuffled once more with a fixed seed and the first 11 "YES" pairs
    encountered for each of the tasks IE, QA, IR and SUM are left out.
    The input dict is not modified.

    :param rte3_shuffled: the shuffled RTE-3 dict
    :return: new, balanced RTE-3 dict in the re-shuffled key order
    """
    # Number of positive pairs that still have to be dropped per task.
    to_remove = {'IE': 11, 'QA': 11, 'IR': 11, 'SUM': 11}

    keys = list(rte3_shuffled.keys())
    random.Random(1).shuffle(keys)

    balanced = {}
    for pair_id in keys:
        # Read the pair from the argument itself rather than from a module-level dict.
        feats = rte3_shuffled[pair_id]
        task = feats['task']
        if feats['entailment_label'] == "YES" and to_remove.get(task, 0) > 0:
            to_remove[task] -= 1
            continue
        balanced[pair_id] = feats

    return balanced

In [8]:
# Drop 11 positive pairs per task; the name is rebound to the balanced dict.
rte3_shuffled = balance_dataset(rte3_shuffled)

## Splitting the data into 3 sets

In [9]:
from math import ceil, floor

def make_splits(dev_split, train_split, pairs=None, negative_count=None):
    """
    Divides the RTE-3 dict into a dev, a train and a test set.

    For each entailment label separately, pairs are assigned in iteration order:
    first to dev until its quota is reached, then to train until its quota is
    reached, and all remaining pairs to test. Both labels share the same quotas,
    which are derived from the number of negative pairs.

    :param dev_split: percentage of negative_count assigned to the dev set per label (rounded down)
    :param train_split: percentage of negative_count assigned to the train set per label (rounded up)
    :param pairs: dict of sentence pairs to split; defaults to the module-level rte3_shuffled
    :param negative_count: count the quotas are based on; defaults to the module-level orig_count_no
    :return: the RTE-3 dev, train and test split
    """
    if pairs is None:
        pairs = rte3_shuffled
    if negative_count is None:
        negative_count = orig_count_no

    # The quotas do not change while iterating, so compute them once.
    dev_quota = floor(negative_count / 100 * dev_split)
    train_quota = ceil(negative_count / 100 * train_split)

    dev = defaultdict(dict)
    train = defaultdict(dict)
    test = defaultdict(dict)
    # Pairs placed so far per label; every label other than "YES" counts as "NO".
    dev_counts = {'YES': 0, 'NO': 0}
    train_counts = {'YES': 0, 'NO': 0}

    for pair_id, feats in pairs.items():
        label = 'YES' if feats['entailment_label'] == "YES" else 'NO'
        if dev_counts[label] < dev_quota:
            dev[pair_id] = feats
            dev_counts[label] += 1
        elif train_counts[label] < train_quota:
            train[pair_id] = feats
            train_counts[label] += 1
        else:
            test[pair_id] = feats

    return dev, train, test

In [10]:
# dev_split=60 and train_split=20; pairs that fit in neither quota end up in test.
dev, train, test = make_splits(60, 20)

## Verifying the splits

In [11]:
def get_new_distribution(rte3_split):
    """
    Counts the entailment and length labels per NLP task in a RTE-3 dict split.

    :param rte3_split: dict of sentence pairs, each with the keys 'entailment_label', 'task' and 'length'
    :return: dict mapping each task (IE, QA, IR, SUM) to its 'yes', 'no', 'short' and 'long' counts
    """
    tasks = {name: {'yes': 0, 'no': 0, 'short': 0, 'long': 0}
             for name in ('IE', 'QA', 'IR', 'SUM')}

    for feats in rte3_split.values():
        # Anything other than 'YES' counts as 'no', anything other than 'short' as 'long'.
        label_key = 'yes' if feats['entailment_label'] == 'YES' else 'no'
        tasks[feats['task']][label_key] += 1
        length_key = 'short' if feats['length'] == 'short' else 'long'
        tasks[feats['task']][length_key] += 1

    return tasks

In [12]:
# Total the positive and negative pairs of the dev split over the four tasks.
distribution = get_new_distribution(dev)
yes = sum(counts['yes'] for counts in distribution.values())
no = sum(counts['no'] for counts in distribution.values())
print('dev:', yes, no)

# Same totals for the train split.
distribution = get_new_distribution(train)
yes = sum(counts['yes'] for counts in distribution.values())
no = sum(counts['no'] for counts in distribution.values())
print('train:', yes, no)

# Same totals for the test split.
distribution = get_new_distribution(test)
yes = sum(counts['yes'] for counts in distribution.values())
no = sum(counts['no'] for counts in distribution.values())
print('test:', yes, no)

dev: 466 466
train: 156 156
test: 156 156


## Getting average sentence length (in tokens)

In [13]:
import spacy


nlp = spacy.load('nl_core_news_lg')


def _count_tokens(text):
    """Returns the number of non-punctuation tokens in the sentences spaCy finds in text."""
    return len([token.text
                for sent in nlp(text).sents
                for token in sent
                if not token.is_punct])


# Token totals for short texts, long texts and hypotheses.
token_c_t_s, token_c_t_l, token_c_h = 0, 0, 0
# Number of pairs overall, and of pairs with a short / long text.
pair_c, text_s_c, text_l_c = 0, 0, 0

for entry in {**dev, **train, **test}.values():
    if entry['length'] == 'short':
        token_c_t_s += _count_tokens(entry['t'])
        text_s_c += 1
    else:  # long
        token_c_t_l += _count_tokens(entry['t'])
        text_l_c += 1
    token_c_h += _count_tokens(entry['h'])
    pair_c += 1

print("Average t_s length:", round(token_c_t_s / text_s_c, 1), "tokens")
print("Average t_l length:", round(token_c_t_l / text_l_c, 1), "tokens")
print("Average h length:", round(token_c_h / pair_c, 1), "tokens")

Average t_s length: 28.6 tokens
Average t_l length: 56.1 tokens
Average h length: 8.4 tokens


## Serialising the splits

In [14]:
import pickle

# Write each split to its own pickle file as a plain dict. The with-block closes
# (and thereby flushes) every file as soon as its split has been written.
for filename, split in (("dev.p", dev), ("train.p", train), ("test.p", test)):
    with open(filename, "wb") as outfile:
        pickle.dump(dict(split), outfile)