# Preparing the word-entailment bake-off dataset

In [1]:
__author__ = "Christopher Potts"
__version__ = "CS224u, Stanford, Spring 2019 term"

## Contents

## Set-up

In [3]:
from collections import defaultdict
import json
import pandas as pd
import random
from sklearn.model_selection import train_test_split

## Underlying data

The underlying data come from the W15-R2 release of the vector-entailment repository (https://github.com/sleepinyourhat/vector-entailment/releases/tag/W15-R2):

In [4]:
wordentail_filename = 'shuffled_word_relations.tsv'

In [5]:
# Read the tab-separated relations file, naming its three columns explicitly.
df = pd.read_csv(
    wordentail_filename,
    sep="\t",
    names=['relation', 'left', 'right'])

In [6]:
df.head()

Unnamed: 0,relation,left,right
0,hyponym,bears,carry
1,hypernym,seat,portion
2,synonym,frame,adjust
3,synonym,grass,spring
4,synonym,anniversary,yearly


In [7]:
# Drop the synonym rows; rows with any other relation are kept.
df = df.loc[df['relation'].ne('synonym')].copy()

Some (left, right) pairs appear with more than one relation, which should be impossible. To resolve this, keep one sampled row per pair:

In [8]:
# Keep exactly one row per (left, right) pair. A fixed random_state makes the
# choice among conflicting relations deterministic, so re-running the
# notebook from a fresh kernel produces the same dataset.
df = df.groupby(['left', 'right']).apply(lambda x: x.sample(1, random_state=42))

In [9]:
# Discard the 'left'/'right' columns, restore them from the index levels of
# the same names, and replace the remaining index with a fresh 0..n-1 range.
df = (
    df.drop(columns=['left', 'right'])
    .reset_index(level=['left', 'right'])
    .reset_index(drop=True))

In [10]:
# Retain only the rows in which both words are present (non-missing).
# NOTE(review): missing words presumably come from tokens such as "null"
# that read_csv parses as NaN by default -- confirm against the raw file.
df = df[df['left'].notna() & df['right'].notna()].copy()

In [11]:
df.shape[0]

14011

## Random train–test split

In [12]:
# First split: half of the (left, right) pairs and their relation labels go
# to train. With test_size=None the complement of train_size (the other
# half) is returned as the second part, stored here as X_dev / y_dev.
X_train, X_dev, y_train, y_dev = train_test_split(
    df[['left', 'right']].values, df['relation'].values, 
    random_state=42, train_size=0.5, test_size=None)

In [13]:
# Second split: divide the held-out half evenly into the final dev and test
# portions. NOTE: this rebinds X_dev / y_dev, so re-running this cell on its
# own would split the already-halved dev data again.
X_dev, X_test, y_dev, y_test = train_test_split(
    X_dev, y_dev, random_state=42, train_size=0.5, test_size=None)

In [14]:
train = list(zip(X_train, y_train))

Make sure that no pair appears in both train and the held-out splits: any training pair that also occurs in dev or test is dropped from train (the original dataset has a handful of repeated pairs that can lead to violations of this expectation of disjointness):

In [15]:
dev_set = {tuple(x) for x in X_dev}

In [16]:
test_set = {tuple(x) for x in X_test}

In [17]:
# Keep only the training examples whose (left, right) pair is absent from
# both held-out splits, storing each one as [[left, right], label].
X_train = [
    [list(pair), label]
    for pair, label in train
    if not (tuple(pair) in dev_set or tuple(pair) in test_set)]

In [18]:
dev = list(zip(X_dev, y_dev))

In [19]:
test = list(zip(X_test, y_test))

In [20]:
X_dev = [[list(x), y] for x, y in dev]

In [21]:
X_test = [[list(x), y] for x, y in test]

In [22]:
len(X_train)

7005

In [23]:
len(X_dev)

3503

In [24]:
len([x for x in X_dev if x not in X_test])

3503

In [25]:
len(X_test)

3503

In [26]:
len([x for x in X_test if x not in X_dev])

3503

In [27]:
len({tuple(x) for x in list(zip(*X_train))[0]})

7005

## Fully disjoint train-test split

In [28]:
vocab = sorted(set(df[['left', 'right']].values.ravel()))

In [29]:
train_size = int(len(vocab) * 0.5)

In [30]:
# Seed the module-level generator before the first draw so that the
# word-disjoint vocabulary split is reproducible when the notebook is run
# top to bottom from a fresh kernel.
random.seed(42)
train_vocab = sorted(random.sample(vocab, train_size))

In [31]:
# Every word that was not drawn for training, in sorted order.
assess_vocab = sorted(set(vocab).difference(train_vocab))

In [32]:
dev_size = int(len(assess_vocab) * 0.5)

In [33]:
# Draw dev_size words from the assessment vocabulary for dev. The draw uses
# the module-level `random` generator, so it is repeatable only when that
# generator has been seeded beforehand.
dev_vocab = sorted(random.sample(assess_vocab, dev_size))

In [34]:
# The assessment words that were not drawn for dev, in sorted order.
test_vocab = sorted(set(assess_vocab).difference(dev_vocab))

In [35]:
def create_disjoint_df_split(vocab):
    """Return the rows of the notebook-level `df` that stay inside `vocab`.

    A row is kept only when both its 'left' word and its 'right' word are
    members of `vocab`.
    """
    left_in_vocab = df['left'].isin(vocab)
    right_in_vocab = df['right'].isin(vocab)
    return df[left_in_vocab & right_in_vocab]


disjoint_train = create_disjoint_df_split(train_vocab)

disjoint_dev = create_disjoint_df_split(dev_vocab)

disjoint_test = create_disjoint_df_split(test_vocab)

In [36]:
disjoint_total = disjoint_train.shape[0] + disjoint_dev.shape[0] + disjoint_test.shape[0]

In [37]:
disjoint_total

5297

In [38]:
disjoint_total / df.shape[0]

0.37806009563914067

In [39]:
disjoint_train.shape[0] / disjoint_total

0.6528223522748726

In [40]:
d_train = disjoint_train['relation'].value_counts()

d_train / d_train.sum()

hypernym    0.788606
hyponym     0.133314
antonym     0.078080
Name: relation, dtype: float64

In [41]:
d_dev = disjoint_dev['relation'].value_counts()

d_dev / d_dev.sum()

hypernym    0.826667
hyponym     0.114444
antonym     0.058889
Name: relation, dtype: float64

In [42]:
d_test = disjoint_test['relation'].value_counts()

d_test / d_test.sum()

hypernym    0.792332
hyponym     0.141640
antonym     0.066028
Name: relation, dtype: float64

In [43]:
def create_disjoint_split(disjoint):
    """Convert a relation DataFrame into a list of labeled word pairs.

    Each row of `disjoint` (read through its 'left', 'right' and 'relation'
    columns) becomes one ``[[left, right], relation]`` entry, in row order.
    """
    rows = zip(disjoint['left'], disjoint['right'], disjoint['relation'])
    return [[[left, right], relation] for left, right, relation in rows]


X_disjoint_train = create_disjoint_split(disjoint_train)

X_disjoint_dev = create_disjoint_split(disjoint_dev)

X_disjoint_test = create_disjoint_split(disjoint_test)

In [44]:
# Collect the (left, right) pairs of each word-disjoint split as a set of
# tuples, then print the pairwise intersections; an empty set means the two
# splits share no pair.
pair_sets = {}
for split_name, split in (
        ('train', X_disjoint_train),
        ('dev', X_disjoint_dev),
        ('test', X_disjoint_test)):
    pairs, _ = zip(*split)
    pair_sets[split_name] = set(map(tuple, pairs))

dtr = pair_sets['train']
dd = pair_sets['dev']
dt = pair_sets['test']

for overlap in (dtr & dt, dt & dd, dtr & dd):
    print(overlap)

set()
set()
set()


## Disjoint and balanced

In [45]:
def balance_split(split):
    """Subsample `split` so that word/label combinations do not repeat.

    Examples are visited in random order. An example is kept only if its
    left word has not already been kept as a left word with the same label
    and its right word has not already been kept as a right word with the
    same label.

    Parameters
    ----------
    split : list
        Examples of the form ``[[left, right], label]``. The list is not
        modified.

    Returns
    -------
    list
        A new list of ``[[left, right], label]`` examples in which every
        (left, label) combination and every (right, label) combination
        occurs at most once.
    """
    # Shuffle a shallow copy rather than `split` itself, so that the
    # caller's list keeps its original order.
    shuffled = list(split)
    random.shuffle(shuffled)
    data = []
    seen_left = set()
    seen_right = set()
    for (left, right), label in shuffled:
        if (left, label) not in seen_left and (right, label) not in seen_right:
            data.append([[left, right], label])
            seen_left.add((left, label))
            seen_right.add((right, label))
    return data

In [46]:
X_balanced_train = balance_split(X_disjoint_train)

In [47]:
X_balanced_dev = balance_split(X_disjoint_dev)

In [48]:
X_balanced_test = balance_split(X_disjoint_test)

In [49]:
len(X_balanced_train)

1087

In [50]:
len(X_balanced_dev)

376

In [51]:
len(X_balanced_test)

420

In [52]:
pd.DataFrame(X_balanced_train)[1].value_counts()

hypernym    724
hyponym     183
antonym     180
Name: 1, dtype: int64

In [53]:
pd.DataFrame(X_balanced_dev)[1].value_counts()

hypernym    276
hyponym      55
antonym      45
Name: 1, dtype: int64

In [54]:
pd.DataFrame(X_balanced_test)[1].value_counts()

hypernym    295
hyponym      73
antonym      52
Name: 1, dtype: int64

In [55]:
# Every word that occurs in a pair of the edge-disjoint train, dev or test
# examples. This rebinds `vocab` to a set.
vocab = {
    word
    for pair, _label in X_train + X_dev + X_test
    for word in pair}

In [56]:
len(vocab)

5178

## Output in JSON

In [57]:
# Train and dev portions of the three conditions, plus the sorted
# vocabulary. The test portions are kept out of this dict and collected
# separately in `test_data`.
data = {
    'edge_disjoint': {
        'train': X_train, 
        'dev': X_dev
    },
    'word_disjoint': {
        'train': X_disjoint_train,
        'dev': X_disjoint_dev
    },
    'word_disjoint_balanced': {
        'train': X_balanced_train,
        'dev': X_balanced_dev        
    },
    'vocab': sorted(vocab)
}

In [58]:
# Test portions of the same three conditions, using the same top-level keys
# as `data` and the same sorted vocabulary.
test_data = {
    'edge_disjoint': {
        'test': X_test
    },
    'word_disjoint': {
        'test': X_disjoint_test
    },
    'word_disjoint_balanced': {
        'test': X_balanced_test        
    },
    'vocab': sorted(vocab)
}

In [59]:
len(data['edge_disjoint']['train'])

7005

In [60]:
len(data['word_disjoint']['dev'])

900

In [61]:
len(test_data['word_disjoint']['test'])

939

In [62]:
len(data['edge_disjoint']['train'])

7005

In [63]:
len(data['edge_disjoint']['dev'])

3503

In [64]:
len(test_data['word_disjoint']['test'])

939

In [65]:
len(data['word_disjoint_balanced']['train'])

1087

In [66]:
len(data['word_disjoint_balanced']['dev'])

376

In [67]:
len(test_data['word_disjoint_balanced']['test'])

420

In [68]:
# Write the train/dev data as indented JSON with sorted keys.
with open('nli_wordentail_bakeoff_data.json', 'wt') as f:
    json.dump(data, f, indent=4, sort_keys=True)

In [69]:
# Write the test data to its own file, using the same JSON formatting.
with open('nli_wordentail_bakeoff_data-test.json', 'wt') as f:
    json.dump(test_data, f, indent=4, sort_keys=True)

## Final test

In [70]:
with open('nli_wordentail_bakeoff_data.json', 'rt') as f:
    data = json.load(f)
    
with open('nli_wordentail_bakeoff_data-test.json', 'rt') as f:
    test_data = json.load(f)    

In [71]:
# Verify on the reloaded files that, within every condition, no example is
# shared between any two of the train, dev and test splits. Each example is
# turned into a hashable (left, right, label) tuple so the checks can use set
# intersections instead of repeated `x not in list` scans, which are
# quadratic in the size of the splits.
for cond in ('edge_disjoint', 'word_disjoint', 'word_disjoint_balanced'):
    splits = {
        'train': data[cond]['train'],
        'dev': data[cond]['dev'],
        'test': test_data[cond]['test']}
    keys = {
        name: {(left, right, label) for (left, right), label in examples}
        for name, examples in splits.items()}
    for first, second in (('train', 'dev'), ('train', 'test'), ('dev', 'test')):
        shared = keys[first] & keys[second]
        assert not shared, "%s: %d example(s) shared by %s and %s" % (
            cond, len(shared), first, second)