# Preparing the word-entailment bake-off dataset

In [1]:
__author__ = "Christopher Potts"
__version__ = "CS224u, Stanford, Spring 2019 term"

## Contents

## Set-up

In [2]:
from collections import defaultdict
import json
import pandas as pd
import random
from sklearn.model_selection import train_test_split

## Underlying data

From https://github.com/sleepinyourhat/vector-entailment/releases/tag/W15-R2:

In [3]:
wordentail_filename = 'shuffled_word_relations.tsv'

In [4]:
# Read the tab-separated triples, supplying the three column names explicitly.
# NOTE(review): pandas' default NA parsing may turn word tokens such as "null"
# or "nan" into missing values; presumably that is why rows with a missing
# word are filtered out further down — confirm.
df = pd.read_csv(
    wordentail_filename,
    sep="\t",
    names=['relation', 'left', 'right'],
)

In [5]:
df.head()

Unnamed: 0,relation,left,right
0,hyponym,bears,carry
1,hypernym,seat,portion
2,synonym,frame,adjust
3,synonym,grass,spring
4,synonym,anniversary,yearly


Some (left, right) pairs are listed with more than one relation, which should be impossible. Keep one randomly sampled row per pair to avoid this:

In [6]:
# Keep exactly one randomly chosen row per (left, right) pair. A fixed
# random_state makes the choice, and hence the generated dataset, reproducible
# across kernel restarts; 42 matches the seed used for the train/test splits.
df = df.groupby(['left', 'right']).apply(lambda x: x.sample(1, random_state=42))

In [7]:
# Drop the `left`/`right` columns, bring the index levels of the same names
# back as columns, then replace whatever index remains with a fresh 0..n-1 one.
df = (
    df
    .drop(columns=['left', 'right'])
    .reset_index(level=['left', 'right'])
    .reset_index(drop=True)
)

In [8]:
# Keep only the rows in which neither `left` nor `right` is missing.
df = df[df['left'].notna() & df['right'].notna()].copy()

In [9]:
df.shape[0]

28355

## Random train–test split

In [36]:
# First split: half of the (left, right) pairs go to train; the other half is
# held out under the provisional name `X_dev` and is divided again afterwards.
X_train, X_dev, y_train, y_dev = train_test_split(
    df[['left', 'right']].values, df['relation'].values, 
    random_state=42, train_size=0.5, test_size=None)

In [37]:
# Second split: divide the held-out half evenly into the final dev and test sets.
X_dev, X_test, y_dev, y_test = train_test_split(
    X_dev, y_dev, random_state=42, train_size=0.5, test_size=None)

In [38]:
train = list(zip(X_train, y_train))

Make sure no pair appears in both train and the held-out dev/test splits (the original dataset has a handful of repeated pairs that can lead to violations of this expectation of disjointness):

In [39]:
dev_set = {tuple(x) for x in X_dev}

In [40]:
test_set = {tuple(x) for x in X_test}

In [41]:
# Rebuild the training data as [[left, right], label] examples, skipping any
# pair that also occurs in the dev or test split.
X_train = [
    [list(pair), label]
    for pair, label in train
    if not (tuple(pair) in dev_set or tuple(pair) in test_set)
]

In [42]:
dev = list(zip(X_dev, y_dev))

In [43]:
test = list(zip(X_test, y_test))

In [44]:
X_dev = [[list(x), y] for x, y in dev]

In [45]:
X_test = [[list(x), y] for x, y in test]

In [46]:
len(X_train)

14177

In [47]:
len(X_dev)

7089

In [50]:
len([x for x in X_dev if x not in X_test])

7089

In [48]:
len(X_test)

7089

In [52]:
len([x for x in X_test if x not in X_dev])

7089

In [53]:
len({tuple(x) for x in list(zip(*X_train))[0]})

14177

## Fully disjoint train-test split

In [54]:
vocab = sorted(set(df[['left', 'right']].values.ravel()))

In [55]:
train_size = int(len(vocab) * 0.5)

In [56]:
# Draw the training vocabulary with a dedicated, seeded generator so the
# word-disjoint split is reproducible. `vocab` is sorted, so the same seed
# always selects the same words, and the global `random` state is untouched.
train_vocab = sorted(random.Random(42).sample(vocab, train_size))

In [57]:
assess_vocab = sorted(set(vocab) - set(train_vocab))

In [58]:
dev_size = int(len(assess_vocab) * 0.5)

In [59]:
# Draw the dev vocabulary with a dedicated, seeded generator: `assess_vocab`
# is sorted, so for the same `assess_vocab` the same dev words are selected on
# every run, and the global `random` state is untouched.
dev_vocab = sorted(random.Random(42).sample(assess_vocab, dev_size))

In [61]:
test_vocab = sorted(set(assess_vocab) - set(dev_vocab))

In [62]:
def create_disjoint_df_split(vocab):
    """Return the rows of the global `df` whose two words both belong to `vocab`.

    Parameters
    ----------
    vocab : iterable of str
        Words allowed on either side of a pair.

    Returns
    -------
    pd.DataFrame
        The subset of `df` in which `left` and `right` are both in `vocab`.
    """
    left_in_vocab = df['left'].isin(vocab)
    right_in_vocab = df['right'].isin(vocab)
    return df[left_in_vocab & right_in_vocab]


disjoint_train = create_disjoint_df_split(train_vocab)

disjoint_dev = create_disjoint_df_split(dev_vocab)

disjoint_test = create_disjoint_df_split(test_vocab)

In [63]:
disjoint_total = disjoint_train.shape[0] + disjoint_dev.shape[0] + disjoint_test.shape[0]

In [64]:
disjoint_total

10688

In [65]:
disjoint_total / df.shape[0]

0.3769352847822254

In [66]:
disjoint_train.shape[0] / disjoint_total

0.6597118263473054

In [67]:
d_train = disjoint_train['relation'].value_counts()

d_train / d_train.sum()

synonym     0.508013
hypernym    0.403631
hyponym     0.057580
antonym     0.030776
Name: relation, dtype: float64

In [68]:
d_dev = disjoint_dev['relation'].value_counts()

d_dev / d_dev.sum()

synonym     0.532937
hypernym    0.357991
hyponym     0.078294
antonym     0.030778
Name: relation, dtype: float64

In [69]:
d_test = disjoint_test['relation'].value_counts()

d_test / d_test.sum()

synonym     0.534454
hypernym    0.365826
hyponym     0.052661
antonym     0.047059
Name: relation, dtype: float64

In [70]:
def create_disjoint_split(disjoint):
    """Convert a relations frame into a list of `[[left, right], relation]` examples.

    Parameters
    ----------
    disjoint : pd.DataFrame
        Frame with `left`, `right` and `relation` columns.

    Returns
    -------
    list
        One `[[left, right], relation]` entry per row, in row order.
    """
    examples = []
    columns = zip(disjoint['left'], disjoint['right'], disjoint['relation'])
    for left_word, right_word, relation in columns:
        examples.append([[left_word, right_word], relation])
    return examples


X_disjoint_train = create_disjoint_split(disjoint_train)

X_disjoint_dev = create_disjoint_split(disjoint_dev)

X_disjoint_test = create_disjoint_split(disjoint_test)

In [73]:
# Sanity check: gather the (left, right) pairs of each word-disjoint split as
# sets of tuples, discarding the labels.
dtr, _ = zip(*X_disjoint_train)
dtr = set(map(tuple, dtr))

dd, _ = zip(*X_disjoint_dev)
dd = set(map(tuple, dd))

dt, _ = zip(*X_disjoint_test)
dt = set(map(tuple, dt))

# Print the pairs shared by each pair of splits.
print(dtr & dt)

print(dt & dd)

print(dtr & dd)

set()
set()
set()


## Disjoint and balanced

In [74]:
def balance_split(split, seed=None):
    """Subsample `split` so that no word repeats in the same slot for a label.

    The examples are visited in random order and an example is kept only if
    its left word has not already been kept as a left word with the same
    label, and its right word has not already been kept as a right word with
    the same label.

    Parameters
    ----------
    split : list
        Examples of the form `[[left, right], label]`. The list is not
        modified: a shuffled copy is traversed instead of shuffling in place.
    seed : int or None
        If given, a dedicated `random.Random(seed)` drives the shuffle so the
        result is reproducible. If None (the default), the global `random`
        state is used.

    Returns
    -------
    list
        The retained examples as new `[[left, right], label]` lists.
    """
    rng = random if seed is None else random.Random(seed)
    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(split)
    rng.shuffle(shuffled)
    data = []
    seen_left = set()
    seen_right = set()
    for (left, right), label in shuffled:
        if (left, label) not in seen_left and (right, label) not in seen_right:
            data.append([[left, right], label])
            seen_left.add((left, label))
            seen_right.add((right, label))
    return data

In [75]:
X_balanced_train = balance_split(X_disjoint_train)

In [76]:
X_balanced_dev = balance_split(X_disjoint_dev)

In [77]:
X_balanced_test = balance_split(X_disjoint_test)

In [78]:
len(X_balanced_train)

2212

In [80]:
len(X_balanced_dev)

796

In [81]:
len(X_balanced_test)

835

In [82]:
pd.DataFrame(X_balanced_train)[1].value_counts()

synonym     1150
hypernym     731
hyponym      177
antonym      154
Name: 1, dtype: int64

In [83]:
pd.DataFrame(X_balanced_dev)[1].value_counts()

synonym     430
hypernym    253
hyponym      66
antonym      47
Name: 1, dtype: int64

In [84]:
pd.DataFrame(X_balanced_test)[1].value_counts()

synonym     426
hypernym    287
antonym      64
hyponym      58
Name: 1, dtype: int64

In [85]:
# Collect every word that occurs in an edge-disjoint train, dev or test example.
# Note: this rebinds `vocab`, which earlier held the sorted word list used to
# build the word-disjoint splits.
vocab = {w for ex, label in X_train+X_dev+X_test for w in ex}

In [86]:
len(vocab)

6560

## Output in JSON

In [87]:
data = {
    'edge_disjoint': {
        'train': X_train, 
        'dev': X_dev
    },
    'word_disjoint': {
        'train': X_disjoint_train,
        'dev': X_disjoint_dev
    },
    'word_disjoint_balanced': {
        'train': X_balanced_train,
        'dev': X_balanced_dev        
    },
    'vocab': sorted(vocab)
}

In [88]:
test_data = {
    'edge_disjoint': {
        'test': X_test
    },
    'word_disjoint': {
        'test': X_disjoint_test
    },
    'word_disjoint_balanced': {
        'test': X_balanced_test        
    },
    'vocab': sorted(vocab)
}

In [93]:
len(data['edge_disjoint']['train'])

14177

In [94]:
len(data['word_disjoint']['dev'])

1852

In [96]:
len(test_data['word_disjoint']['test'])

1785

In [91]:
# Size of the held-out edge-disjoint test split.
len(test_data['edge_disjoint']['test'])

7089

In [97]:
len(data['edge_disjoint']['dev'])

7089

In [98]:
len(test_data['word_disjoint']['test'])

1785

In [99]:
len(data['word_disjoint_balanced']['train'])

2212

In [100]:
len(data['word_disjoint_balanced']['dev'])

796

In [101]:
len(test_data['word_disjoint_balanced']['test'])

835

In [102]:
with open('nli_wordentail_bakeoff_data.json', 'wt') as f:
    json.dump(data, f, indent=4, sort_keys=True)

In [104]:
with open('nli_wordentail_bakeoff_data-test.json', 'wt') as f:
    json.dump(test_data, f, indent=4, sort_keys=True)

## Final test

In [107]:
with open('nli_wordentail_bakeoff_data.json', 'rt') as f:
    data = json.load(f)
    
with open('nli_wordentail_bakeoff_data-test.json', 'rt') as f:
    test_data = json.load(f)    

In [118]:
for cond in ('edge_disjoint', 'word_disjoint', 'word_disjoint_balanced'):
    # Index each split by hashable (left, right, label) keys so every overlap
    # check is a set intersection rather than a nested scan over lists.
    keys = {
        'train': {(left, right, label) for (left, right), label in data[cond]['train']},
        'dev': {(left, right, label) for (left, right), label in data[cond]['dev']},
        'test': {(left, right, label) for (left, right), label in test_data[cond]['test']},
    }
    # No example may be shared by any two of the three splits.
    for first, second in (('train', 'dev'), ('train', 'test'), ('dev', 'test')):
        overlap = keys[first] & keys[second]
        assert not overlap, "{}: {} example(s) shared by {} and {}".format(
            cond, len(overlap), first, second)