In [1]:
import codecs
import argparse
import csv
import random

from collections import defaultdict
import numpy as np
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
    
from nltk.corpus import wordnet as wn

In [2]:
def read_subsumptions(filename):
    """Read subsumption pairs from a tab-separated, UTF-8 encoded file.

    Each non-empty row contributes one tuple built from its first two
    fields; any further fields on the row are ignored.  Quote characters
    are treated as ordinary text (csv.QUOTE_NONE).

    Args:
        filename: path of the tab-separated file to read.

    Returns:
        A list of (first_field, second_field) tuples in file order.
        Elsewhere in this notebook the pair is used as (term, hypernym).

    Raises:
        IndexError: if a non-empty row has fewer than two fields.
    """
    subsumptions = []

    with codecs.open(filename, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)

        for row in reader:
            # csv.reader yields an empty list for a blank line; skip it
            # instead of failing on row[0].
            if not row:
                continue
            subsumptions.append((row[0], row[1]))

    return subsumptions


In [279]:
# Training relations from the file without the '.orig' suffix.
train = read_subsumptions('subsumptions-train.txt')

# Train / test / validation relations from the '.orig' files.
orig_train, orig_test, orig_validation = (
    read_subsumptions('subsumptions-train.txt.orig'),
    read_subsumptions('subsumptions-test.txt.orig'),
    read_subsumptions('subsumptions-validation.txt.orig'),
)

# Pool of all hypernymy relations with exact duplicate rows collapsed.
# Going through a set means the file's row order is not preserved.
all_train = list(set(read_subsumptions('all_hyper_relations.tsv')))

In [270]:
print len(train), len(orig_train)
#print len([(x,y)for x, y in train if x == 'alcohol'])
#print [(x,y)for x, y in orig_train if x == 'alcohol']

orig_d = defaultdict(list)
train_d = defaultdict(list)
orig_test_d = defaultdict(list)
all_train_d = defaultdict(list)
orig_valid_d = defaultdict(list)

for x, y in orig_train:
    orig_d[x].append(y)

for x, y in train:
    train_d[x].append(y)
    
for x, y in orig_test:
    orig_test_d[x].append(y)

for x, y in all_train:
    all_train_d[x].append(y)
    
for x, y in orig_validation:
    orig_valid_d[x].append(y)    

# convert to read-only use now    
for dic in [orig_d, train_d, orig_test_d, all_train_d, orig_valid_d]:
    dic.default_factory = None

9762 4374


In [44]:
print len(train_d), len(orig_d)

print len(train_d.keys())
print len(orig_d.keys())

1409 1355
1409
1355


In [47]:
print len(set(train_d.keys()))
print len(set(orig_d.keys()))

1409
1355


In [197]:
# The original data is of much higher quality than the larger data set.
# For instance, the hypernyms of 'dog' in the original data set are:
#   placental, mammal, animal, vertebrate, chordate, carnivore, canine, pet
#
# The larger data set additionally includes these inaccurate hypernyms:
#   animals, pets, species, mammals, predator, thing, creature, predators
orig_d['dog']

['placental',
 'mammal',
 'animal',
 'vertebrate',
 'chordate',
 'carnivore',
 'canine',
 'pet']

In [61]:
# Hypernyms recorded for 'dog' in the index built from subsumptions-train.txt.
train_d['dog']

['placental',
 'mammal',
 'animal',
 'vertebrate',
 'chordate',
 'carnivore',
 'canine',
 'pet',
 'animals',
 'pets',
 'species',
 'mammals',
 'predator',
 'thing',
 'creature',
 'predators']

In [213]:
# Terms that have at least one hypernym in the respective index.
train_d_keys = [term for term, hypernyms in train_d.iteritems() if hypernyms]
orig_d_keys = [term for term, hypernyms in orig_d.iteritems() if hypernyms]

# Terms that occur in train_d but not in orig_d.
train_only_terms = set(train_d_keys) - set(orig_d_keys)
new_terms = list(train_only_terms)

In [214]:
# replace hypernyms 
test_terms = list(set([x[0] for x in orig_test]))

# we have 601 new test terms
print len(new_terms)
# ...of which 265 exist in the test set
print len(set(train_d_keys).intersection(set(test_terms)))

601
265


In [215]:
# since we need lexical split to avoid the lexical memorisation 
# problem we can only embellish our training set with 336 new terms
# this should increase the training set by ~ 1140 training relations
new_terms = list(set(new_terms)- set(test_terms))
print len(new_terms)

336


In [271]:
# iterate over new_terms, look for relations in all_train_d
# unpack list and add relation to orig_train
print len(orig_train)
for term in new_terms:    
    try:
        hypernyms = all_train_d[term]
        train_tuples = zip([term] * len(hypernyms), hypernyms)
        orig_train.extend(train_tuples)
    except KeyError:    
        print term, ' not found'
        pass

#all_train_d['flask']
print len(orig_train)



4374
5527


In [268]:
# Finally, write every relation in orig_train to
# subsumptions-more-train.txt, one tab-separated relation per row.
# The file is opened in binary mode, as the Python 2 csv module expects.
with open('subsumptions-more-train.txt', 'wb') as out_file:
    csv.writer(out_file, delimiter='\t').writerows(orig_train)
    



In [218]:
term = 'roebuck'
for syn in wn.synsets(term):
    if syn.pos() == 'n':
        print syn, '-', syn.lemma_names()
        if term in syn.lemma_names():
            for hyper in syn.hypernyms():        
                print "Hypernyms: ", hyper, '-', hyper.lemma_names()
            print [(x,y) for x, y in syn.hypernym_distances() if 0 < y <= 4 ]
            print "-"*30
#[0].hypernyms()[0].lemma_names()

Synset('roebuck.n.01') - [u'roebuck']
Hypernyms:  Synset('roe_deer.n.01') - [u'roe_deer', u'Capreolus_capreolus']
[(Synset('ruminant.n.01'), 3), (Synset('deer.n.01'), 2), (Synset('roe_deer.n.01'), 1), (Synset('even-toed_ungulate.n.01'), 4)]
------------------------------


In [143]:
# Fetch a single WordNet synset by its identifier for closer inspection.
a = wn.synset('caterpillar.n.01')
#wn.synsets('partial')

In [145]:
# First the direct hypernyms of the synset, then all (synset, distance)
# pairs along its hypernym chain; the synset itself appears at distance 0.
print a.hypernyms()
print a.hypernym_distances()
#[x for x, y in a.hypernym_distances() if y <= 4]


[Synset('larva.n.01')]
set([(Synset('physical_entity.n.01'), 7), (Synset('animal.n.01'), 2), (Synset('object.n.01'), 6), (Synset('larva.n.01'), 1), (Synset('living_thing.n.01'), 4), (Synset('organism.n.01'), 3), (Synset('entity.n.01'), 8), (Synset('whole.n.02'), 5), (Synset('caterpillar.n.01'), 0)])


In [147]:
# Lemma names of the synset 'whole.n.02'.
wn.synset('whole.n.02').lemma_names()

[u'whole', u'unit']