In [1]:
from collections import defaultdict
import gzip
import json

import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/johnny/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load benchmark
with open('swords-v1.1_dev.json', 'r') as f:
    swords = json.load(f)

# Gather substitutes by target
tid_to_sids = defaultdict(list)
for sid, substitute in swords['substitutes'].items():
    tid_to_sids[substitute['target_id']].append(sid)

# Iterate through targets
for tid, target in swords['targets'].items():
    context = swords['contexts'][target['context_id']]
    substitutes = [swords['substitutes'][sid] for sid in tid_to_sids[tid]]
    labels = [swords['substitute_labels'][sid] for sid in tid_to_sids[tid]]
    scores = [l.count('TRUE') / len(l) for l in labels]
    print('-' * 80)
    print(context['context'].replace('\n', ''))
    print('-' * 20)
    print('{} ({})'.format(target['target'], target['pos']))
    print(', '.join(['{} ({}%)'.format(substitute['substitute'], round(score * 100)) for substitute, score in sorted(zip(substitutes, scores), key=lambda x: -x[1])]))
    break

--------------------------------------------------------------------------------
Kim:				I have completed the invoices for April, May and June and we owe					Pasadena each month for a total of $3,615,910.62. I am waiting to hear					back from Patti on May and June to make sure they are okay with her.
--------------------
total (NOUN)
amount (80%), sum (80%), sum total (60%), price (60%), balance (60%), gross (60%), figure (50%), cost (50%), full amount (40%), whole (30%), number (30%), quantum (10%), aggregate (10%), bill (10%), all (0%), entirety (0%), bulk (0%), flat out (0%), final (0%), body (0%), jackpot (0%), tale (0%), totality (0%), worth (0%), result (0%), allotment (0%), quantity (0%), budget (0%), mass (0%), the works (0%)


In [3]:
good_ones = []
for tid, target in swords['targets'].items():
    context = swords['contexts'][target['context_id']]
    substitutes = [swords['substitutes'][sid] for sid in tid_to_sids[tid]]
    labels = [swords['substitute_labels'][sid] for sid in tid_to_sids[tid]]
    scores = [l.count('TRUE') / len(l) for l in labels]
    sorted_scores = sorted(zip(substitutes, scores), key=lambda x: -x[1])
    
    candidate = sorted_scores[0]
    sub = candidate[0]['substitute']
    if candidate[1] > 0.8 and (target['target'].lower() not in sub.lower()) and (' ' not in sub):
        good_ones.append((target['target'].lower(), target['pos'], candidate[0]['substitute']))


In [4]:
good_ones = list(set(good_ones))

In [5]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

pos_mapping = {'NOUN' : 'n', 'VERB' : 'v', 'ADJ' : 'a', 'ADV' : 'r'}
good_ones = [ (lemmatizer.lemmatize(word, pos_mapping[pos]), pos, lemmatizer.lemmatize(substitute, pos_mapping[pos])) \
             for word, pos, substitute in good_ones ]

In [6]:
good_ones

[('cathedral', 'NOUN', 'church'),
 ('consider', 'VERB', 'contemplate'),
 ('zone', 'NOUN', 'sector'),
 ('marvel', 'VERB', 'wonder'),
 ('just', 'ADV', 'quite'),
 ('lead', 'VERB', 'guide'),
 ('monitor', 'VERB', 'track'),
 ('look', 'VERB', 'gaze'),
 ('odd', 'ADJ', 'uncommon'),
 ('area', 'NOUN', 'location'),
 ('guy', 'NOUN', 'player'),
 ('very', 'ADV', 'incredibly'),
 ('way', 'NOUN', 'direction'),
 ('lot', 'NOUN', 'heap'),
 ('customer', 'NOUN', 'clientele'),
 ('glance', 'VERB', 'look'),
 ('box', 'NOUN', 'container'),
 ('return', 'NOUN', 'exchange'),
 ('strong', 'ADJ', 'big'),
 ('big', 'ADJ', 'huge'),
 ('next', 'ADJ', 'following'),
 ('cold', 'ADJ', 'icy'),
 ('first', 'ADJ', 'initial'),
 ('start', 'VERB', 'begin'),
 ('very', 'ADV', 'really'),
 ('representative', 'NOUN', 'delegate'),
 ('more', 'ADJ', 'great'),
 ('chance', 'NOUN', 'odds'),
 ('okay', 'ADJ', 'ok'),
 ('saw', 'VERB', 'witness'),
 ('idea', 'NOUN', 'thought'),
 ('excite', 'VERB', 'enthusiastic'),
 ('people', 'NOUN', 'folk'),
 ('logic