In [21]:
from collections import defaultdict, Counter
import gzip
import json

import nltk
# Fetch the WordNet data used by the WordNet lemmatizer later in this notebook.
# NOTE(review): 'omw-1.4' is presumably needed by newer NLTK WordNet readers — confirm.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /home/johnny/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/johnny/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [13]:
# Load benchmark. The encoding is pinned so the JSON text decodes the same way
# on every platform instead of depending on the locale default.
with open('swords-v1.1_test.json', 'r', encoding='utf-8') as f:
    swords = json.load(f)

# Gather substitutes by target: target id -> list of substitute ids
tid_to_sids = defaultdict(list)
for sid, substitute in swords['substitutes'].items():
    tid_to_sids[substitute['target_id']].append(sid)

# Preview the first target only (note the `break` at the end of the loop body)
for tid, target in swords['targets'].items():
    context = swords['contexts'][target['context_id']]
    substitutes = [swords['substitutes'][sid] for sid in tid_to_sids[tid]]
    labels = [swords['substitute_labels'][sid] for sid in tid_to_sids[tid]]
    # Score = fraction of 'TRUE' labels; an empty label list scores 0.0
    # rather than raising ZeroDivisionError.
    scores = [votes.count('TRUE') / len(votes) if votes else 0.0 for votes in labels]
    # Highest-scoring substitutes first; sorted() is stable, so ties keep input order.
    ranked = sorted(zip(substitutes, scores), key=lambda pair: -pair[1])
    print('-' * 80)
    # Join the context's lines with a space so adjacent sentences are not fused.
    print(context['context'].replace('\n', ' '))
    print('-' * 20)
    print('{} ({})'.format(target['target'], target['pos']))
    print(', '.join('{} ({}%)'.format(sub['substitute'], round(score * 100)) for sub, score in ranked))
    break

--------------------------------------------------------------------------------
"Noriega has closed every independent newspaper, radio and television station and arrested, tortured or forced into exile a long list of reporters," the statement declared.It added: "In Cuba, public enemy No. 1 of press freedoms in the hemisphere, repression of journalists both Cuban and foreign is worse than ever."And in Nicaragua, promises of press freedom by the Sandinistas "have not materialized."
--------------------
press (NOUN)
journalist (70%), media (70%), newspaper (60%), reporter (40%), communication (30%), columnist (30%), printing (20%), report (10%), voice (0%), publicist (0%), journalism (0%), periodical (0%), verbal (0%), medium (0%), paper (0%), pressure (0%), correspondent (0%), push (0%), multitude (0%), mob (0%), confusion (0%), herd (0%), drove (0%), photographer (0%), stress (0%), crowd (0%), swarm (0%), bustle (0%), demand (0%), urgency (0%), haste (0%), writer (0%), people or person

In [14]:
# The best substitute must be labelled 'TRUE' by strictly more than this
# fraction of its labels to be kept.
MIN_ACCEPT_RATE = 0.8

# Collect one (lower-cased target, POS, substitute) triple per target whose
# top-scoring substitute is a single word that does not contain the target.
good_ones = []
for tid, target in swords['targets'].items():
    sids = tid_to_sids[tid]
    if not sids:
        # No candidate substitutes for this target: there is nothing to rank,
        # and indexing an empty ranking would raise IndexError.
        continue
    substitutes = [swords['substitutes'][sid] for sid in sids]
    labels = [swords['substitute_labels'][sid] for sid in sids]
    # Score = fraction of 'TRUE' labels; an empty label list scores 0.0
    # rather than raising ZeroDivisionError.
    scores = [votes.count('TRUE') / len(votes) if votes else 0.0 for votes in labels]

    # max() returns the first maximal element, i.e. the same pick as taking
    # element 0 of a stable sort by descending score.
    candidate = max(zip(substitutes, scores), key=lambda pair: pair[1])
    sub = candidate[0]['substitute']
    word = target['target'].lower()
    if candidate[1] > MIN_ACCEPT_RATE and word not in sub.lower() and ' ' not in sub:
        good_ones.append((word, target['pos'], sub))


In [15]:
# Drop duplicate triples while keeping first-seen order. Going through a set
# would make the resulting order depend on string hashing, which can differ
# between kernel sessions and so is not reproducible.
good_ones = list(dict.fromkeys(good_ones))

In [16]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Benchmark POS tags -> the single-letter POS codes passed to the lemmatizer.
pos_mapping = dict(NOUN='n', VERB='v', ADJ='a', ADV='r')

# Lemmatize both the target word and its substitute using the target's POS;
# the POS tag itself is carried through unchanged.
good_ones = [
    (
        lemmatizer.lemmatize(target_word, pos_mapping[tag]),
        tag,
        lemmatizer.lemmatize(replacement, pos_mapping[tag]),
    )
    for target_word, tag, replacement in good_ones
]

In [None]:
# NOTE(review): `dev` and `test` are not defined in any cell visible in this
# notebook, and this cell has no execution count. Presumably they hold the
# lemmatized `good_ones` results for the dev and test splits — confirm.
# `word_pairs` is reassigned to a hand-curated literal in the following cell.
word_pairs = dev + test

In [54]:
# Hand-curated list of (target word, POS tag, substitute word) triples;
# duplicates were removed manually.
# NOTE(review): presumably derived from the lemmatized `good_ones` output — confirm.
# NOTE(review): a few entries look irregular and may deserve a second look:
# ('saw', 'VERB', 'witness') and ('fell', 'VERB', 'decrease') look like
# unlemmatized past-tense forms, and ('excite', 'VERB', 'enthusiastic') pairs
# a verb with an adjective.
word_pairs = [('odd', 'ADJ', 'uncommon'),
 ('knock', 'VERB', 'push'),
 ('pulse', 'NOUN', 'beating'),
 ('raise', 'VERB', 'lift'),
 ('cathedral', 'NOUN', 'church'),
 ('nice', 'ADJ', 'good'),
 ('size', 'NOUN', 'proportion'),
 ('idiot', 'NOUN', 'imbecile'),
 ('way', 'NOUN', 'direction'),
 ('small', 'ADJ', 'little'),
 ('guy', 'NOUN', 'player'),
 ('personnel', 'NOUN', 'staff'),
 ('start', 'VERB', 'begin'),
 ('call', 'VERB', 'summon'),
 ('logical', 'ADJ', 'rational'),
 ('saw', 'VERB', 'witness'),
 ('leech', 'NOUN', 'bloodsucker'),
 ('thin', 'ADJ', 'slender'),
 ('marvel', 'VERB', 'wonder'),
 ('just', 'ADV', 'quite'),
 ('first', 'ADJ', 'initial'),
 ('consider', 'VERB', 'contemplate'),
 ('next', 'ADJ', 'following'),
 ('permit', 'VERB', 'allow'),
 ('box', 'NOUN', 'container'),
 ('business', 'NOUN', 'operation'),
 ('monitor', 'VERB', 'track'),
 ('think', 'VERB', 'reckon'),
 ('big', 'ADJ', 'huge'),
 ('more', 'ADJ', 'great'),
 ('chance', 'NOUN', 'odds'),
 ('customer', 'NOUN', 'clientele'),
 ('trouble', 'NOUN', 'difficulty'),
 ('return', 'NOUN', 'exchange'),
 ('cold', 'ADJ', 'icy'),
 ('zone', 'NOUN', 'sector'),
 ('people', 'NOUN', 'folk'),
 ('office', 'NOUN', 'workplace'),
 ('hire', 'VERB', 'enlist'),
 ('playwright', 'NOUN', 'writer'),
 ('glance', 'VERB', 'look'),
 ('lead', 'VERB', 'guide'),
 ('damage', 'NOUN', 'harm'),
 ('representative', 'NOUN', 'delegate'),
 ('very', 'ADV', 'really'),
 ('totally', 'ADV', 'absolutely'),
 ('help', 'VERB', 'assist'),
 ('lot', 'NOUN', 'heap'),
 ('okay', 'ADJ', 'ok'),
 ('excite', 'VERB', 'enthusiastic'),
 ('ease', 'VERB', 'lighten'),
 ('innocent', 'ADJ', 'harmless'),
 ('area', 'NOUN', 'location'),
 ('idea', 'NOUN', 'thought'),
 ('device', 'NOUN', 'equipment'),
 ('run', 'VERB', 'bolt'),
 ('equal', 'ADJ', 'equivalent'),
 ('subject', 'NOUN', 'topic'),
 ('man', 'NOUN', 'person'),
 ('newspaper', 'NOUN', 'paper'),
 ('own', 'ADJ', 'hold'),
 ('bath', 'NOUN', 'bathroom'),
 ('reply', 'VERB', 'answer'),
 ('interior', 'NOUN', 'inside'),
 ('choose', 'VERB', 'pick'),
 ('company', 'NOUN', 'corporation'),
 ('disposal', 'NOUN', 'discarding'),
 ('likely', 'ADV', 'presumably'),
 ('house', 'NOUN', 'home'),
 ('shallow', 'ADJ', 'empty'),
 ('safety', 'NOUN', 'safeness'),
 ('file', 'NOUN', 'record'),
 ('kindness', 'NOUN', 'goodness'),
 ('try', 'VERB', 'attempt'),
 ('cry', 'VERB', 'weep'),
 ('provide', 'VERB', 'supply'),
 ('analysis', 'NOUN', 'evaluation'),
 ('interesting', 'ADJ', 'fascinating'),
 ('caution', 'VERB', 'warn'),
 ('ugly', 'ADJ', 'unpleasant'),
 ('fell', 'VERB', 'decrease'),
 ('full', 'ADJ', 'whole'),
 ('move', 'VERB', 'go'),
 ('cut', 'VERB', 'reduce'),
 ('prove', 'VERB', 'affirm'),
 ('many', 'ADJ', 'multiple'),
 ('brilliant', 'ADJ', 'glowing'),
 ('income', 'NOUN', 'earnings'),
 ('delighted', 'ADJ', 'ecstatic'),
 ('cease', 'VERB', 'halt'),
 ('perhaps', 'ADV', 'maybe'),
 ('journalist', 'NOUN', 'newspeople'),
 ('love', 'VERB', 'cherish'),
 ('forgive', 'VERB', 'pardon'),
 ('joy', 'NOUN', 'glee'),
 ('jealous', 'ADJ', 'envious'),
 ('guess', 'VERB', 'predict'),
 ('old', 'ADJ', 'elderly'),
 ('toss', 'VERB', 'throw'),
 ('accept', 'VERB', 'recognize'),
 ('gentle', 'ADJ', 'soft'),
 ('say', 'VERB', 'state'),
 ('grow', 'VERB', 'increase'),
 ('heavy', 'ADJ', 'hefty'),
 ('stature', 'NOUN', 'status'),
 ('voice', 'NOUN', 'sound'),
 ('team', 'NOUN', 'group')]

In [55]:
# Tally every word that appears as a target (index 0) or as a substitute
# (index 2); any count above 1 means a word is used in more than one slot.
c = Counter()
c.update(pair[0] for pair in word_pairs)
c.update(pair[2] for pair in word_pairs)
print(c)

Counter({'odd': 1, 'knock': 1, 'pulse': 1, 'raise': 1, 'cathedral': 1, 'nice': 1, 'size': 1, 'idiot': 1, 'way': 1, 'small': 1, 'guy': 1, 'personnel': 1, 'start': 1, 'call': 1, 'logical': 1, 'saw': 1, 'leech': 1, 'thin': 1, 'marvel': 1, 'just': 1, 'first': 1, 'consider': 1, 'next': 1, 'permit': 1, 'box': 1, 'business': 1, 'monitor': 1, 'think': 1, 'big': 1, 'more': 1, 'chance': 1, 'customer': 1, 'trouble': 1, 'return': 1, 'cold': 1, 'zone': 1, 'people': 1, 'office': 1, 'hire': 1, 'playwright': 1, 'glance': 1, 'lead': 1, 'damage': 1, 'representative': 1, 'very': 1, 'totally': 1, 'help': 1, 'lot': 1, 'okay': 1, 'excite': 1, 'ease': 1, 'innocent': 1, 'area': 1, 'idea': 1, 'device': 1, 'run': 1, 'equal': 1, 'subject': 1, 'man': 1, 'newspaper': 1, 'own': 1, 'bath': 1, 'reply': 1, 'interior': 1, 'choose': 1, 'company': 1, 'disposal': 1, 'likely': 1, 'house': 1, 'shallow': 1, 'safety': 1, 'file': 1, 'kindness': 1, 'try': 1, 'cry': 1, 'provide': 1, 'analysis': 1, 'interesting': 1, 'caution': 1,