<b>Evaluation of models by means of lexical substitution</b><br/>




In [2]:
# Imports
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from collections import defaultdict, Counter
import tensorflow as tf
import random
import time
import operator
from functools import reduce
import pickle
#os.listdir('small_dataset')

In [69]:
# Helper functions

def prod(it):
    return reduce(operator.mul, it, 1)

def get_candidate_dict():
    with open('lst/lst.gold.candidates','r') as f:
        lines = map(str.strip,f.readlines())
        d = defaultdict(list)
        for line in lines:
            target, candidates = line.split('::')
            d[target] = candidates.split(';')
    return d

def cos(v1, v2):
    # Calculates the vector cosine similarity between v1 and v2
    # Returns a value between -1 and 1, 1 meaning exactly same
    #  and -1 meaning exactly opposite.
    assert len(v1) == len(v2)
    numerator = sum([v1[i]*v2[i] for i in range(len(v1))])
    denominator = np.sqrt(sum([x**2 for x in v1])) \
                * np.sqrt(sum([x**2 for x in v2]))
    
    return (numerator/denominator)

def pcos(v1,v2):
    return 0.5*(cos(v1,v2)+1)

def add(target, sub, context):
    # target: embedding of target word
    # sub   : embedding of substitution word
    # context: list of embeddings of context words
    return (cos(sub,target) + sum([cos(sub,c) for c in context]))/(len(context)+1)

def mult(target, sub,context):
    return (pcos(sub,target) * prod([pcos(sub, c) for c in context]))**(1/(2*len(context)))

def load_skipgram():
   
    with open('skipgram_embedding.matrix','rb') as f:
        embed_mat = pickle.load(f)
    print("Embedding loaded.")
    with open('i2w.skip','rb') as f:
        int2word_skip = pickle.load(f)
    with open('w2i.skip','rb') as f:
        word2int_skip = pickle.load(f)
    return embed_mat, int2word_skip, word2int_skip

def get_embedding(word, embed_mat, word2int):
    try:
        idx = word2int[word]
    except KeyError:
        # KeyError will return the UNK vector
        idx = word2int['<UNK>']
    return embed_mat[idx,:]

def word2embed_skip(embedding_mat, w2i,*input_words):

    if len(input_words) > 1:
        result = [get_embedding(x, embedding_mat, w2i) for x in input_words]
        result = [x for x in result if x is not None]
    else:
        result = get_embedding(input_words[0], embedding_mat, w2i)
    return result

def result2line(target, sent_id, sorted_results):
    line = 'RANKED\t{} {}\t'.format(target,sent_id)
    for (word, score) in sorted_results:
        line += '{} {}\t'.format(word,score)
    line +='\n'
    return line


In [70]:
# Lexical substitution model
def eval_model(load_model_func):
    # varname_e means embedded vector of word in var <varname>
    cand_dict = get_candidate_dict()
    punc_set = ['.',',',':',';','_','-']
    embedding_mat, i2w,w2i = load_model_func()
    print("VOCABULARY HAS SIZE {}".format(embedding_mat.shape[0]))
    with open('lst/lst_test.preprocessed','r') as f:
        lines = list(map(str.strip,f.readlines()))
    
   # with open('lst.out','w') as f:
    
    add_lines = []
    mult_lines = []
    global THING
    THING = 0
    st = 0
    dt = 0
    l = len(lines)
    
    for i,line in enumerate(lines):
        s = time.time()
        
        if i%100 == 0:
            print("EVAL {}/{}".format(i,l))
            print("avg it: {} sec".format(dt))
        target, s_id, pos, sent = line.split('\t')
        candidates = cand_dict[target]
        context = [x for i,x in enumerate(sent.split(' ')) if i != int(pos) and x not in punc_set]
        #print(i,'context')
        context_e = [get_embedding(c,embedding_mat, w2i) for c in context]
        #print(i,'target')
        target_e = get_embedding(target.split('.')[0],embedding_mat, w2i)
        
        results_add = {}
        results_mult = {}
        
        for sub in candidates:
            #print(sub)
            sub_e = get_embedding(sub,embedding_mat, w2i)
            
            #add_score = add(target_e, sub_e, context_e)
            #mult_score = mult(target_e, sub_e, context_e)
            #results_add[sub] = add_score
            #results_mult[sub] = mult_score
        #add_sort = sorted(results_add.items(), key=operator.itemgetter(1), reverse=True)
        #mult_sort = sorted(results_mult.items(), key=operator.itemgetter(1), reverse=True)
        
        #add_lines.append(result2line(target,s_id,add_sort))
        #mult_lines.append(result2line(target,s_id, mult_sort))
        st += time.time() - s
        dt = st / (i+1)
    #add_file = open('lst_add.out','w')
    #for line in add_lines:
        #add_file.write(line)
    #add_file.close()
    #mult_file = open('lst_mult.out','w')
    #for line in mult_lines:
        #mult_file.write(line)
    #mult_file.close()
    print('DONE.')
            
    
eval_model(load_skipgram)    

Embedding loaded.
VOCABULARY HAS SIZE 13130
EVAL 0/1710
avg it: 0 sec
EVAL 100/1710
avg it: 4.0099620819091796e-05 sec
EVAL 200/1710
avg it: 3.6215782165527345e-05 sec
EVAL 300/1710
avg it: 3.5810470581054685e-05 sec
EVAL 400/1710
avg it: 3.4759044647216795e-05 sec
EVAL 500/1710
avg it: 3.635406494140625e-05 sec
EVAL 600/1710
avg it: 3.6938985188802084e-05 sec
EVAL 700/1710
avg it: 3.6513124193464006e-05 sec
EVAL 800/1710
avg it: 3.563851118087768e-05 sec
EVAL 900/1710
avg it: 3.57712639702691e-05 sec
EVAL 1000/1710
avg it: 3.5758256912231444e-05 sec
EVAL 1100/1710
avg it: 3.636425191705877e-05 sec
EVAL 1200/1710
avg it: 3.691732883453369e-05 sec
EVAL 1300/1710
avg it: 3.699229313777043e-05 sec
EVAL 1400/1710
avg it: 3.6839246749877926e-05 sec
EVAL 1500/1710
avg it: 3.66363525390625e-05 sec
EVAL 1600/1710
avg it: 3.6673694849014284e-05 sec
EVAL 1700/1710
avg it: 3.669107661527746e-05 sec
DONE.


In [22]:
embedding_mat, i2w,w2i = load_skipgram()
#word2embed_skip(embedding_mat, w2i, '<UNK>')
embedding_mat.shape

Embedding loaded.


(13130, 300)

In [16]:
embedding.shape

NameError: name 'embedding' is not defined

In [59]:
# Rewrites wrong output files to right format.
with open('lst_addw3.out','r') as f:
    lines = f.readlines()
with open('lst_add1.out','w') as f:
    i = 0
    
    for line in lines:
        if i < 100:
            b = line.split('\t')
            #print(b[1].split('.')[0])
            t = b[1].split('.')[0]
            for i,w in enumerate(b[3:]):
                #print(w.split('0.')[0].strip())
                if w.split('0.')[0].strip() == t:
                    print(w.split('.0'))
                    del b[i+3]
                    print(i+3)
                    print(line)
                    
            #print([x.split(' ')[0] for x in b[3:]])
            s = b[0]+'\t'+b[1]+' '+b[2]
            for w in b[3:]:
                s += '\t'+w
            #print(s)
            f.write(s)
            #i += 1

['solid 0', '6225331702036535']
8
RANKED	solid.a 1081	dependable 0.20588213820742507	sturdy 0.20588213820742507	cemented 0.20588213820742507	set 0.09908956119204067	good 0.0763110130114627	valid 0.0680356265077055	solid 0.06225331702036535	dry 0.058458392485820035	respectable 0.057812489214126773	firm 0.05410070791891403	sound 0.052414873687468035	secure 0.052180210545788086	convincing 0.05210727499444877	strong 0.04921971025685812	genuine 0.04740551441274434	stable 0.04197998682295047	substantive 0.03879461806182938	accurate 0.03703415098676676	reliable 0.03683742150357246	concrete 0.035923298452400054	rigid 0.027926648439387802	fixed 0.02377629184580367	hard 0.020089884795165155	substantial 0.0060076314035449755	

['solid 0', '8974272608098804']
3
RANKED	solid.a 1082	set 0.09039260943000081	solid 0.08974272608098804	respectable 0.07468367570254071	good 0.06869220921774132	genuine 0.06706521386505994	sound 0.05848737712814536	fixed 0.053576417404430525	substantive 0.04947083899544619	

In [34]:
a = [(1,2),(2,3),(5,6)]
for i,b in enumerate(a):
    if 1 in b:
        print(b)
        print(i)
        del a[i]
print(a)

(1, 2)
0
[(2, 3), (5, 6)]


In [4]:
# Restoring the model
sess = tf.Session()
saver = tf.train.import_meta_graph('skip_checkpoints/text8.ckpt.meta')
saver.restore(sess, tf.train.latest_checkpoint('skip_checkpoints'))
graph = tf.get_default_graph()
embed_tensor = graph.get_tensor_by_name('embedding:0')
embedding_matrix= sess.run(embed_tensor)


INFO:tensorflow:Restoring parameters from skip_checkpoints/text8.ckpt


In [3]:
os.listdir('lst')

['lst_test.gold',
 'lst.gold.candidates',
 'score.pl',
 'generalized_average_precision.py',
 'lst.wn.candidates',
 'lst_gap.py',
 'README',
 'gap-score-file',
 'lst_all.gold',
 'scoreFA.pl',
 'lst_test.preprocessed']

1
