<b>Evaluation of models by means of lexical substitution</b><br/>




In [2]:
# Imports
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from collections import defaultdict, Counter
import tensorflow as tf
import random
import time
import operator
from functools import reduce
import pickle
#os.listdir('small_dataset')

In [4]:
# Helper functions

def prod(it):
    """Return the product of all items in ``it`` (1 for an empty iterable)."""
    result = 1
    for factor in it:
        result = result * factor
    return result

def get_candidate_dict():
    """Read 'lst/lst.gold.candidates' into a target -> candidates mapping.

    Each line is expected to look like ``target::cand1;cand2;...``. The
    part before '::' becomes the key and the part after it is split on ';'.

    Returns:
        A ``defaultdict(list)``, so looking up an unknown target yields an
        empty candidate list instead of raising KeyError.
    """
    mapping = defaultdict(list)
    with open('lst/lst.gold.candidates','r') as handle:
        for raw_line in handle.readlines():
            target, candidates = raw_line.strip().split('::')
            mapping[target] = candidates.split(';')
    return mapping

def cos(v1, v2):
    """Cosine similarity between two equal-length vectors.

    Args:
        v1, v2: sequences or 1-D arrays of numbers with the same length.

    Returns:
        A float in [-1, 1] (up to rounding): 1 means same direction,
        -1 means opposite direction. If either vector has zero norm the
        similarity is undefined; 0.0 is returned instead of NaN so that
        downstream scores remain sortable.

    Raises:
        AssertionError: if the vectors differ in length.
    """
    assert len(v1) == len(v2)
    # Vectorised computation in float64; avoids the per-element Python loop.
    a = np.asarray(v1, dtype=float)
    b = np.asarray(v2, dtype=float)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Zero vector: 0/0 would give NaN and poison every ranking it touches.
        return 0.0
    return np.dot(a, b) / denominator

def pcos(v1,v2):
    """Cosine similarity of v1 and v2 rescaled from [-1, 1] to [0, 1]."""
    similarity = cos(v1, v2)
    return 0.5 * (similarity + 1)

def add(target, sub, context):
    """Additive substitution score: mean cosine of ``sub`` to target and context.

    Args:
        target: embedding of the target word.
        sub: embedding of the substitution word.
        context: list of embeddings of the context words.

    Returns:
        The arithmetic mean of cos(sub, target) and cos(sub, c) for every
        context embedding c.
    """
    target_score = cos(sub, target)
    context_scores = [cos(sub, c) for c in context]
    n_terms = len(context) + 1
    return (target_score + sum(context_scores)) / n_terms

def mult(target, sub,context):
    """Multiplicative substitution score: geometric mean of positive cosines.

    Multiplicative counterpart of ``add``: the pcos of ``sub`` against the
    target and against every context embedding are multiplied and the
    (len(context) + 1)-th root is taken, i.e. one root per factor.

    The previous exponent ``1/(2*len(context))`` did not match the number
    of factors and raised ZeroDivisionError for an empty context. For a
    fixed context size both exponents are monotone, so candidate rankings
    within a sentence are unchanged; only the score values differ.

    Args:
        target: embedding of the target word.
        sub: embedding of the substitution word.
        context: list of embeddings of the context words (may be empty).

    Returns:
        The geometric mean of the pcos values, in [0, 1] up to rounding.
    """
    factors = [pcos(sub, target)] + [pcos(sub, c) for c in context]
    # 1.0 keeps this a true division even under Python 2 semantics.
    return prod(factors) ** (1.0 / len(factors))

def load_skipgram():
    """Load the pickled skip-gram embedding matrix and vocabulary mappings.

    Reads 'skipgram_embedding.matrix', 'i2w.skip' and 'w2i.skip' from the
    current working directory, printing a notice once the matrix is loaded.

    Returns:
        (embed_mat, int2word_skip, word2int_skip), in that order.
    """
    def _unpickle(path):
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only use this with files produced by this project.
        with open(path,'rb') as handle:
            return pickle.load(handle)

    embed_mat = _unpickle('skipgram_embedding.matrix')
    print("Embedding loaded.")
    int2word_skip = _unpickle('i2w.skip')
    word2int_skip = _unpickle('w2i.skip')
    return embed_mat, int2word_skip, word2int_skip

def get_embedding(word, embed_mat, word2int):
    """Return the row of ``embed_mat`` that belongs to ``word``.

    Args:
        word: the token to look up.
        embed_mat: 2-D array indexed as ``embed_mat[row, :]``.
        word2int: mapping from token to row index; must contain '<UNK>'.

    Returns:
        The embedding row for ``word``, or the '<UNK>' row when ``word``
        is not a key of ``word2int``.
    """
    try:
        row = word2int[word]
    except KeyError:
        # Out-of-vocabulary tokens all share the '<UNK>' row.
        row = word2int['<UNK>']
    vector = embed_mat[row,:]
    return vector

def word2embed_skip(embedding_mat, w2i,*input_words):
    """Embed one or more words with the skip-gram embedding matrix.

    Args:
        embedding_mat: embedding matrix passed through to ``get_embedding``.
        w2i: word -> row-index mapping passed through to ``get_embedding``.
        *input_words: the words to embed.

    Returns:
        * no words   -> an empty list (previously raised IndexError),
        * one word   -> that word's embedding vector,
        * many words -> a list of embedding vectors, in input order.
    """
    if not input_words:
        return []
    if len(input_words) == 1:
        return get_embedding(input_words[0], embedding_mat, w2i)
    embedded = [get_embedding(word, embedding_mat, w2i) for word in input_words]
    # Kept from the original: drop any entry for which the lookup yields None.
    return [vec for vec in embedded if vec is not None]

def result2line(target, sent_id, sorted_results):
    """Format one ranked-substitutes line for the output file.

    Args:
        target: target identifier written in the second field.
        sent_id: sentence/instance id written in the third field.
        sorted_results: iterable of (word, score) pairs, already ordered.

    Returns:
        'RANKED<TAB>target<TAB>sent_id<TAB>' followed by 'word score<TAB>'
        for every pair and a terminating newline.
    """
    header = 'RANKED\t{}\t{}\t'.format(target, sent_id)
    entries = ['{} {}\t'.format(word, score) for (word, score) in sorted_results]
    return header + ''.join(entries) + '\n'


In [None]:
def eval_model(embed_func, load_func):
    """Rank substitution candidates for every test sentence and write the results.

    For each line of 'lst/lst_test.preprocessed' (tab-separated: target,
    sentence id, target position, sentence) every candidate of the target is
    scored with ``add`` and ``mult`` against the target and context
    embeddings. Candidates are written, best first, as one 'RANKED' line per
    sentence to 'lst_add.out' and 'lst_mult.out'.

    Args:
        embed_func: currently unused; embeddings are looked up through
            ``get_embedding``. Kept so existing calls keep working.
        load_func: zero-argument callable returning
            (embedding_matrix, int2word, word2int).

    Side effects:
        Overwrites 'lst_add.out' and 'lst_mult.out' in the working directory
        and prints progress every 500 sentences.
    """
    # varname_e means embedded vector of word in var <varname>
    cand_dict = get_candidate_dict()
    punc_set = {'.', ',', ':', ';', '_', '-'}
    embedding_mat, i2w, w2i = load_func()

    with open('lst/lst_test.preprocessed','r') as f:
        lines = [raw.strip() for raw in f]

    total = len(lines)
    elapsed = 0
    avg_time = 0
    # Context managers guarantee both output files are flushed and closed
    # even when scoring a line raises.
    with open('lst_add.out','w') as add_file, open('lst_mult.out','w') as mult_file:
        for i, line in enumerate(lines):
            start = time.time()

            if i%500 == 0:
                print("EVAL {}/{}".format(i, total))
                print("avg it: {} sec".format(avg_time))

            target, s_id, pos, sent = line.split('\t')
            target_pos = int(pos)
            candidates = cand_dict[target]
            # Context = every token except the target position and punctuation.
            # A distinct index name avoids shadowing the outer loop counter.
            context = [tok for tok_idx, tok in enumerate(sent.split(' '))
                       if tok_idx != target_pos and tok not in punc_set]
            context_e = [get_embedding(c, embedding_mat, w2i) for c in context]
            # Targets carry a suffix after '.', e.g. 'find.v'; embed the part before it.
            target_e = get_embedding(target.split('.')[0], embedding_mat, w2i)

            results_add = {}
            results_mult = {}
            for sub in candidates:
                sub_e = get_embedding(sub, embedding_mat, w2i)
                results_add[sub] = add(target_e, sub_e, context_e)
                results_mult[sub] = mult(target_e, sub_e, context_e)

            add_sort = sorted(results_add.items(), key=operator.itemgetter(1), reverse=True)
            mult_sort = sorted(results_mult.items(), key=operator.itemgetter(1), reverse=True)
            add_file.write(result2line(target, s_id, add_sort))
            mult_file.write(result2line(target, s_id, mult_sort))

            elapsed += time.time() - start
            avg_time = elapsed / (i+1)
    print('DONE.')
            
    
# Run the evaluation with the pickled skip-gram embeddings; this writes
# 'lst_add.out' and 'lst_mult.out' into the working directory.
eval_model(word2embed_skip, load_skipgram)    

Embedding loaded.
EVAL 0/1710
avg it: 0 sec


In [6]:
# Sanity check: load the pickled embeddings and display the vector that is
# returned for the '<UNK>' token.
embedding_mat, i2w,w2i = load_skipgram()
word2embed_skip(embedding_mat, w2i, '<UNK>')

Embedding loaded.


array([ 0.09630829,  0.7440492 ,  0.34276482,  0.8761543 , -0.94496965,
        0.23159432,  0.4099211 , -0.68691844,  0.3098697 ,  0.33363786,
       -0.2250453 , -0.3840212 ,  0.14566989,  0.16685817,  0.3829214 ,
       -0.5618807 , -0.05575426, -0.53012216, -0.08547916, -0.14512509,
        0.46568826, -0.6449125 , -0.4510265 ,  0.8284293 ,  0.3008002 ,
        0.103365  , -0.55712   , -0.02216346, -0.4562141 ,  0.65446734,
       -0.54112333,  1.0685014 , -0.25909323, -0.8431133 ,  0.97710973,
       -0.20312196, -0.5276526 , -0.5820654 , -0.5117101 ,  0.18762219,
       -0.34834078, -0.72800124,  0.28341955,  0.16523412,  0.42910457,
       -0.5118524 ,  0.32897094,  0.11907633, -0.20873548, -0.33772415,
        0.4324073 , -0.4987982 , -0.25987038, -0.28694206, -0.46506748,
        0.01246771,  1.0905099 ,  0.45066744,  0.49298775,  0.24635513,
        0.14903821, -0.7840744 , -0.5718523 ,  0.5324328 ,  0.39959148,
       -0.590829  ,  0.7384237 , -0.5569704 ,  0.3998689 , -0.66

In [71]:
# Direct vocabulary lookup. The recorded output is a KeyError, i.e.
# 'conversely' is not a key of w2i.
# NOTE(review): this cell raises on Restart & Run All; a membership test
# such as `'conversely' in w2i` would show the same fact without failing.
w2i['conversely']

KeyError: 'conversely'

In [41]:
# NOTE(review): no callable named `bla` is defined in the visible notebook
# (elsewhere `bla` is bound to the result of sess.run(embed_tensor)). This
# looks like a leftover scratch call to a since-deleted function and will
# fail on a fresh kernel — confirm and consider removing the cell.
bla(1,2,3,4,5)

1 2
(3, 4, 5)


In [7]:
#RESULT    find.v 71    show 0.34657
def read_eval_line(eval_line, ignore_mwe=True):
    """Parse one tab-separated result line into (instance_id, weighted candidates).

    The first field is skipped, the second field (stripped) is the instance
    id, and every further non-empty field is 'candidate weight', split at
    the LAST space so that multi-word candidates keep their inner spaces.

    Args:
        eval_line: the line to parse, e.g.
            'RESULT<TAB>find.v 71<TAB>show 0.34657'.
        ignore_mwe: when true, candidates containing a space or a hyphen
            (multi-word expressions) are skipped. This used to be read from
            an undefined global, which raised NameError on a fresh kernel.
            NOTE(review): the default of True is an assumption — confirm it
            against the scorer in lst/lst_gap.py.

    Returns:
        (instance_id, eval_weights), where eval_weights is a list of
        (candidate, float_weight) tuples in file order.

    Raises:
        IndexError: if the line has fewer than two tab-separated fields.
    """
    eval_weights = []
    segments = eval_line.split("\t")
    instance_id = segments[1].strip()
    for candidate_weight in segments[2:]:
        if not candidate_weight:
            continue
        delimiter_ind = candidate_weight.rfind(' ')
        if delimiter_ind == -1:
            # No separator: treat the whole field as a candidate without a
            # weight so it is reported below instead of being mis-split.
            candidate, weight = candidate_weight, ''
        else:
            candidate = candidate_weight[:delimiter_ind]
            weight = candidate_weight[delimiter_ind:]
        if ignore_mwe and (' ' in candidate or '-' in candidate):
            continue
        try:
            eval_weights.append((candidate, float(weight)))
        except ValueError:
            # Only a malformed weight is expected here; anything else should surface.
            print("Error appending: %s %s" % (candidate, weight))

    return instance_id, eval_weights

['divide',
 'surface',
 'perspective',
 'aspect',
 'conversely',
 'other hand',
 'area',
 'dividing line',
 'flank',
 'instead',
 'you',
 'for us',
 'ally',
 'contingent',
 'faction',
 'hand',
 'part',
 'standpoint',
 'boundary',
 'bank',
 'fringe',
 'against',
 'facet',
 'from you',
 'shore',
 'edge',
 'responsibility',
 'team',
 'position',
 'view']

In [4]:
# Restore the trained skip-gram graph and weights from 'skip_checkpoints'
# and grab a handle to the tensor named 'embedding:0'.
# NOTE(review): tf.Session / tf.get_default_graph look like the TensorFlow
# 1.x graph-mode API — confirm the installed TensorFlow version.
sess = tf.Session()
saver = tf.train.import_meta_graph('skip_checkpoints/text8.ckpt.meta')
saver.restore(sess, tf.train.latest_checkpoint('skip_checkpoints'))
graph = tf.get_default_graph()
embed_tensor = graph.get_tensor_by_name('embedding:0')



INFO:tensorflow:Restoring parameters from skip_checkpoints/text8.ckpt


In [5]:
# Evaluate the restored embedding tensor; the result is what a later cell
# pickles to 'skipgram_embedding.matrix'.
# NOTE(review): `bla` is an uninformative name for the embedding matrix.
bla = sess.run(embed_tensor)

In [3]:
# List the contents of the 'lst' data directory.
os.listdir('lst')

['lst_test.gold',
 'lst.gold.candidates',
 'score.pl',
 'generalized_average_precision.py',
 'lst.wn.candidates',
 'lst_gap.py',
 'README',
 'gap-score-file',
 'lst_all.gold',
 'scoreFA.pl',
 'lst_test.preprocessed']

In [11]:
# Persist the extracted embedding matrix; load_skipgram() reads this file.
# NOTE(review): this cell appears after the cells that load the file, so a
# top-to-bottom run on a clean checkout would not find it.
with open('skipgram_embedding.matrix','wb') as f:
    pickle.dump(bla,f)

1
