### LIBRARIES:

In [None]:
from allennlp.commands.elmo import ElmoEmbedder
import os
import warnings
import pandas as pd
from tqdm import tqdm_notebook
import numpy as np
import h5py
from sklearn.metrics.pairwise import cosine_similarity
# Suppress every warning category for the rest of the session (presumably to
# keep the notebook output readable); this also hides deprecation warnings
# from the libraries imported above.
warnings.filterwarnings('ignore')

### FILE PATHS:

In [None]:
# Directory holding the input word list, the ELMo model files and the outputs.
_basepath = '/home/jackalhan/Development/github/more_meaningful_representations/squad/dev'

# Bare file names; each one is resolved against `datadir` below.
_input_file_name = 'input_words.txt'
# NOTE(review): the options file name ends in `_weights.json`; confirm this is
# really the ELMo options JSON and not a copy/paste of the weights name.
_options_file_name = 'elmo_2x4096_512_2048cnn_2xhighway_weights.json'
_weight_file_name = 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'
_embeddings_file_name = 'elmo_embeddings.hdf5'
_neighbor_words_file_name = 'nearest_words.csv'

datadir = os.path.join(_basepath)


def _data_path(file_name):
    """Return the path of ``file_name`` inside ``datadir``."""
    return os.path.join(datadir, file_name)


options_file = _data_path(_options_file_name)
weight_file = _data_path(_weight_file_name)
# NOTE: the PIPELINE cell rebinds `embeddings` to the embedding array, so this
# path is only available until that cell has run.
embeddings = _data_path(_embeddings_file_name)
neighbor_words = _data_path(_neighbor_words_file_name)
input_file = _data_path(_input_file_name)

### PIPELINE:

In [None]:
# Pipeline: embed every input word with ELMo, then for each layer selection
# write the most cosine-similar words to a CSV file.

# Number of nearest neighbours reported for every word.
n_neighbors = 5

# Read one word per line. The encoding is pinned so the result does not depend
# on the machine's locale, and blank lines are dropped because they would
# otherwise be embedded as empty tokens and show up as empty words in the CSV.
# Each word is wrapped in its own list, i.e. it is embedded as a one-token
# sentence.
with open(input_file, encoding='utf-8') as f:
    contents = [[line.strip()] for line in f if line.strip()]
if not contents:
    raise ValueError('No words found in {}'.format(input_file))

ee = ElmoEmbedder(options_file, weight_file)
# NOTE: this rebinds `embeddings`, which held the embeddings-file path built in
# the FILE PATHS cell; the name is kept so later cells see the same variable.
# Assumed shape: (n_words, n_layers, n_tokens, dim) -- the code below indexes
# axis 1 with 0..2 and averages over axes 1 and 2; TODO confirm against the
# AllenNLP version in use.
embeddings = np.asarray(ee.embed_batch(contents))

# 'All' averages over axes 1 and 2 of the full array; the other entries first
# pick a single index along axis 1 and then average over the remaining axis 1.
slices = [{'slice_type':'All', 'slice_index':None, 'axis':(1,2)},
          {'slice_type':'1st', 'slice_index':0, 'axis':(1)},
          {'slice_type':'2nd', 'slice_index':1, 'axis':(1)},
          {'slice_type':'3rd', 'slice_index':2, 'axis':(1)}]

neighbor_list = []
for _s in slices:
    print('Processing : {}'.format(_s))
    layer_index = _s['slice_index']
    selected = embeddings if layer_index is None else embeddings[:, layer_index]
    # Averaging over the listed axes leaves exactly one vector per word.
    embeddings_ = selected.mean(axis=_s['axis'])
    for _id, _embedding in enumerate(tqdm_notebook(embeddings_, total=len(embeddings_))):
        # One row at a time keeps memory at O(n_words) instead of building the
        # full n_words x n_words similarity matrix.
        sk_sim = cosine_similarity(_embedding.reshape(1, -1), embeddings_)[0]
        # Indices ordered from most to least similar. The query word is part
        # of the candidate set, so it normally shows up as its own rank 1.
        neighbors = np.argsort(-sk_sim)
        for rank, neighbor_id in enumerate(neighbors[:n_neighbors], start=1):
            neighbor_list.append((_s['slice_type'],
                                  contents[_id][0],
                                  contents[neighbor_id][0],
                                  rank,
                                  sk_sim[neighbor_id]))

df_neighbors = pd.DataFrame(
    data=neighbor_list,
    columns=['slice_type', 'word', 'neighbor_word', 'neighbor_order', 'cos_similarity'])
df_neighbors.to_csv(neighbor_words, index=False)
print('Completed')