### LIBRARIES :

In [None]:
from allennlp.commands.elmo import ElmoEmbedder
import spacy
import json
import os
import warnings
import pandas as pd
from tqdm import tqdm_notebook
import numpy as np
import h5py
from collections import Counter
warnings.filterwarnings('ignore')

### FILE PATHS :

In [None]:
# Run switch: the fake-set cell of the pipeline only does its work when this is True.
execute_as_fake = True

# SQuAD split and version to work on.
#dataset_type = 'train'
dataset_type = 'dev'
dataset_version = 'v1.1'

# ---- Directories
_basepath = '/home/jackalhan/Development/github/more_meaningful_representations/squad/'
datadir = os.path.join(_basepath, dataset_type)
modeldir = os.path.join(_basepath, 'model')

# ---- ELMo model files (stored under modeldir)
_options_file_name = 'elmo_2x4096_512_2048cnn_2xhighway_weights.json'
_weight_file_name = 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'
options_file = os.path.join(modeldir, _options_file_name)
weight_file = os.path.join(modeldir, _weight_file_name)

# ---- Dataset and vocabulary files (stored under datadir)
_squad_file_name = '%s-%s.json' % (dataset_type, dataset_version)
_vocab_file_name = dataset_type + '_voc.txt'
squad_file = os.path.join(datadir, _squad_file_name)
vocab_file = os.path.join(datadir, _vocab_file_name)

# ---- Tokenized text dumps; the '{}' placeholder receives the dataset type
_paragraphs_file_name_as_txt = '{}_paragraphs.txt'
_questions_file_name_as_txt = '{}_questions.txt'
paragraphs_file_as_txt = os.path.join(datadir, _paragraphs_file_name_as_txt.format(dataset_type))
questions_file_as_txt = os.path.join(datadir, _questions_file_name_as_txt.format(dataset_type))

# ---- ELMo embedding dumps
_embedding_paragraph_file_as_h5py_name = 'elmo_paragraph_embeddings.hdf5'
_embedding_question_file_as_h5py_name = 'elmo_question_embeddings.hdf5'
embedding_paragraph_file_as_h5py = os.path.join(datadir, _embedding_paragraph_file_as_h5py_name)
embedding_question_file_as_h5py = os.path.join(datadir, _embedding_question_file_as_h5py_name)

# ---- Result file; left as a template with two '{}' placeholders that are
#      filled in (dataset type, slice type) when the results are dumped.
_nearest_all_cos_similarity_results_file_name =  '{}_slice_{}_nearest_all_cos_similarity.csv'
nearest_all_cos_similarity_results_file = os.path.join(datadir, _nearest_all_cos_similarity_results_file_name)



### UTILITIES :

In [None]:
# Blank English spaCy pipeline; in this file it is only called to split text into tokens.
nlp = spacy.blank("en")
# NOTE(review): not referenced anywhere else in the visible code; presumably the
# column name pandas assigns to an unnamed index column of a CSV - confirm.
index_field = ['Unnamed: 0']
def word_tokenize(sent):
    """Return the texts of the tokens that the module-level ``nlp`` finds in ``sent``."""
    return [tok.text for tok in nlp(sent)]
    


def read_squad_data(squad_file_path):
    """Load the paragraphs and questions of a SQuAD-format JSON file.

    Parameters
    -----------
    squad_file_path : ``str``
        Path of the JSON file. Its top-level ``data`` list holds articles, each
        with a ``paragraphs`` list whose items carry a ``context`` string and a
        ``qas`` list of objects with a ``question`` string.

    Returns
    -----------
    (paragraphs, questions) : ``tuple`` of two ``list`` of ``str``
        Contexts and questions in file order, with every newline replaced
        by a single space.
    """
    # Read from the path handed in by the caller (not from a module-level global).
    with open(squad_file_path, 'r', encoding='utf-8') as _squad:
        squad = json.load(_squad)
    paragraphs = []
    questions = []
    for _article in squad['data']:
        for _paragraph in _article['paragraphs']:
            paragraphs.append(_paragraph['context'].replace('\n', ' '))
            questions.extend(_qas['question'].replace('\n', ' ')
                             for _qas in _paragraph['qas'])
    return paragraphs, questions

def read_fake_data(paragraphs_file_path, 
                   questions_file_path):
    """Read two text files line by line into a paragraph list and a question list.

    Every line becomes one entry; its newline character is replaced by a
    space, so entries keep a trailing space and an empty line yields ``' '``.

    Returns
    -----------
    (paragraphs, questions) : ``tuple`` of two ``list`` of ``str``
    """
    with open(paragraphs_file_path, 'r') as para_in, open(questions_file_path, 'r') as ques_in:
        paragraphs = [row.replace('\n', ' ') for row in para_in]
        questions = [row.replace('\n', ' ') for row in ques_in]
    return paragraphs, questions
    

def tokenize_contexts(contexts:list):
    """Apply ``word_tokenize`` to each text in ``contexts``; return the token lists in the same order."""
    return list(map(word_tokenize, contexts))

def dump_tokenized_contexts(tokenized_contexts:list, file_path:str):
    """Write one space-joined token list per line to ``file_path``.

    The file is overwritten, and one extra newline (an empty last line) is
    written after all contexts.
    """
    with open(file_path, 'w') as fout:
        fout.writelines(' '.join(tokens) + '\n' for tokens in tokenized_contexts)
        fout.write('\n')
def create_voc(tokenized_contexts:list):
    """Return the set of all tokens in ``tokenized_contexts`` plus ``<S>``, ``</S>`` and ``<UNK>``."""
    vocabulary = {'<S>', '</S>', '<UNK>'}
    for tokens in tokenized_contexts:
        vocabulary.update(tokens)
    return vocabulary

def dump_voc(vocs:set, file_path:str):
    """Overwrite ``file_path`` with the entries of ``vocs``, one per line, ending with a newline.

    Entries are written in the iteration order of ``vocs``.
    """
    with open(file_path, 'w') as fout:
        fout.write('\n'.join(vocs) + '\n')
        
def create_and_dump_embeddings(embedder, 
                               tokenized_contexts_file_path:str, 
                               file_path_to_dump:str,
                               embed_type='all'):
    """
    Embed a file of tokenized sentences and dump the result.

    Parameters
    -----------
    embedder :
             Object exposing ``embed_file(input_file, output_file_path, output_format=...)``.
    tokenized_contexts_file_path : ``str``
             Text file to embed; it is opened for reading and handed to the embedder.
    file_path_to_dump : ``str``
             Output path, passed to the embedder unchanged.
    embed_type : ``str``, optional, (default = "all")
             Forwarded as ``output_format``: the embeddings to output.
             Must be one of "all", "top", or "average".
    """
    with open(tokenized_contexts_file_path, 'r') as fin:
        # Use the embedder supplied by the caller, not a module-level global.
        embedder.embed_file(fin, file_path_to_dump, output_format=embed_type)
        
def read_embeddings(embeddings_file_path, slice_index=None, axis=(0,1)):    
    """
    Read every dataset of an HDF5 file and reduce each one to its mean.

    Datasets whose names are decimal numbers are read in numeric order
    (0, 1, 2, ..., 10, 11, ...), so that row ``i`` of the result belongs to
    dataset ``str(i)``. Iterating the file directly would yield the names
    alphabetically ('0', '1', '10', ..., '2'), which breaks that alignment.
    Any non-numeric names come after the numeric ones, alphabetically.

    Parameters
    -----------
    embeddings_file_path : ``str``
             Path of the HDF5 file to read.
    slice_index : ``int``, optional, (default = None)
             When given, only ``dataset[slice_index]`` is averaged.
             NOTE(review): presumably selects one ELMo layer - confirm.
    axis : ``int`` or ``tuple`` of ``int``, optional, (default = (0, 1))
             Axes over which ``np.mean`` is applied; the reduced axes are
             kept with length one.

    Returns
    -----------
    ``np.ndarray`` stacking the per-dataset mean arrays.
    """
    def _dataset_order(name):
        # Numeric names first, by value; everything else afterwards, by name.
        return (0, int(name), '') if name.isdecimal() else (1, 0, name)

    embeddings_ = []
    with h5py.File(embeddings_file_path, 'r') as fin:
        print('Embeddings are getting processed!')
        dataset_names = sorted(fin, key=_dataset_order)
        for name in tqdm_notebook(dataset_names, total=len(dataset_names)):
            vec = np.array(fin[name][...])
            if slice_index is not None:
                vec = vec[slice_index]
            mean_vector = np.apply_over_axes(np.mean, vec, axis)
            embeddings_.append(mean_vector)
    embeddings = np.asarray(embeddings_)
    return embeddings 

def finding_nearest_neighbors(embedded_paragraphs_means, 
                              embedded_questions_means, 
                              questions, 
                              paragraphs,
                              norm_type='l2'):
    """
    List, for every question vector, its (up to) five most similar paragraphs.

    Parameters
    -----------
    embedded_paragraphs_means : 2-D array, one row per paragraph.
    embedded_questions_means : 2-D array, one row per question.
    questions : sequence indexed by question row; supplies the question text.
    paragraphs : sequence indexed by paragraph row; supplies the paragraph text.
    norm_type : ``str``, optional, (default = "l2")
             "l2" scores with cosine similarity; any other value scores with
             the dot product of the L1-normalized vectors.

    Returns
    -----------
    ``list`` of ``(question, paragraph, rank, similarity)`` tuples, where
    ``rank`` runs from 1 (most similar) to at most 5 for each question.
    """
    from sklearn.preprocessing import normalize
    from sklearn.metrics.pairwise import cosine_similarity
    print('Similarities are getting calculated !')
    use_cosine = norm_type == 'l2'
    if not use_cosine:
        # Identical for every question, so normalize the paragraphs only once.
        l1_paragraphs = normalize(embedded_paragraphs_means, norm='l1', axis=1)
    nearest_neighbors = []
    for q_id, q_mean in enumerate(tqdm_notebook(embedded_questions_means, total=len(embedded_questions_means))):
        question = questions[q_id]
        q_vec = np.array([q_mean])
        if use_cosine:
            sk_sim = cosine_similarity(q_vec, embedded_paragraphs_means)[0]
        else:
            sk_sim = np.dot(normalize(q_vec, norm='l1', axis=1), l1_paragraphs.T)[0]

        # Paragraph ids ordered from most to least similar.
        # No lookup of q_id inside this ranking: question ids are not paragraph
        # ids, and such a lookup fails as soon as there are more questions than
        # paragraphs.
        ranking = np.argsort(-sk_sim)
        for rank, nearest_paragraph_id in enumerate(ranking[:5], start=1):
            nearest_neighbors.append((question,
                                      paragraphs[nearest_paragraph_id],
                                      rank,
                                      sk_sim[nearest_paragraph_id]))
    return nearest_neighbors

def dump_nearest_neighbors(nearest_neighbors:list, file_path:str):
    """Save the neighbor tuples as a CSV file with a header row and no index column.

    Columns, in order: question, paragraph, nearest_order, cos_similarity.
    """
    column_names = ['question', 'paragraph', 'nearest_order', 'cos_similarity']
    table = pd.DataFrame(nearest_neighbors, columns=column_names)
    table.to_csv(file_path, index=False)

def traverse(o, tree_types=(list, tuple)):
    """Yield the leaves of ``o`` depth-first, left to right.

    An object that is an instance of ``tree_types`` is descended into;
    anything else (strings included, by default) is yielded as it is.
    """
    if not isinstance(o, tree_types):
        yield o
        return
    for child in o:
        yield from traverse(child, tree_types)

# PIPELINE

#### READ DATA:

In [None]:
# Load the contexts and questions of the configured SQuAD file.
paragraphs, questions = read_squad_data(squad_file_path=squad_file)

#### CREATE AND DUMP TOKENS:

In [None]:
# Tokenize the paragraphs and write them out, one space-joined context per line.
tokenized_paragraphs = tokenize_contexts(contexts=paragraphs)
dump_tokenized_contexts(tokenized_contexts=tokenized_paragraphs,
                        file_path=paragraphs_file_as_txt)

# Same treatment for the questions.
tokenized_questions = tokenize_contexts(contexts=questions)
dump_tokenized_contexts(tokenized_contexts=tokenized_questions,
                        file_path=questions_file_as_txt)

#### CREATE AND DUMP VOCABULARY:

In [None]:
# Build one vocabulary over all paragraph and question tokens, then write it to disk.
vocs = create_voc(tokenized_contexts=tokenized_paragraphs + tokenized_questions)
dump_voc(vocs=vocs, file_path=vocab_file)

#### FAKESET CREATION IF execute_as_fake IS True

In [None]:
if execute_as_fake:
    # (token, count) pairs for every distinct token, most frequent first.
    token_counts = Counter(traverse(tokenized_paragraphs + tokenized_questions)).most_common()
    print('Total length of tokens: {}'.format(len(token_counts)))
    # Skip the 100 most frequent tokens and keep at most the next 5000.
    sanity_tokens = token_counts[100:5100]
    print('Taking {} tokens from the list'.format(len(sanity_tokens)))
    # Each selected token becomes a one-token "context".
    sanity_tokens = [[token] for token, _count in sanity_tokens]
    # The same fake contexts overwrite both the paragraph and the question dump ...
    dump_tokenized_contexts(tokenized_contexts=sanity_tokens, file_path=paragraphs_file_as_txt)
    dump_tokenized_contexts(tokenized_contexts=sanity_tokens, file_path=questions_file_as_txt)
    # ... and are read back so that paragraphs/questions match the files on disk.
    paragraphs, questions = read_fake_data(paragraphs_file_path=paragraphs_file_as_txt,
                                           questions_file_path=questions_file_as_txt)
    dump_voc(list(traverse(sanity_tokens)), vocab_file)

#### DUMP EMBEDDINGS :

In [None]:
#INITIALIZE ELMO EMBEDDER
# Built from the options (.json) and weights (.hdf5) paths configured in the
# FILE PATHS cell; `ee` is the embedder used by the embedding-dump cell below.
ee = ElmoEmbedder(options_file, weight_file)

Important Note: Before executing the following cell to create the embeddings as a batch, 
you need to modify the **embed_file** function in the **elmo.py** file of the AllenNLP source code.
The reason is that, instead of naming the datasets in the H5PY file after the tokens, I use indexes as the dataset names; here is the small modification: 

**-> Line:285**

**Original Code:**

```python
for key, embeddings in Tqdm.tqdm(embedded_sentences):
    ...
    ...
    fout.create_dataset(key,
                        output.shape, dtype='float32',
                        data=output)
```

**Updated Code:**

```python
for i, embeddings_ in enumerate(Tqdm.tqdm(embedded_sentences)):
    key = embeddings_[0]
    embeddings = embeddings_[1]
    ...
    ...
    fout.create_dataset(str(i),
                        output.shape, dtype='float32',
                        data=output)
```


In [None]:
# Embed the tokenized paragraph file, then the tokenized question file,
# each into its own HDF5 file.
create_and_dump_embeddings(embedder=ee,
                           tokenized_contexts_file_path=paragraphs_file_as_txt,
                           file_path_to_dump=embedding_paragraph_file_as_h5py)
create_and_dump_embeddings(embedder=ee,
                           tokenized_contexts_file_path=questions_file_as_txt,
                           file_path_to_dump=embedding_question_file_as_h5py)

##  EMBEDDINGS:
#### SLICE CONFIGURATION

In [None]:
# Second dimension used when the mean embeddings are reshaped to (n, dims).
dims = 1024

# One configuration per way of reading the embeddings:
#   slice_type  - label, also used in the name of the results file
#   slice_index - index taken from each embedding before averaging (None = no slicing)
#   axis        - axis/axes the mean is taken over (a plain 0 for the sliced variants)
slices = [{'slice_type': slice_type, 'slice_index': slice_index, 'axis': axis}
          for slice_type, slice_index, axis in (('All', None, (0, 1)),
                                                ('1st', 0, 0),
                                                ('2nd', 1, 0),
                                                ('3rd', 2, 0))]

selected_slice_conf = slices[0]

print('Embeddings will be executed by the following configs: \n{}'.format(selected_slice_conf))

#### READ EMBEDDINGS

In [None]:
# -------------------------- Paragraphs
embedded_paragraphs = read_embeddings(embedding_paragraph_file_as_h5py, 
                                      selected_slice_conf['slice_index'], 
                                      selected_slice_conf['axis'])
print('Paragraphs shape', embedded_paragraphs.shape)
embedded_paragraphs_means_with_all_slices = np.reshape(embedded_paragraphs, 
                                                      (embedded_paragraphs.shape[0], dims))
print('Paragraphs shape', embedded_paragraphs_means_with_all_slices.shape)
# -------------------------- Questions
embedded_questions = read_embeddings(embedding_question_file_as_h5py,
                                     selected_slice_conf['slice_index'], 
                                     selected_slice_conf['axis'])
print('Questions shape', embedded_questions.shape)
embedded_questions_means_with_all_slices = np.reshape(embedded_questions, 
                                                      (embedded_questions.shape[0], dims))
print('Questions shape', embedded_questions_means_with_all_slices.shape)

### FIND and DUMP NEAREST NEIGHBORS :

In [None]:
# Rank the paragraphs for every question (norm_type='l2' selects cosine similarity).
nearest_neighbors = finding_nearest_neighbors(
    embedded_paragraphs_means=embedded_paragraphs_means_with_all_slices,
    embedded_questions_means=embedded_questions_means_with_all_slices,
    questions=questions,
    paragraphs=paragraphs,
    norm_type='l2')

# Fill the results-file template with the dataset type and the slice label, then save.
dump_nearest_neighbors(
    nearest_neighbors=nearest_neighbors,
    file_path=nearest_all_cos_similarity_results_file.format(dataset_type,
                                                            selected_slice_conf['slice_type']))

In [None]:
# Notebook display expression: shows how many paragraphs are currently loaded.
len(paragraphs)