In [None]:
import os
import h5py
from tqdm import tqdm_notebook
import warnings
import numpy as np
from sklearn.preprocessing import normalize
warnings.filterwarnings('ignore')

In [None]:
# Which SQuAD split to process. Set to 'train' to process the training split instead.
dataset_type = 'dev'
dataset_version = 'v1.1'
index_field = ['Unnamed: 0']

# Directory layout. NOTE(review): _basepath is an absolute path on one
# machine; adjust it before running this notebook anywhere else.
_basepath = '/home/jackalhan/Development/github/more_meaningful_representations/squad/'
datadir = os.path.join(_basepath, dataset_type)
modeldir = os.path.join(_basepath, 'model')

# Input files: the raw ELMo embeddings read by the averaging cell.
_embedding_paragraph_file_as_h5py_name = 'elmo_paragraph_embeddings.hdf5'
_embedding_question_file_as_h5py_name = 'elmo_question_embeddings.hdf5'

# Output files: the '{}' placeholder is filled in with the norm type
# (via str.format) when the normalised mean embeddings are written.
_embedding_mean_paragraph_file_as_h5py_name = 'elmo_{}_mean_paragraph_embeddings.hdf5'
_embedding_mean_question_file_as_h5py_name = 'elmo_{}_mean_question_embeddings.hdf5'

# Full paths: every file lives directly inside the selected split's directory.
(embedding_paragraph_file_as_h5py,
 embedding_question_file_as_h5py,
 embedding_mean_paragraph_file_as_h5py,
 embedding_mean_question_file_as_h5py) = (
    os.path.join(datadir, file_name)
    for file_name in (
        _embedding_paragraph_file_as_h5py_name,
        _embedding_question_file_as_h5py_name,
        _embedding_mean_paragraph_file_as_h5py_name,
        _embedding_mean_question_file_as_h5py_name,
    )
)


In [None]:
# Width of one embedding vector; every averaged vector is reshaped to (1, dims).
dims = 1024


def _numeric_key_order(name):
    """Sort key that orders purely numeric dataset names by integer value.

    h5py yields the members of a file in name (string) order by default, so
    '10' comes before '2'. Sorting with this key restores 0, 1, 2, ... order.
    Names that are not purely numeric are kept and placed after the numeric
    ones, in plain string order.
    """
    if name.isdecimal():
        return (0, int(name))
    return (1, name)


# One entry per embedding collection. 'matrix' starts empty and ends up holding
# one averaged row per dataset found in 'source_file'.
items = [
    {
        'type': 'Questions',
        'matrix': np.empty((0, dims), dtype=float),
        'source_file': embedding_question_file_as_h5py,
        'destination_file': embedding_mean_question_file_as_h5py,
    },
    {
        'type': 'Paragraphs',
        'matrix': np.empty((0, dims), dtype=float),
        'source_file': embedding_paragraph_file_as_h5py,
        'destination_file': embedding_mean_paragraph_file_as_h5py,
    },
]

for vals in items:
    print(vals['type'], 'are getting processed!!!')
    mean_rows = []
    with h5py.File(vals['source_file'], 'r') as fin:
        # Visit datasets in numeric order so that row i of the matrix belongs
        # to dataset 'i'; the rows are later written back under their row
        # index, so string order would silently permute the embeddings.
        ordered_names = sorted(fin.keys(), key=_numeric_key_order)
        for name in tqdm_notebook(ordered_names, total=len(ordered_names)):
            vec = np.array(fin[name][...])
            # Average over the first two axes, leaving `dims` values.
            # Assumes each dataset is laid out as (layers, tokens, dims) -- TODO confirm.
            mean_vector = np.apply_over_axes(np.mean, vec, (0, 1))
            mean_rows.append(np.reshape(mean_vector, (1, dims)))
    # Concatenate once instead of np.append per item (which re-copies the whole
    # matrix every iteration). Starting from the empty float matrix keeps the
    # result dtype unchanged and also covers a source file with no datasets.
    vals['matrix'] = np.concatenate([vals['matrix']] + mean_rows, axis=0)
                

In [None]:
# For each norm, normalise the averaged embeddings and write them to the
# destination file whose '{}' placeholder is filled with the norm type.
for norm_type in ('l1', 'l2'):
    banner = '*' * 10
    print(banner, norm_type.upper(), 'NORM', banner)

    # items[0] holds the question matrix, items[1] the paragraph matrix.
    QUES_Mean, PARA_Mean = items[0]['matrix'], items[1]['matrix']
    # sklearn's normalize scales each row independently (default axis=1).
    QUES_Norms = normalize(QUES_Mean, norm=norm_type)
    PARA_Norms = normalize(PARA_Mean, norm=norm_type)
    items[0]['norm_matrix'], items[1]['norm_matrix'] = QUES_Norms, PARA_Norms

    for vals in items:
        print(vals['type'], 'are getting processed!!!')
        target_path = vals['destination_file'].format(norm_type)
        normalized_rows = vals['norm_matrix']
        # Mode 'w' creates the file, truncating any existing one.
        with h5py.File(target_path, 'w') as fout:
            progress = tqdm_notebook(normalized_rows, total=len(normalized_rows))
            for i, row in enumerate(progress):
                # One float32 dataset per row, named after the row index.
                fout.create_dataset(str(i), row.shape, dtype='float32', data=row)
        