In [None]:
import os
import warnings

import h5py
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from tqdm import tqdm_notebook

warnings.filterwarnings('ignore')

In [None]:
# Which SQuAD split to process; set to 'train' to process the training split.
dataset_type = 'dev'
dataset_version = 'v1.1'
# Column(s) of the QA csv that become the DataFrame index when the csv is loaded.
index_field = ['Unnamed: 0']

# required files
# Root folder of the SQuAD working data. It can be overridden with the
# SQUAD_BASEPATH environment variable so the notebook is not tied to one
# machine; when the variable is unset the original location is used.
_basepath = os.environ.get(
    'SQUAD_BASEPATH',
    '/home/jackalhan/Development/github/more_meaningful_representations/squad/')
datadir = os.path.join(_basepath, dataset_type)
modeldir = os.path.join(_basepath, 'model')

# Per-item embeddings (input) ...
_embedding_paragraph_file_as_h5py_name = 'elmo_paragraph_embeddings.hdf5'
embedding_paragraph_file_as_h5py = os.path.join(datadir, _embedding_paragraph_file_as_h5py_name)

_embedding_question_file_as_h5py_name = 'elmo_question_embeddings.hdf5'
embedding_question_file_as_h5py = os.path.join(datadir, _embedding_question_file_as_h5py_name)

# ... and the mean embeddings derived from them (output).
_embedding_mean_paragraph_file_as_h5py_name = 'elmo_mean_paragraph_embeddings.hdf5'
embedding_mean_paragraph_file_as_h5py = os.path.join(datadir, _embedding_mean_paragraph_file_as_h5py_name)

_embedding_mean_question_file_as_h5py_name = 'elmo_mean_question_embeddings.hdf5'
embedding_mean_question_file_as_h5py = os.path.join(datadir, _embedding_mean_question_file_as_h5py_name)

# Question/paragraph pairing table for the selected split.
_qas_file_name = '{}_qas.csv'.format(dataset_type)
qas_file = os.path.join(datadir, _qas_file_name)

# Result files. These three are *templates*: the two '{}' placeholders are
# filled later with (dataset_type, norm_type) via str.format.
# NOTE: despite its name, cos_similarity_results_file_name holds a full path.
_cos_similarity_results_file_name =  '{}_cos_similarity_with_{}_norm_for_q_vs_para.csv'
cos_similarity_results_file_name = os.path.join(datadir, _cos_similarity_results_file_name)

_nearest_all_cos_similarity_results_file_name =  '{}_nearest_all_cos_similarity_with_{}_norm_for_q_vs_para.csv'
nearest_all_cos_similarity_results_file = os.path.join(datadir, _nearest_all_cos_similarity_results_file_name)

_cos_similarity_results_as_hist_file_name =  'histogram_{}_cos_similarity_with_{}_norm_for_q_vs_para.png'
cos_similarity_results_as_hist_file = os.path.join(datadir, _cos_similarity_results_as_hist_file_name)

# Plain-text inputs, read line by line to build the id -> text lookups.
_paragraphs_file_name_as_txt = '{}_paragraphs.txt'.format(dataset_type)
paragraphs_file_as_txt = os.path.join(datadir, _paragraphs_file_name_as_txt)

_questions_file_name_as_txt = '{}_questions.txt'.format(dataset_type)
questions_file_as_txt = os.path.join(datadir, _questions_file_name_as_txt)

# Load the question/paragraph pairing table and index it by `index_field`.
df_qas = (
    pd.read_csv(qas_file)
    .set_index(index_field)
)


In [None]:
# Build id -> text lookup tables. The id of a paragraph/question is the
# zero-based number of the line it occupies in its text file; newline
# characters are stripped from the stored text.
with open(paragraphs_file_as_txt, 'r') as fp_in, open(questions_file_as_txt, 'r') as fq_in:
    p_look_up = [(line_no, text.replace('\n', '')) for line_no, text in enumerate(fp_in)]
    q_look_up = [(line_no, text.replace('\n', '')) for line_no, text in enumerate(fq_in)]

df_p_look_up = pd.DataFrame(p_look_up, columns=['id', 'paragraph']).set_index('id')
df_q_look_up = pd.DataFrame(q_look_up, columns=['id', 'question']).set_index('id')

In [None]:
dims = 1024  # length of one embedding vector; every mean embedding is reshaped to (1, dims)


def _ordered_dataset_keys(h5_file):
    """Return the dataset names of ``h5_file`` in the order rows are stacked.

    h5py iterates the names in a group alphanumerically by default
    ('0', '1', '10', '11', ..., '2'), so simply iterating the file would put
    dataset '10' in row 2 of the stacked matrix. The code that consumes the
    matrix uses the row index as the question/paragraph id, so when every
    name is a plain non-negative integer the names are returned in numeric
    order. Otherwise the file's own iteration order is returned unchanged.
    """
    keys = list(h5_file.keys())
    if all(key.isdecimal() for key in keys):
        return sorted(keys, key=int)
    return keys


# One entry per kind of text: where its per-item embeddings are read from,
# where the mean embeddings are written to, and the (n_items, dims) matrix of
# mean embeddings that is filled in below.
items = [
    {'type': 'Questions',
     'matrix': np.empty((0, dims), dtype=float),
     'source_file': embedding_question_file_as_h5py,
     'destination_file': embedding_mean_question_file_as_h5py},
    {'type': 'Paragraphs',
     'matrix': np.empty((0, dims), dtype=float),
     'source_file': embedding_paragraph_file_as_h5py,
     'destination_file': embedding_mean_paragraph_file_as_h5py},
]
for vals in items:
    print(vals['type'], 'are getting processed!!!')
    with h5py.File(vals['source_file'], 'r') as fin, \
            h5py.File(vals['destination_file'], 'w') as fout:
        keys = _ordered_dataset_keys(fin)
        mean_rows = []
        for key in tqdm_notebook(keys, total=len(keys)):
            vec = np.array(fin[key][...])
            # Average over the first two axes; the reshape below requires that
            # exactly `dims` values remain.
            # (presumably axes 0/1 are ELMo layers/tokens -- TODO confirm)
            mean_vector = np.apply_over_axes(np.mean, vec, (0, 1))
            mean_rows.append(np.reshape(mean_vector, (1, dims)))
        # A single concatenate instead of np.append per row, which copied the
        # whole matrix on every iteration. Starting from the empty float
        # matrix keeps the resulting dtype the same as before.
        vals['matrix'] = np.concatenate([vals['matrix']] + mean_rows, axis=0)
        # Persist each mean vector under its row index as the dataset name.
        for i, row in enumerate(tqdm_notebook(vals['matrix'], total=len(vals['matrix']))):
            fout.create_dataset('{}'.format(i), row.shape, dtype='float32', data=row)
    
# Rank every paragraph against every question by cosine similarity of the mean
# embeddings, once with L1-normalised and once with L2-normalised vectors, and
# write per-question results, the top-5 nearest paragraphs and a histogram of
# the rank of the correct paragraph for each norm.
print('Similarities are getting calculated !!!')
QUES = items[0]['matrix']
print('QUES Shape', QUES.shape)
PARA = items[1]['matrix']
print('PARA Shape', PARA.shape)

# Paragraph id for each question id, built once instead of scanning df_qas
# with a boolean mask for every question. If a question id occurs on several
# rows the first row wins, which is what the mask + `.values[0]` selected.
paragraph_id_by_question = (
    df_qas.drop_duplicates(subset='Question_Id')
    .set_index('Question_Id')['Paragraph_Id']
)
order_column = 'Order Index of Actual_Paragraph_Id in Similarities List'
para_l1 = None  # L1-normalised PARA; computed lazily once, it does not depend on the question
fig = None      # figure of the previous norm's histogram, if any
for norm_type in ['l1', 'l2']:
    print(10*'*', norm_type.upper(),'NORM', 10*'*')
    results = []
    nearest_paragraphs = []
    # The row index of QUES is used as the question id.
    for q_id, q_row in enumerate(tqdm_notebook(QUES, total=len(QUES))):
        question = df_q_look_up.at[q_id, 'question']
        q_vec = np.array([q_row])
        if norm_type == 'l2':
            sk_sim = cosine_similarity(q_vec, PARA)[0]
        else:
            if para_l1 is None:
                para_l1 = normalize(PARA, norm='l1', axis=1)
            q_l1 = normalize(q_vec, norm='l1', axis=1)
            sk_sim = np.dot(q_l1, para_l1.T)[0]

        actual_paragraph_id = paragraph_id_by_question.loc[q_id]
        # Paragraph ids ordered from most to least similar.
        similarities = np.argsort(-sk_sim)
        # 1-based rank of the correct paragraph in that ordering.
        order_of_the_actual_paragraph_id = np.where(similarities == actual_paragraph_id)[0][0] + 1
        calculated_most_similar_1_paragraph = similarities[0]
        results.append((q_id, actual_paragraph_id,
                        order_of_the_actual_paragraph_id,
                        sk_sim[actual_paragraph_id],
                        calculated_most_similar_1_paragraph,
                        sk_sim[calculated_most_similar_1_paragraph]))
        # Keep the text of the five nearest paragraphs for manual inspection.
        for i, nearest_paragraph_id in enumerate(similarities[0:5]):
            nearest_paragraphs.append((question,
                                       df_p_look_up.at[nearest_paragraph_id, 'paragraph'],
                                       i+1,
                                       sk_sim[nearest_paragraph_id]))

    df_nearest_paragraphs = pd.DataFrame(data=nearest_paragraphs, columns=['question', 'paragraph', 'nearest_order', 'cos_similarity'])
    df_nearest_paragraphs.to_csv(nearest_all_cos_similarity_results_file.format(dataset_type, norm_type), index=False)

    df_results = pd.DataFrame(data=results, columns=['Question_Id', 'Actual_Paragraph_Id',
                                                     order_column,
                                                     'Similarity Score for Actual_Paragraph_Id',
                                                     'Calculated Top 1 Most Similar Paragraph',
                                                     'Similarity Score for Most Similar Paragraph'
                                                     ])
    df_results.to_csv(cos_similarity_results_file_name.format(dataset_type, norm_type), index=False)
    # Series.hist() draws on the current axes when no `ax` is given, so the
    # previous norm's figure is cleared first; otherwise the second histogram
    # would be drawn on top of the first and saved that way.
    if fig is not None:
        fig.clf()
    ax = df_results[order_column].hist()
    fig = ax.get_figure()
    fig.savefig(cos_similarity_results_as_hist_file.format(dataset_type, norm_type))