In [None]:
import os
import h5py
import pandas as pd
from tqdm import tqdm_notebook
import sys
import warnings
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [None]:
# --- Configuration -----------------------------------------------------------
# Dataset split to evaluate; set to 'train' to process the training split.
dataset_type = 'dev'
dataset_version = 'v1.1'

# Column of the QA csv that is promoted to the DataFrame index.
index_field = ['Unnamed: 0']

# required files
# The data root can be overridden with the SQUAD_BASEPATH environment variable
# so the notebook is runnable on other machines; the original location is kept
# as the default.
_basepath = os.environ.get(
    'SQUAD_BASEPATH',
    '/home/jackalhan/Development/github/more_meaningful_representations/squad/',
)
datadir = os.path.join(_basepath, dataset_type)
modeldir = os.path.join(_basepath, 'model')

_qas_file_name = '{}_qas.csv'.format(dataset_type)
qas_file = os.path.join(datadir, _qas_file_name)

# The '{}' placeholders below are filled in later with .format(...):
# the embedding files take the norm type, the result files take
# (dataset_type, norm_type).
_embedding_mean_paragraph_file_as_h5py_name = 'elmo_{}_mean_paragraph_embeddings.hdf5'
embedding_mean_paragraph_file_as_h5py = os.path.join(datadir, _embedding_mean_paragraph_file_as_h5py_name)

_embedding_mean_question_file_as_h5py_name = 'elmo_{}_mean_question_embeddings.hdf5'
embedding_mean_question_file_as_h5py = os.path.join(datadir, _embedding_mean_question_file_as_h5py_name)

_cos_similarity_results_file_name = '{}_cos_similarity_with_{}_norm_for_q_vs_para.csv'
cos_similarity_results_file_name = os.path.join(datadir, _cos_similarity_results_file_name)

_cos_similarity_results_as_hist_file_name = 'histogram_{}_cos_similarity_with_{}_norm_for_q_vs_para.png'
cos_similarity_results_as_hist_file = os.path.join(datadir, _cos_similarity_results_as_hist_file_name)


# Question/paragraph table; the similarity cell reads its 'Question_Id' and
# 'Paragraph_Id' columns.
df_qas = pd.read_csv(qas_file).set_index(index_field)

In [None]:
# Width of every embedding vector stored in the HDF5 files.
dims = 1024
items = [
    {'type': 'questions', 'source_file': embedding_mean_question_file_as_h5py},
    {'type': 'paragraphs', 'source_file': embedding_mean_paragraph_file_as_h5py},
]
rank_column = 'Order Index of Actual_Paragraph_Id in Similarities List'


def _ordered_keys(h5_file):
    """Return the dataset names of ``h5_file`` in the order rows should be stacked.

    h5py iterates names alphanumerically by default ('0', '1', '10', ..., '2'),
    which would scramble the rows when the names are numeric ids. If every name
    parses as an integer the names are sorted numerically; otherwise the file's
    own iteration order is kept unchanged.
    NOTE(review): assumes numeric names are the 0-based question/paragraph ids
    used in df_qas -- confirm against the code that wrote the files.
    """
    keys = list(h5_file.keys())
    try:
        return sorted(keys, key=int)
    except ValueError:
        return keys


def _load_embedding_matrix(path, n_dims):
    """Read every dataset in the HDF5 file at ``path`` into one (n_items, n_dims) array.

    The matrix is preallocated and filled row by row, instead of growing it with
    np.append (which recopies the whole matrix on every iteration).
    Raises ValueError (from reshape) if a dataset does not hold n_dims values.
    """
    with h5py.File(path, 'r') as fin:
        keys = _ordered_keys(fin)
        matrix = np.empty((len(keys), n_dims), dtype=float)
        for row, key in enumerate(tqdm_notebook(keys, total=len(keys))):
            matrix[row] = np.reshape(np.array(fin[key][...]), (n_dims,))
    return matrix


# Question_Id -> Paragraph_Id lookup, built once. The first occurrence wins,
# matching the previous per-question `.values[0]` selection.
paragraph_of_question = (
    df_qas.drop_duplicates(subset='Question_Id')
          .set_index('Question_Id')['Paragraph_Id']
)

for norm_type in ['l2']:
    print(10*'*', norm_type.upper(),'NORM', 10*'*')
    for vals in items:
        print(vals['type'], 'are getting processed!!!')
        vals['matrix'] = _load_embedding_matrix(vals['source_file'].format(norm_type), dims)

    QUES = items[0]['matrix']
    print('QUES Shape', QUES.shape)
    PARA = items[1]['matrix']
    print('PARA Shape', PARA.shape)
    print('Similarities are getting calculated !!!')
    results = []
    # Row position in QUES is used as the Question_Id, and row position in PARA
    # as the paragraph id.
    for q_id, q_vec in enumerate(tqdm_notebook(QUES, total=len(QUES))):
        sk_sim = cosine_similarity(q_vec.reshape(1, -1), PARA)[0]
        actual_paragraph_id = paragraph_of_question.loc[q_id]
        # Paragraph indices ordered from most to least similar.
        similarities = np.argsort(-sk_sim)
        # 1-based rank of the true paragraph within that ordering.
        order_of_the_actual_paragraph_id = np.where(similarities == actual_paragraph_id)[0][0] + 1
        calculated_most_similar_1_paragraph = similarities[0]
        results.append((
            q_id,
            actual_paragraph_id,
            order_of_the_actual_paragraph_id,
            sk_sim[actual_paragraph_id],
            calculated_most_similar_1_paragraph,
            sk_sim[calculated_most_similar_1_paragraph],
        ))

    df_results = pd.DataFrame(data=results, columns=['Question_Id', 'Actual_Paragraph_Id',
                                                     rank_column,
                                                     'Similarity Score for Actual_Paragraph_Id',
                                                     'Calculated Top 1 Most Similar Paragraph',
                                                     'Similarity Score for Most Similar Paragraph'
                                                     ])
    df_results.to_csv(cos_similarity_results_file_name.format(dataset_type, norm_type), index=False)

    # Nothing to plot (and min/max are undefined) when there are no questions.
    if len(df_results):
        ranks = df_results[rank_column]
        fig, ax = plt.subplots()
        # Bin edges run to max + 1 so every rank gets its own unit-wide bin; with
        # edges stopping at max, the last bin (closed on both sides) would merge
        # ranks max - 1 and max.
        ranks.hist(bins=range(ranks.min(), ranks.max() + 2), ax=ax)
        ax.set(xlabel='Rank of the actual paragraph', ylabel='Number of questions')
        fig.savefig(cos_similarity_results_as_hist_file.format(dataset_type, norm_type))

print('Done!')