In [21]:
import os
import h5py
from tqdm import tqdm_notebook
import warnings
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
warnings.filterwarnings('ignore')

In [22]:
#dataset_type = 'train'
dataset_type = 'dev'
dataset_version = 'v1.1'
index_field = ['Unnamed: 0']

# required files
_basepath = '/home/jackalhan/Development/github/more_meaningful_representations/squad/'
datadir = os.path.join(_basepath, dataset_type)
modeldir = os.path.join(_basepath, 'model')

_embedding_paragraph_file_as_h5py_name = 'elmo_paragraph_embeddings.hdf5'
embedding_paragraph_file_as_h5py = os.path.join(datadir, _embedding_paragraph_file_as_h5py_name)

_embedding_question_file_as_h5py_name = 'elmo_question_embeddings.hdf5'
embedding_question_file_as_h5py = os.path.join(datadir, _embedding_question_file_as_h5py_name)

_embedding_mean_paragraph_file_as_h5py_name = 'elmo_mean_paragraph_embeddings.hdf5'
embedding_mean_paragraph_file_as_h5py = os.path.join(datadir, _embedding_mean_paragraph_file_as_h5py_name)

_embedding_mean_question_file_as_h5py_name = 'elmo_mean_question_embeddings.hdf5'
embedding_mean_question_file_as_h5py = os.path.join(datadir, _embedding_mean_question_file_as_h5py_name)

_qas_file_name = '{}_qas.csv'.format(dataset_type)
qas_file = os.path.join(datadir, _qas_file_name)

_cos_similarity_results_file_name =  '{}_cos_similarity_with_{}_norm_for_q_vs_para.csv'
cos_similarity_results_file_name = os.path.join(datadir, _cos_similarity_results_file_name)

_nearest_all_cos_similarity_results_file_name =  '{}_nearest_all_cos_similarity_with_{}_norm_for_q_vs_para.csv'
nearest_all_cos_similarity_results_file = os.path.join(datadir, _nearest_all_cos_similarity_results_file_name)

_cos_similarity_results_as_hist_file_name =  'histogram_{}_cos_similarity_with_{}_norm_for_q_vs_para.png'
cos_similarity_results_as_hist_file = os.path.join(datadir, _cos_similarity_results_as_hist_file_name)

_paragraphs_file_name_as_txt = '{}_paragraphs.txt'.format(dataset_type)
paragraphs_file_as_txt = os.path.join(datadir, _paragraphs_file_name_as_txt)

_questions_file_name_as_txt = '{}_questions.txt'.format(dataset_type)
questions_file_as_txt = os.path.join(datadir, _questions_file_name_as_txt)

df_qas = pd.read_csv(qas_file).set_index(index_field)


In [23]:
p_look_up = []
q_look_up = []
with open(paragraphs_file_as_txt, 'r') as fp_in,open(questions_file_as_txt, 'r') as fq_in:
    for i, line in enumerate(fp_in):
        p_look_up.append((i, line.replace('\n','')))
    for i, line in enumerate(fq_in):
        q_look_up.append((i, line.replace('\n','')))
df_p_look_up = pd.DataFrame(data=p_look_up, columns=['id', 'paragraph']).set_index('id')
df_q_look_up = pd.DataFrame(data=q_look_up, columns=['id', 'question']).set_index('id')

In [27]:
dims = 1024
items = [dict({'type':'Questions', 
                     'matrix': np.empty((0, dims), dtype=float),
                     'source_file':embedding_question_file_as_h5py,
                     'destination_file': embedding_mean_question_file_as_h5py}), 
              dict({'type':'Paragraphs', 
                     'matrix': np.empty((0, dims), dtype=float),
                     'source_file':embedding_paragraph_file_as_h5py,
                     'destination_file': embedding_mean_paragraph_file_as_h5py})
              ]
for vals in items:
    print(vals['type'], 'are getting processed!!!')    
    with h5py.File(vals['source_file'], 'r') as fin, h5py.File(vals['destination_file'], 'w') as fout:        
        for _ in tqdm_notebook(fin, total=len(fin)):            
            vec = np.array(fin[str(_)][...])
            print(vec)            
            print('ayy')
            mean_vector = np.apply_over_axes(np.mean, vec, (0,1))
            print(mean_vector)
            print('uff')
            reshaped_mean_vector = np.reshape(mean_vector, (1,dims))
            print(reshaped_mean_vector)
            print('-------------')
            vals['matrix'] = np.append(vals['matrix'], reshaped_mean_vector, axis=0)
#         for i, _ in enumerate(tqdm_notebook(vals['matrix'], total=len(vals['matrix']))):
#                 ds = fout.create_dataset(
#                                 '{}'.format(i),
#                                 _.shape, dtype='float32',
#                                 data=_)  
    
print('Similarities are getting calculated !!!')
QUES = items[0]['matrix']
print('QUES Shape', QUES.shape)
PARA = items[1]['matrix']
print('PARA Shape', PARA.shape)
for norm_type in ['l2']:
    print(10*'*', norm_type.upper(),'NORM', 10*'*')
    results = []
    nearest_paragraphs = []
    for q_id, _ in enumerate(tqdm_notebook(QUES, total=len(QUES))):
        question = df_q_look_up[df_q_look_up.index == q_id].values[0][0]
        q_vec = np.array([_]) 
        if (norm_type =='l2'):
            sk_sim = cosine_similarity(q_vec,PARA)[0]
        else :
            q_ = normalize(q_vec, norm='l1', axis=1)
            p_ = normalize(PARA, norm='l1', axis=1)
            sk_sim = np.dot(q_, p_.T)[0]

        actual_paragraph_id = df_qas[df_qas['Question_Id'] == q_id]['Paragraph_Id'].values[0]
        similarities = np.argsort(-sk_sim)
        order_of_the_actual_paragraph_id = np.where(similarities == actual_paragraph_id)[0][0] + 1
        calculated_most_similar_1_paragraph = similarities[0]
        results.append((q_id, actual_paragraph_id,  
                        order_of_the_actual_paragraph_id, 
                        sk_sim[actual_paragraph_id], 
                        calculated_most_similar_1_paragraph, 
                        sk_sim[calculated_most_similar_1_paragraph]))
        for i, nearest_paragraph_id in enumerate(similarities[0:5]):
            nearest_paragraphs.append((question, 
                                       df_p_look_up[df_p_look_up.index == nearest_paragraph_id].values[0][0],
                                       i+1, 
                                       sk_sim[nearest_paragraph_id] ))

    df_nearest_paragraphs = pd.DataFrame(data=nearest_paragraphs, columns=['question', 'paragraph', 'nearest_order', 'cos_similarity'])
    df_nearest_paragraphs.to_csv(nearest_all_cos_similarity_results_file.format(dataset_type, norm_type), index=False)

    df_results= pd.DataFrame(data=results, columns=['Question_Id', 'Actual_Paragraph_Id', 
                                         'Order Index of Actual_Paragraph_Id in Similarities List',
                                         'Similarity Score for Actual_Paragraph_Id',
                                         'Calculated Top 1 Most Similar Paragraph', 
                                         'Similarity Score for Most Similar Paragraph'
                                        ])
    df_results.to_csv(cos_similarity_results_file_name.format(dataset_type, norm_type), index=False)
    ax = df_results['Order Index of Actual_Paragraph_Id in Similarities List'].hist()
    fig = ax.get_figure()
    fig.savefig(cos_similarity_results_as_hist_file.format(dataset_type, norm_type))

Questions are getting processed!!!


[[[-0.8259065   0.19364136 -0.28518218 ...  0.590693    0.05137825
   -0.09093938]]

 [[ 0.8167018  -0.2801803   0.3755971  ...  0.01790142  0.13703534
    0.51969975]]

 [[ 0.7945445  -0.22229692  0.5315265  ... -0.0280595   0.367819
    0.5319332 ]]]
ayy
[[[ 0.26177993 -0.10294529  0.20731382 ...  0.19351165  0.18541086
    0.3202312 ]]]
uff
[[ 0.26177993 -0.10294529  0.20731382 ...  0.19351165  0.18541086
   0.3202312 ]]
-------------
[[[-0.25635523  0.45522204  0.40113887 ... -0.45845458 -0.47125745
   -0.47618663]]

 [[ 0.39421466 -0.2669996   0.41345817 ...  0.3855992   0.21159127
    0.72238463]]

 [[ 0.21787846 -0.5786756   0.0660941  ...  0.24573655  0.14450322
    0.60579574]]]
ayy
[[[ 0.1185793  -0.13015106  0.2935637  ...  0.05762706 -0.03838765
    0.28399792]]]
uff
[[ 0.1185793  -0.13015106  0.2935637  ...  0.05762706 -0.03838765
   0.28399792]]
-------------
[[[-0.34636047  0.22640544 -0.08570883 ... -1.0614543  -0.43690923
    0.6313253 ]]

 [[ 0.53537     0.35064304  0

   0.23180257]]
-------------
[[[ 0.8119687   0.85612273  0.3293768  ...  0.14877313 -0.11275369
    0.35567313]]

 [[ 0.34776562 -0.07757763  0.86187655 ...  0.27514005 -0.4316073
    0.3280981 ]]

 [[ 0.23384216 -0.3582257   1.6552728  ...  0.31925178 -0.40150878
   -0.08779073]]]
ayy
[[[ 0.46452546  0.14010645  0.94884205 ...  0.24772166 -0.31528994
    0.19866018]]]
uff
[[ 0.46452546  0.14010645  0.94884205 ...  0.24772166 -0.31528994
   0.19866018]]
-------------
[[[-0.9053306   0.3151867   0.49663118 ... -0.22823447  0.33448258
   -0.06766074]]

 [[ 0.8631254  -0.53751045 -0.32506806 ...  0.5294463  -0.30590913
    0.30341136]]

 [[ 0.50124687 -0.7166698   0.01302952 ...  0.99192595  0.48534647
    0.02641159]]]
ayy
[[[ 0.15301389 -0.31299785  0.06153088 ...  0.43104592  0.17130665
    0.08738741]]]
uff
[[ 0.15301389 -0.31299785  0.06153088 ...  0.43104592  0.17130665
   0.08738741]]
-------------
[[[-0.21014531  1.0118142   0.42263913 ... -0.998052    0.43644702
   -0.55887   ]]

[[[-0.75158036  0.3836147  -0.28643066 ... -0.9328016  -0.17229983
    0.77939487]]

 [[ 0.29890934 -0.21550214 -0.12468352 ...  0.156547   -0.06605773
    0.21858805]]

 [[ 0.6563289  -0.38753197 -0.63600147 ... -0.13243763 -0.22019525
    0.33036324]]]
ayy
[[[ 0.06788597 -0.07313981 -0.34903857 ... -0.30289742 -0.15285094
    0.44278204]]]
uff
[[ 0.06788597 -0.07313981 -0.34903857 ... -0.30289742 -0.15285094
   0.44278204]]
-------------
[[[-0.25669065  0.09095695  0.5469918  ...  0.72749186  0.45538136
    0.5383809 ]]

 [[ 0.18068805 -0.35556966  0.59390277 ...  0.06405677  0.01621072
    0.15679124]]

 [[-0.43178985 -1.136986    0.51839644 ...  0.3427499   0.80879694
   -0.24996719]]]
ayy
[[[-0.16926415 -0.46719956  0.553097   ...  0.37809953  0.42679635
    0.14840166]]]
uff
[[-0.16926415 -0.46719956  0.553097   ...  0.37809953  0.42679635
   0.14840166]]
-------------
[[[ 0.09848654 -0.7764214   0.90015334 ... -0.22772883  0.9854671
    0.20182599]]

 [[ 0.31847948  0.16166857  

[[[ 0.81474894 -0.5587697   0.27217457 ... -0.37284583 -0.30916554
    0.52702296]]

 [[ 0.38129276  0.61776626  0.44995537 ...  0.3458403  -0.01796425
    0.14840302]]

 [[-0.06474924  1.1781502   0.12362093 ...  0.50064313  0.24572352
    0.05613561]]]
ayy
[[[ 0.3770975   0.41238225  0.28191695 ...  0.1578792  -0.02713542
    0.24385387]]]
uff
[[ 0.3770975   0.41238225  0.28191695 ...  0.1578792  -0.02713542
   0.24385387]]
-------------
[[[-0.44718015 -0.11505229 -0.92211694 ... -0.18044648 -0.18933576
   -0.56305075]]

 [[-0.5869497  -0.28286478  0.02640047 ... -0.05939831  0.13535513
    0.17947066]]

 [[-1.2983918  -0.6689843   0.07075165 ...  0.07367443  0.66422695
    0.1227817 ]]]
ayy
[[[-0.77750725 -0.35563377 -0.27498826 ... -0.05539012  0.20341544
   -0.0869328 ]]]
uff
[[-0.77750725 -0.35563377 -0.27498826 ... -0.05539012  0.20341544
  -0.0869328 ]]
-------------
[[[ 0.1980944   0.62874806  0.32790843 ... -0.00608124 -0.4846791
    0.1763828 ]]

 [[ 0.23504719 -0.17761101 -

[[[ 0.41930512 -0.58869654  0.38596937 ... -0.27874488  0.25818735
    0.22228435]]

 [[ 0.46802512  0.44450033 -0.6043503  ...  0.3020683  -0.25811863
    0.18082464]]

 [[-0.6259289   0.45826352 -0.5069939  ...  0.40601984  0.41989547
   -0.29216856]]]
ayy
[[[ 0.08713379  0.1046891  -0.2417916  ...  0.14311442  0.13998806
    0.03698014]]]
uff
[[ 0.08713379  0.1046891  -0.2417916  ...  0.14311442  0.13998806
   0.03698014]]
-------------
[[[-0.21056125 -0.28605065 -0.18912336 ... -0.6890296   0.23384687
   -0.12390494]]

 [[ 0.31549346  0.5700311   0.0588384  ...  0.43521568 -0.29629502
    0.36855567]]

 [[ 0.46396476  1.1657236  -1.0300292  ...  0.60697734 -0.31126297
    0.4114432 ]]]
ayy
[[[ 0.18963234  0.48323467 -0.38677135 ...  0.11772115 -0.12457037
    0.21869798]]]
uff
[[ 0.18963234  0.48323467 -0.38677135 ...  0.11772115 -0.12457037
   0.21869798]]
-------------
[[[ 0.2283901  -0.17525445 -1.3557543  ... -0.02327707 -0.54339385
   -0.28821886]]

 [[ 0.26836905 -0.05655807 

KeyboardInterrupt: 