In [17]:
import os
from collections import defaultdict
from tqdm import tqdm_notebook
import pandas as pd
import numpy as np
import h5py
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import sys
import spacy

# Shared spaCy pipeline created with spacy.blank("en"); word_tokenize reuses it
# for every call instead of building a new pipeline each time.
nlp = spacy.blank("en")


def word_tokenize(sent):
    """Return the text of every token that the module-level `nlp` yields for `sent`.

    Args:
        sent: The string to tokenize.

    Returns:
        A list of token strings, in document order.
    """
    return [tok.text for tok in nlp(sent)]

In [4]:

# required files
#
# NOTE(review): `dataset_type` and `index_field` are read below but are not
# defined anywhere in this cell; they must already exist in the kernel
# (presumably from an earlier configuration cell -- confirm), otherwise this
# cell raises NameError on a fresh "Restart & Run All".
#
# The base directory can be overridden with the MMR_SQUAD_BASEPATH environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original absolute path is used unchanged.
_basepath = os.environ.get(
    'MMR_SQUAD_BASEPATH',
    '/home/jackalhan/Development/github/more_meaningful_representations/squad/')
datadir = os.path.join(_basepath, dataset_type)

# Plain-text inputs: the full files and their 'sanity_'-prefixed counterparts.
_paragraphs_file_name_as_txt = '{}_paragraphs.txt'
paragraphs_file_as_txt = os.path.join(datadir, _paragraphs_file_name_as_txt.format(dataset_type))
sanity_paragraphs_file_as_txt = os.path.join(datadir, _paragraphs_file_name_as_txt.format('sanity_' + dataset_type ))

_questions_file_name_as_txt = '{}_questions.txt'
questions_file_as_txt = os.path.join(datadir, _questions_file_name_as_txt.format(dataset_type))
sanity_questions_file_as_txt = os.path.join(datadir, _questions_file_name_as_txt.format('sanity_' + dataset_type ))

_voc_file_name_as_txt = '{}_voc.txt'
voc_file_name_as_txt = os.path.join(datadir, _voc_file_name_as_txt.format('sanity_' + dataset_type ))

# _voc_counter_file_name_as_txt = '{}_voc_counter.txt'
# voc_counter_file_name = os.path.join(datadir, _voc_counter_file_name_as_txt.format('sanity_' + dataset_type ))

# Question/answer table for the 'sanity_' subset; loaded into df_qas at the end of this cell.
_qas_file_name = '{}_qas.csv'.format('sanity_' + dataset_type )
qas_file = os.path.join(datadir, _qas_file_name)

_glove_embed_file_name = 'glove.6B.300d.txt'
glove_embed_file = os.path.join(datadir, _glove_embed_file_name)

# HDF5 embedding files.
_embedding_paragraph_file_as_h5py_name = 'elmo_paragraph_embeddings.hdf5'
embedding_paragraph_file_as_h5py = os.path.join(datadir, _embedding_paragraph_file_as_h5py_name)

_embedding_question_file_as_h5py_name = 'elmo_question_embeddings.hdf5'
embedding_question_file_as_h5py = os.path.join(datadir, _embedding_question_file_as_h5py_name)

_embedding_mean_paragraph_file_as_h5py_name = 'elmo_mean_paragraph_embeddings.hdf5'
embedding_mean_paragraph_file_as_h5py = os.path.join(datadir, _embedding_mean_paragraph_file_as_h5py_name)

_embedding_mean_question_file_as_h5py_name = 'elmo_mean_question_embeddings.hdf5'
embedding_mean_question_file_as_h5py = os.path.join(datadir, _embedding_mean_question_file_as_h5py_name)

# Output path templates: the two '{}' placeholders are left unformatted here and
# are filled in later with .format(dataset_type, norm_type).
_cos_similarity_results_file_name =  '{}_cos_similarity_with_{}_norm_for_q_vs_para.csv'
cos_similarity_results_file_name = os.path.join(datadir, _cos_similarity_results_file_name)

_nearest_all_cos_similarity_results_file_name =  '{}_nearest_all_cos_similarity_with_{}_norm_for_q_vs_para.csv'
nearest_all_cos_similarity_results_file = os.path.join(datadir, _nearest_all_cos_similarity_results_file_name)

_cos_similarity_results_as_hist_file_name =  'histogram_{}_cos_similarity_with_{}_norm_for_q_vs_para.png'
cos_similarity_results_as_hist_file = os.path.join(datadir, _cos_similarity_results_as_hist_file_name)

# Question/answer table, indexed by the externally supplied `index_field` column.
df_qas = pd.read_csv(qas_file).set_index(index_field)

In [9]:
print('Obtaining paragraphs and questions')
paragraphs = []
questions = []
p_look_up = []
q_look_up = []
# The encoding is pinned to UTF-8 (the same encoding load_glove_weights uses for
# its input) so reading does not depend on the platform's default locale.
with open(sanity_paragraphs_file_as_txt, 'r', encoding='utf8') as fp_in, \
        open(sanity_questions_file_as_txt, 'r', encoding='utf8') as fq_in:
    for i, line in enumerate(fp_in):
        # The text kept for tokenization has the newline replaced by a space;
        # the look-up copy has the newline removed entirely.
        paragraphs.append(line.replace('\n', ' '))
        p_look_up.append((i, line.replace('\n','')))
    for i, line in enumerate(fq_in):
        questions.append(line.replace('\n', ' '))
        q_look_up.append((i, line.replace('\n','')))
print('Done')
# Look-up frames map the zero-based line number ('id') back to the raw text.
df_p_look_up = pd.DataFrame(data=p_look_up, columns=['id', 'paragraph']).set_index('id')
df_q_look_up = pd.DataFrame(data=q_look_up, columns=['id', 'question']).set_index('id')
tokenized_paragraphs = [word_tokenize(_) for _ in paragraphs]
tokenized_questions = [word_tokenize(_) for _ in questions]

Obtaining paragraphs and questions
Done


In [8]:
def load_glove_weights(gloveFile):
    """Load word vectors from a whitespace-separated text file into a dict.

    Each non-blank line is split on whitespace; the first field is used as the
    word and the remaining fields are parsed as floats.

    Blank lines are skipped. A line whose values cannot be parsed as floats is
    reported (word, then the offending fields) and skipped, so one bad line
    does not abort the load. Previously a bare ``except`` handled this and
    could itself fail or print stale data when the split produced no fields.

    Args:
        gloveFile: Path to the text file, read as UTF-8.

    Returns:
        dict mapping each word to a 1-D ``numpy`` array of its values. If a
        word occurs more than once, the last occurrence wins.
    """
    print ("Loading Glove Model")
    model = {}
    # Stream the file line by line instead of materialising it with readlines().
    with open(gloveFile, encoding="utf8" ) as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank / whitespace-only line: nothing to parse.
                continue
            word = splitLine[0]
            try:
                embedding = np.array([float(val) for val in splitLine[1:]])
            except ValueError:
                # Non-numeric component: report the line and keep going.
                print(word)
                print(splitLine[1:])
                continue
            model[word] = embedding
    print ("Done.",len(model)," words loaded!")
    return model
     
     
# Load the word vectors once at module level; analyze_embeddings reads this
# `weights` dict when it is called with embedding_type == 'glove'.
weights = load_glove_weights(glove_embed_file)

Loading Glove Model
Done. 400000  words loaded!


In [10]:
def analyze_embeddings(embedding_type):
    """Embed questions and paragraphs, then rank paragraphs for every question.

    The function relies on module-level state: ``weights``,
    ``tokenized_questions``, ``tokenized_paragraphs``, ``df_q_look_up``,
    ``df_p_look_up``, ``df_qas``, ``dataset_type`` and the output path
    variables/templates defined in the configuration cell.

    Steps:
      1. Build one vector per question / paragraph and write each row as its
         own float32 dataset (named by row index) into the corresponding
         HDF5 destination file.
      2. For each of the 'l1' and 'l2' settings, score every question against
         all paragraphs, record the rank of the paragraph listed for that
         question in ``df_qas``, and collect the five best-scoring paragraphs.
      3. Write two CSV files and one histogram PNG per setting.

    Args:
        embedding_type: Only ``'glove'`` (300 dimensions) is implemented.

    Raises:
        NotImplementedError: For any other ``embedding_type``. The original
            non-GloVe branch (1024 dimensions) had no body, so the function
            could not even be parsed; failing here, before any output file is
            opened in 'w' mode, avoids truncating existing results.
    """
    if embedding_type != 'glove':
        raise NotImplementedError(
            "analyze_embeddings only supports embedding_type='glove'; "
            "the 1024-dimensional branch was never implemented.")
    dims = 300

    items = [dict({'type':'Questions',
                   'matrix': np.empty((0, dims), dtype=float),
                   'source':tokenized_questions,
                   'destination_file': embedding_mean_question_file_as_h5py}),
             dict({'type':'Paragraphs',
                   'matrix': np.empty((0, dims), dtype=float),
                   'source': tokenized_paragraphs,
                   'destination_file': embedding_mean_paragraph_file_as_h5py})
             ]
    for vals in items:
        print(vals['type'], 'are getting processed!!!')
        rows = []
        for tokens in tqdm_notebook(vals['source'], total=len(vals['source'])):
            # NOTE(review): only the FIRST token's vector represents the whole
            # text, although the destination files are named "mean" embeddings.
            # Behaviour is kept as-is -- confirm whether a mean was intended.
            try:
                vec = np.array(weights[tokens[0]])
            except (KeyError, IndexError):
                # Unknown first token, or an empty token list: use 'unk'.
                vec = np.array(weights['unk'])
            rows.append(np.reshape(vec, (1, dims)))
        # One concatenate instead of np.append per row (which re-copies the
        # whole matrix every iteration). Starting from the empty (0, dims)
        # matrix keeps the shape correct when there are no rows at all.
        vals['matrix'] = np.concatenate([vals['matrix']] + rows, axis=0)

        with h5py.File(vals['destination_file'], 'w') as fout:
            for i, row in enumerate(tqdm_notebook(vals['matrix'], total=len(vals['matrix']))):
                fout.create_dataset(
                    '{}'.format(i),
                    row.shape, dtype='float32',
                    data=row)

    print('Similarities are getting calculated !!!')
    QUES = items[0]['matrix']
    print('QUES Shape', QUES.shape)
    PARA = items[1]['matrix']
    print('PARA Shape', PARA.shape)

    previous_fig = None
    for norm_type in ['l1', 'l2']:
        print(10*'*', norm_type.upper(),'NORM', 10*'*')
        results = []
        nearest_paragraphs = []

        # The L1-normalised paragraph matrix does not depend on the question,
        # so compute it once per setting rather than once per question.
        # Skipped when there are no questions, matching the original, which
        # only normalised inside the per-question loop.
        p_l1 = None
        if norm_type == 'l1' and len(QUES) > 0:
            p_l1 = normalize(PARA, norm='l1', axis=1)

        for q_id, q_row in enumerate(tqdm_notebook(QUES, total=len(QUES))):
            question = df_q_look_up.loc[q_id, 'question']
            q_vec = np.array([q_row])
            if norm_type =='l2':
                sk_sim = cosine_similarity(q_vec,PARA)[0]
            else:
                # Dot product of L1-normalised vectors (labelled
                # 'cos_similarity' in the output files for both settings).
                q_ = normalize(q_vec, norm='l1', axis=1)
                sk_sim = np.dot(q_, p_l1.T)[0]

            actual_paragraph_id = df_qas[df_qas['Question_Id'] == q_id]['Paragraph_Id'].values[0]
            # Paragraph ids ordered from most to least similar.
            similarities = np.argsort(-sk_sim)
            # 1-based position of the expected paragraph in that ordering.
            order_of_the_actual_paragraph_id = np.where(similarities == actual_paragraph_id)[0][0] + 1
            calculated_most_similar_1_paragraph = similarities[0]
            results.append((q_id, actual_paragraph_id,
                            order_of_the_actual_paragraph_id,
                            sk_sim[actual_paragraph_id],
                            calculated_most_similar_1_paragraph,
                            sk_sim[calculated_most_similar_1_paragraph]))
            for i, nearest_paragraph_id in enumerate(similarities[0:5]):
                nearest_paragraphs.append((question,
                                           df_p_look_up.loc[nearest_paragraph_id, 'paragraph'],
                                           i+1,
                                           sk_sim[nearest_paragraph_id] ))

        df_nearest_paragraphs = pd.DataFrame(data=nearest_paragraphs, columns=['question', 'paragraph', 'nearest_order', 'cos_similarity'])
        df_nearest_paragraphs.to_csv(nearest_all_cos_similarity_results_file.format(dataset_type, norm_type), index=False)

        df_results= pd.DataFrame(data=results, columns=['Question_Id', 'Actual_Paragraph_Id',
                                             'Order Index of Actual_Paragraph_Id in Similarities List',
                                             'Similarity Score for Actual_Paragraph_Id',
                                             'Calculated Top 1 Most Similar Paragraph',
                                             'Similarity Score for Most Similar Paragraph'
                                            ])
        df_results.to_csv(cos_similarity_results_file_name.format(dataset_type, norm_type), index=False)

        # Series.hist() draws on the current figure when no axes are passed, so
        # without clearing, the second setting's histogram would be drawn on
        # top of the first one's bars and saved that way.
        if previous_fig is not None:
            previous_fig.clf()
        ax = df_results['Order Index of Actual_Paragraph_Id in Similarities List'].hist()
        fig = ax.get_figure()
        fig.savefig(cos_similarity_results_as_hist_file.format(dataset_type, norm_type))
        previous_fig = fig

Questions are getting processed!!!






Paragraphs are getting processed!!!






Similarities are getting calculated !!!
QUES Shape (10000, 300)
PARA Shape (10000, 300)
********** L1 NORM **********



********** L2 NORM **********



