In [1]:
import datetime
import os
from collections import defaultdict
import pickle
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import json
import h5py
from sklearn.feature_extraction.text import TfidfVectorizer
from shutil import copyfile
nlp = spacy.blank("en")
encoding="utf-8"
tokenize = lambda doc: [token.text for token in nlp(doc)]
def word_tokenize(sent):
    doc = nlp(sent)
    return [token.text for token in doc]

dataset_type = 'dev'
dataset_version = 'v1.1'

_basepath = '/home/jackalhan/Development/github/more_meaningful_representations/squad/'
datadir = os.path.join(_basepath, dataset_type)
pre_trained_dir = os.path.join(_basepath, 'GLOVE', 'data')
_paragraphs_file_name = '{}_glove_paragraphs.txt'.format(dataset_type)
paragraphs_file = os.path.join(datadir, _paragraphs_file_name)

_paragraph_embeddings_file_name = '{}_glove_paragraph_embeddings.hdf5'.format(dataset_type)
paragraph_embeddings_file = os.path.join(datadir, _paragraph_embeddings_file_name)

_token_embeddings_file_name = '{}_glove_token_embeddings.hdf5'.format(dataset_type)
token_embeddings_file= os.path.join(datadir, _token_embeddings_file_name )

_token_embeddings_guideline_file_name = '{}_glove_token_embeddings_guideline.pkl'.format(dataset_type)
token_embeddings_guideline_file = os.path.join(datadir, _token_embeddings_guideline_file_name)

_questions_file_name = '{}_glove_questions.txt'.format(dataset_type)
questions_file = os.path.join(datadir, _questions_file_name)

_question_embeddings_file_name = '{}_glove_question_embeddings.hdf5'.format(dataset_type)
question_embeddings_file = os.path.join(datadir, _question_embeddings_file_name)

_word_embeddings_file_name = '{}_glove_word_embeddings.hdf5'.format(dataset_type)
word_embeddings_file = os.path.join(datadir, _word_embeddings_file_name)

_neighbors_file_name = '{}_glove_neighbors.csv'.format(dataset_type)
neighbors_file = os.path.join(datadir, _neighbors_file_name)

_voc_file_name = '{}_voc.txt'.format(dataset_type)
voc_file_name = os.path.join(datadir, _voc_file_name)

_squad_file_name = '{}-{}.json'.format(dataset_type, dataset_version)
squad_file = os.path.join(datadir, _squad_file_name)

_squad_test_file_name = '{}-{}.json'.format('train', dataset_version)
squad_test_file = os.path.join(datadir, _squad_test_file_name)

_glove_file_name = 'GloVe.840B.300d.txt'
glove_file = os.path.join(pre_trained_dir,'GloVe.840B.300d', _glove_file_name)

def read_squad_data(squad_file_path):

    #Read Dataset From Json File
    with open(squad_file_path, 'r') as _squad:
        squad = json.load(_squad)
    # Parse, titles and contents from the data
    paragraphs = []
    questions = []
    question_to_paragraph = []
    _i_para, _i_qas = 0, 0
    for _i_titles, _titles in enumerate(squad['data']):
        for _paragraph in _titles['paragraphs']:
            paragraphs.append(_paragraph['context'])
            for _qas in _paragraph['qas']:
                questions.append(_qas['question'])
                question_to_paragraph.append(_i_para)
                _i_qas += 1
            _i_para+=1

    return paragraphs, questions, question_to_paragraph

def dump_tokenized_contexts(tokenized_contexts:list, file_path:str):
    with open(file_path, 'w') as fout:
        for context in tokenized_contexts:
            fout.write(' '.join(context) + '\n')

def tokenize_contexts(contexts:list):
    tokenized_context = [word_tokenize(context.strip()) for context in contexts]
    return tokenized_context

def calculate_similarity_and_dump(paragraphs_embeddings, questions_embeddings, slice_type, q_to_p, outfile):
    neighbor_list = []
    for _id, _q_embedding in enumerate(tqdm(questions_embeddings, total=len(questions_embeddings))):
        _q_embedding = np.array([_q_embedding])
        sk_sim = cosine_similarity(_q_embedding, paragraphs_embeddings)[0]
        neighbors = np.argsort(-sk_sim)
        for _, neighbor_id in enumerate(neighbors):
            neighbor_list.append((slice_type,
                                  _id,
                                  neighbor_id,
                                  _ + 1,
                                  sk_sim[neighbor_id],
                                  q_to_p[_id],
                                  np.where(neighbors == q_to_p[_id])[0][0] + 1,
                                  sk_sim[q_to_p[_id]]
                                  ))
    df_neighbors = pd.DataFrame(data=neighbor_list, columns=['slice_type',
                                                             'question',
                                                             'neighbor_paragraph',
                                                             'neighbor_order',
                                                             'neighbor_cos_similarity',
                                                             'actual_paragraph',
                                                             'actual_paragraph_order',
                                                             'actual_paragrraph_cos_similarity'
                                                             ])
    df_neighbors.to_csv(outfile, index=False)
    return df_neighbors
def read_file(file_name):
    with open(file_name) as f:
        content = f.readlines()
    # you may also want to remove whitespace characters like `\n` at the end of each line
    content = [x.strip() for x in content]
    return content

def dump_embeddings(embeddings, outfile_to_dump):
    with h5py.File(outfile_to_dump, 'w') as fout:
        ds = fout.create_dataset(
            'embeddings',
            embeddings.shape, dtype='float32',
            data=embeddings
        )

  from ._conv import register_converters as _register_converters


In [2]:
slices = [{'slice_type':'All', 'slice_index':None, 'axis':(1,2)},
          {'slice_type':'1st', 'slice_index':0, 'axis':(1)},
          {'slice_type':'2nd', 'slice_index':1, 'axis':(1)},
          {'slice_type':'3rd', 'slice_index':2, 'axis':(1)}]

s = slices[0] # option 1

In [2]:
dim=300
tokens = []
if voc_file_name is not None:
    tokens = read_file(voc_file_name)
glove_word_weights = {}
with open(glove_file, "rb") as infile:
    for line in infile:
        parts = line.split()
        token = parts[0].decode(encoding)
        if len(tokens) >0:
            if token in tokens:
                nums = np.array(parts[1:], dtype=np.float32)
                glove_word_weights[token] = nums
        else:
            nums = np.array(parts[1:], dtype=np.float32)
            glove_word_weights[token] = nums

In [3]:
print('Squad Data: Reading Dev Started')
start = datetime.datetime.now()
paragraphs, questions, q_to_p = read_squad_data(squad_file)
paragraphs_test, questions_test, q_to_p_test = read_squad_data(squad_test_file)
end = datetime.datetime.now()
print('# of Paragraphs : {}'.format(len(paragraphs)))
print('# of Questions : {}'.format(len(questions)))
print('# of Q_to_P : {}'.format(len(q_to_p)))
print('Squad Data: Reading Dev Ended in {} minutes'.format((end-start).seconds/60))

Squad Data: Reading Dev Started
# of Paragraphs : 2067
# of Questions : 10570
# of Q_to_P : 10570
Squad Data: Reading Dev Ended in 0.0 minutes


In [4]:
print(20* '-')
print('Paragraphs: Tokenization and Saving Tokenization Started')
start = datetime.datetime.now()
tokenized_paragraphs = tokenize_contexts(paragraphs)
tokenized_test_paragraphs = tokenize_contexts(paragraphs_test)
dump_tokenized_contexts(tokenized_paragraphs, paragraphs_file)
end = datetime.datetime.now()
print('# of Tokenized Paragraphs: {}'.format(len(tokenized_paragraphs)))
print('Paragraphs: Tokenization and Saving Tokenization  is Completed in {} minutes'.format((end-start).seconds/60))

--------------------
Paragraphs: Tokenization and Saving Tokenization Started
# of Tokenized Paragraphs: 2067
Paragraphs: Tokenization and Saving Tokenization  is Completed in 0.5833333333333334 minutes


In [5]:
print(20* '-')
print('Questions: Tokenization and Saving Tokenization Started')
start = datetime.datetime.now()
tokenized_questions = tokenize_contexts(questions)
tokenized_test_questions = tokenize_contexts(questions_test)
dump_tokenized_contexts(tokenized_questions,questions_file)
end = datetime.datetime.now()
print('# of Tokenized Questions: {}'.format(len(tokenized_questions)))
print('Questions: Tokenization and Saving Tokenization  is Completed in {} minutes'.format((end-start).seconds/60))

--------------------
Questions: Tokenization and Saving Tokenization Started
# of Tokenized Questions: 10570
Questions: Tokenization and Saving Tokenization  is Completed in 0.25 minutes


In [6]:
questions_nontokenized = [" ".join(context) for context in tokenized_questions]
paragraphs_nontokenized = [" ".join(context) for context in tokenized_paragraphs]
questions_test_nontokenized = [" ".join(context) for context in tokenized_test_questions]
paragraphs_test_nontokenized = [" ".join(context) for context in tokenized_test_paragraphs]

In [7]:
token_tfidf_weights = None
tfidf = TfidfVectorizer(analyzer=lambda x: x, smooth_idf=True, sublinear_tf=True, tokenizer=tokenize)
tfidf.fit(questions_nontokenized+paragraphs_nontokenized+questions_test_nontokenized+paragraphs_test_nontokenized)
max_idf = max(tfidf.idf_)
token2idfweight = defaultdict(
    lambda: max_idf,
    [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

In [8]:
tokenized_questions[0]

['Which',
 'NFL',
 'team',
 'represented',
 'the',
 'AFC',
 'at',
 'Super',
 'Bowl',
 '50',
 '?']

In [None]:
mean_glove_embeddings= np.array([
                                np.mean([glove_word_weights[w] for w in words if w in glove_word_weights]
                                or [np.zeros(dim)], axis=0)
                                for words in tokenized_questions+tokenized_paragraphs
                                ])
####################################################################################################################
##############       MEAN GLOVE EMBEDDINGS
####################################################################################################################
dump_embeddings(mean_glove_embeddings, os.path.join(datadir,'dev_mean_glove_embeddings.hdf5'))
question_embeddings = mean_glove_embeddings[0:len(tokenized_questions),:]
paragraphs_embeddings = mean_glove_embeddings[len(tokenized_questions):,:]
print('Nearest Neighbors: Starting')
start_ = datetime.datetime.now()
neighbors = calculate_similarity_and_dump(paragraphs_embeddings, question_embeddings, s['slice_type'], q_to_p,
                     os.path.join(datadir, 'dev_mean_glove_neighbors.csv'))
end_ = datetime.datetime.now()
print('Nearest Neighbors: Completed in {} minutes.'.format((end_-start_).seconds/60))


In [18]:
mean_glove_with_idf_embeddings =  np.array([
    np.mean([glove_word_weights[w] * token2idfweight[w]
             for w in words if w in glove_word_weights] or
            [np.zeros(dim)], axis=0)
    for words in tokenized_questions+tokenized_paragraphs
])
####################################################################################################################
##############       MEAN GLOVE WITH IDF EMBEDDINGS
####################################################################################################################
dump_embeddings(mean_glove_with_idf_embeddings, os.path.join(datadir,'dev_mean_glove_with_idf_embeddings.hdf5'))
question_embeddings = mean_glove_with_idf_embeddings[0:len(tokenized_questions),:]
paragraphs_embeddings = mean_glove_with_idf_embeddings[len(tokenized_questions):,:]
print('Nearest Neighbors: Starting')
start_ = datetime.datetime.now()
neighbors = calculate_similarity_and_dump(paragraphs_embeddings, question_embeddings, s['slice_type'], q_to_p,
                     os.path.join(datadir, 'dev_mean_glove_with_idf_neighbors.csv'))
end_ = datetime.datetime.now()
print('Nearest Neighbors: Completed in {} minutes.'.format((end_-start_).seconds/60))
end = datetime.datetime.now()

print('GLOVE + IDF Embeddings is completed in {} minutes'.format((end-start).seconds/60))

  0%|          | 6/10570 [00:00<03:29, 50.32it/s]

Nearest Neighbors: Starting


100%|██████████| 10570/10570 [04:26<00:00, 39.62it/s]


Nearest Neighbors: Completed in 8.0 minutes.
GLOVE + IDF Embeddings is completed in 42.4 minutes
