In [2]:
import re
import os
import sys
import random
import string
import logging
import argparse
import json
import torch
import msgpack
import spacy
import pandas as pd
import numpy as np
from shutil import copyfile
from datetime import datetime
from collections import Counter, defaultdict
import pickle
from src.model import DocReaderModel
from src.batcher import load_meta_with_vocab, BatchGen
from my_utils.data_utils import feature_func_eval
from my_utils.tokenizer import Vocabulary, reform_text
from config import set_args
from my_utils.utils import set_environment
from my_utils.log_wrapper import create_logger
from my_utils.squad_eval import evaluate
from my_utils.data_utils import predict_squad, gen_name, load_squad_v2_label, compute_acc
from my_utils.squad_eval_v2 import *
import sys
print(torch.cuda.is_available())
from my_utils.data_utils import predict_squad, gen_name, gen_gold_name, load_squad_v2_label, compute_acc

import math

True


## Set up the model and vocabulary

In [3]:
# Overwrite the notebook kernel's argv with a bare script name before calling
# set_args(); presumably set_args() parses sys.argv via argparse -- confirm in config.py.
sys.argv = ['train.py']
version = 'v2'

# Directory listing of available checkpoints, kept for reference:
# -rwxrwxrwx 1 root root 1.2G Feb 20 20:22 cp_epoch_7_em_71_f1_72.pt
# -rwxrwxrwx 1 root root 9.0K Feb 20 20:22 dev_output_7.json
# -rwxrwxrwx 1 root root 1.2G Feb 20 19:52 cp_epoch_6_em_78_f1_78.pt
# -rwxrwxrwx 1 root root 7.6K Feb 20 19:51 dev_output_6.json
# -rwxrwxrwx 1 root root 1.2G Feb 20 19:20 cp_epoch_5_em_70_f1_71.pt
# -rwxrwxrwx 1 root root  11K Feb 20 19:20 dev_output_5.json
# -rwxrwxrwx 1 root root 1.2G Feb 20 18:47 cp_epoch_4_em_74_f1_75.pt
# -rwxrwxrwx 1 root root 8.0K Feb 20 18:47 dev_output_4.json
# -rwxrwxrwx 1 root root 1.2G Feb 20 18:15 cp_epoch_3_em_76_f1_76.pt
# -rwxrwxrwx 1 root root 8.5K Feb 20 18:14 dev_output_3.json
# -rwxrwxrwx 1 root root 1.2G Feb 20 17:40 cp_epoch_2_em_80_f1_80.pt
# -rwxrwxrwx 1 root root 7.0K Feb 20 17:40 dev_output_2.json
# -rwxrwxrwx 1 root root 1.2G Feb 20 17:05 cp_epoch_1_em_71_f1_72.pt
# -rwxrwxrwx 1 root root  10K Feb 20 17:05 dev_output_1.json
# -rwxrwxrwx 1 root root 1.2G Feb 20 16:29 best_epoch_0_em_81_f1_81.pt
# -rwxrwxrwx 1 root root 1.2G Feb 20 16:29 cp_epoch_0_em_81_f1_81.pt
# -rwxrwxrwx 1 root root 6.9K Feb 20 16:29 dev_output_0.json


# NOTE(review): this file name does not appear in the listing above (epoch 6 is
# listed as em_78_f1_78, and em_76_f1_76 belongs to epoch 3) -- confirm which
# checkpoint is actually intended.
model_filepath = 'checkpoint/cp_epoch_6_em_76_f1_76.pt'
args = set_args()

# Plain-dict view of the parsed arguments; it is passed to, and replaced by the
# return value of, load_meta_with_vocab below.
opt = vars(args)
print(args.cuda)

# Load the embedding matrix, the (updated) options and the word vocabulary from
# the pickled meta file whose name is built by gen_name(...).
embedding, opt, vocab = load_meta_with_vocab(opt, gen_name(args.data_dir, args.meta, version, suffix='pick'))

# print(opt)
model = DocReaderModel.load(model_filepath,embedding,gpu=args.cuda)


model.setup_eval_embed(embedding)

# Place the model on the device selected by the --cuda argument.
if args.cuda:
    print('loading as cuda enabled')
    model.cuda()
else:
    model.cpu()

# POS-tag and NER vocabularies are read from local pickles.
# NOTE: pickle.load can execute arbitrary code -- only load trusted resource files.
with open(os.path.join('resource', 'vocab_tag.pick'),'rb') as f:
    vocab_tag = pickle.load(f)
with open(os.path.join('resource','vocab_ner.pick'),'rb') as f:
    vocab_ner = pickle.load(f)
    
# spaCy pipeline used later to tokenize passages and questions.
NLP = spacy.load('en')
print('Done loading model and resources')

True
loading as cuda enabled
Done loading model and resources


In [4]:
def position_encoding(m, threshold=5):
    """Build an ``m x m`` float32 weight matrix that damps long spans.

    Every entry starts at 1.0. For cells on or above the diagonal whose
    distance ``end - start`` exceeds ``threshold``, the weight becomes
    ``1 / ln(distance + 1)``. Cells below the diagonal are left at 1.0.

    Args:
        m: side length of the square matrix.
        threshold: largest distance that is still left un-damped.

    Returns:
        A ``torch`` tensor of shape ``(m, m)`` sharing memory with the
        underlying numpy array.
    """
    weights = np.ones((m, m), dtype=np.float32)
    for start in range(m):
        for end in range(start, m):
            span = end - start
            if span <= threshold:
                # Short spans keep the neutral weight of 1.0.
                continue
            weights[start, end] = float(1.0 / math.log(span + 1))
    return torch.from_numpy(weights)

In [5]:
def predict_squad(model, data, v2_on=False):
    """Run ``model.predict`` over every batch and collect results per uid.

    Note: this definition shadows the ``predict_squad`` imported from
    ``my_utils.data_utils`` at the top of the notebook.

    Args:
        model: object exposing ``predict(batch)`` that returns a triple
            ``(phrases, spans, scores)``.
        data: resettable iterable of batches; each batch provides ``batch['uids']``.
        v2_on: when true, the per-example scores are collected as well.

    Returns:
        ``(span_predictions, label_predictions)`` -- two dicts keyed by uid.
        ``label_predictions`` stays empty unless ``v2_on`` is set.
    """
    data.reset()
    answers_by_uid = {}
    scores_by_uid = {}
    for batch in data:
        phrases, _spans, label_scores = model.predict(batch)
        batch_uids = batch['uids']
        answers_by_uid.update(zip(batch_uids, phrases))
        if v2_on:
            scores_by_uid.update(zip(batch_uids, label_scores))
    return answers_by_uid, scores_by_uid


def load_squad(data_path):
    """Read a SQuAD-format JSON file and return its ``'data'`` entry.

    The file is opened explicitly as UTF-8 so that the result does not depend
    on the platform's default text encoding.

    Args:
        data_path: path to the JSON file.

    Returns:
        The value stored under the top-level ``'data'`` key.

    Raises:
        KeyError: if the JSON object has no ``'data'`` key.
    """
    with open(data_path, encoding='utf-8') as dataset_file:
        dataset_json = json.load(dataset_file)
    return dataset_json['data']
    
def make_qid_to_has_ans(dataset):
    """Map every question id to whether it has at least one gold answer.

    Args:
        dataset: iterable of articles; each article has ``'paragraphs'``, each
            paragraph has ``'qas'``, and each qa has ``'id'`` and ``'answers'``.

    Returns:
        Dict ``qid -> bool`` (``True`` when ``qa['answers']`` is non-empty).
    """
    return {
        qa['id']: bool(qa['answers'])
        for article in dataset
        for paragraph in article['paragraphs']
        for qa in paragraph['qas']
    }



def get_raw_scores(dataset, preds):
    """Score each predicted answer against the gold answers of its question.

    Questions without an entry in ``preds`` are skipped silently, so the
    returned dicts only cover predicted qids.

    ``compute_exact`` and ``compute_f1`` are not defined in this notebook;
    presumably they come from the ``my_utils.squad_eval_v2`` star import.

    Args:
        dataset: SQuAD-style list of articles.
        preds: mapping qid -> predicted answer text.

    Returns:
        ``(exact_scores, f1_scores)`` -- two dicts keyed by qid, each holding
        the maximum score over that question's gold answers.
    """
    exact_scores, f1_scores = {}, {}
    all_qas = (
        qa
        for article in dataset
        for paragraph in article['paragraphs']
        for qa in paragraph['qas']
    )
    for qa in all_qas:
        qid = qa['id']
        # Keep only gold answers that are non-empty after normalization; an
        # unanswerable question is represented by the single answer ''.
        gold_answers = [
            ans['text'] for ans in qa['answers'] if normalize_answer(ans['text'])
        ] or ['']
        if qid not in preds:
            continue
        prediction = preds[qid]
        exact_scores[qid] = max(compute_exact(gold, prediction) for gold in gold_answers)
        f1_scores[qid] = max(compute_f1(gold, prediction) for gold in gold_answers)
    return exact_scores, f1_scores
    
    
    
def normalize_answer(s):
    """Canonicalize an answer string for comparison.

    Steps, in order: lowercase, drop ASCII punctuation, replace the standalone
    words a/an/the with a space, then collapse all whitespace runs.
    """
    punctuation = set(string.punctuation)
    article_pattern = re.compile(r'\b(a|an|the)\b', re.UNICODE)

    lowered = s.lower()
    depunctuated = ''.join(ch for ch in lowered if ch not in punctuation)
    without_articles = article_pattern.sub(' ', depunctuated)
    return ' '.join(without_articles.split())


def make_eval_dict(exact_scores, f1_scores, qid_list=None):
    """Aggregate per-question scores into percentage metrics.

    Args:
        exact_scores: mapping qid -> exact-match score.
        f1_scores: mapping qid -> F1 score.
        qid_list: optional collection of qids to restrict the aggregate to.
            ``None`` means "all questions". An explicitly empty collection is
            treated as an empty subset (total 0, scores 0.0) instead of being
            silently reinterpreted as "all questions".

    Returns:
        ``collections.OrderedDict`` with keys ``'exact'``, ``'f1'`` (both
        percentages) and ``'total'`` (the denominator used).
    """
    # Local import: the notebook only imports Counter/defaultdict by name, so
    # the bare module name would otherwise have to leak in through a star import.
    import collections

    if qid_list is None:
        total = len(exact_scores)
        exact_sum = sum(exact_scores.values())
        f1_sum = sum(f1_scores.values())
    else:
        total = len(qid_list)
        wanted = set(qid_list)  # O(1) membership instead of scanning a list
        # Only qids that actually have a score contribute to the sums.
        scored = [k for k in f1_scores if k in wanted]
        exact_sum = sum(exact_scores[k] for k in scored)
        f1_sum = sum(f1_scores[k] for k in scored)

    def _percent(score_sum):
        # An empty selection yields 0.0 rather than a ZeroDivisionError.
        return 100.0 * score_sum / total if total else 0.0

    return collections.OrderedDict([
        ('exact', _percent(exact_sum)),
        ('f1', _percent(f1_sum)),
        ('total', total),
    ])

def my_evaluation(dataset, preds, na_probs=None, na_prob_thresh=1.0):
    """Compute SQuAD-v2 style metrics for a (possibly partial) set of predictions.

    ``apply_no_ans_threshold``, ``merge_eval`` and ``find_all_best_thresh`` are
    not defined in this notebook; presumably they come from the
    ``my_utils.squad_eval_v2`` star import.

    Args:
        dataset: SQuAD-style list of articles (gold data).
        preds: mapping qid -> predicted answer text. Questions missing from
            ``preds`` are not scored.
        na_probs: optional mapping qid -> no-answer probability. When omitted,
            every predicted qid gets 0.0.
        na_prob_thresh: threshold forwarded to ``apply_no_ans_threshold``.

    Returns:
        The dict produced by ``make_eval_dict`` for all scored questions,
        extended with ``HasAns`` / ``NoAns`` entries when at least one scored
        question falls in the respective group, and with the best-threshold
        entries when ``na_probs`` was supplied.
    """
    has_na_prob_score = na_probs is not None
    if na_probs is None:
        na_probs = {k: 0.0 for k in preds}

    qid_to_has_ans = make_qid_to_has_ans(dataset)    # maps qid to True/False
    # Sets give O(1) membership for the per-group filtering below.
    has_ans_qids = {k for k, v in qid_to_has_ans.items() if v}
    no_ans_qids = {k for k, v in qid_to_has_ans.items() if not v}

    exact_raw, f1_raw = get_raw_scores(dataset, preds)
    exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans, na_prob_thresh)
    f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans, na_prob_thresh)
    out_eval = make_eval_dict(exact_thresh, f1_thresh)

    # Guard on the *scored* subset: an empty qid_list passed to make_eval_dict
    # would otherwise be reported as if it were the overall result.
    scored_has_ans = [k for k in f1_thresh if k in has_ans_qids]
    if scored_has_ans:
        has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=scored_has_ans)
        merge_eval(out_eval, has_ans_eval, 'HasAns')

    scored_no_ans = [k for k in f1_thresh if k in no_ans_qids]
    if scored_no_ans:
        no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=scored_no_ans)
        merge_eval(out_eval, no_ans_eval, 'NoAns')

    if has_na_prob_score:
        find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans)
    return out_eval


In [6]:
# Evaluate the loaded checkpoint on the dev set and print the metric dict.
dev_data = BatchGen(gen_name(args.data_dir, args.dev_data, version),
                      batch_size=8,
                      gpu=False, is_train=False)

# `results` maps uid -> predicted answer; `labels` maps uid -> model score
# (populated because v2_on=True).
results, labels = predict_squad(model,dev_data,v2_on=True)

# The same gold file is read twice: once by load_squad_v2_label and once as the
# raw SQuAD article list.
dev_labels = load_squad_v2_label('/media/frankie/Data/data/msmarco/msmarco_squad_dev.json')
dev_gold = load_squad('/media/frankie/Data/data/msmarco/msmarco_squad_dev.json')
# NOTE(review): na_probs is not passed, so my_evaluation substitutes 0.0 for
# every qid; na_prob_thresh=.5 presumably never triggers -- confirm against
# apply_no_ans_threshold, or pass na_probs=labels if thresholding is intended.
metric = my_evaluation(dev_gold, results, na_prob_thresh=.5)

# em, f1 and acc are computed but not displayed in this cell.
em, f1 = metric['exact'], metric['f1']
acc = compute_acc(labels, dev_labels)

print(metric)

Loaded 501 samples out of 501


100%|██████████| 64215/64215 [00:00<00:00, 937594.80it/s]


OrderedDict([('exact', 76.24750499001996), ('f1', 76.57645747466107), ('total', 501), ('HasAns_exact', 5.4945054945054945), ('HasAns_f1', 7.305551591265877), ('HasAns_total', 91), ('NoAns_exact', 91.95121951219512), ('NoAns_f1', 91.95121951219512), ('NoAns_total', 410)])


In [None]:
# Ad-hoc prediction on hand-written (context, question) samples.
data = [{'context':"Apple Inc. is an American multinational technology company headquartered in Cupertino, California, that designs, develops, and sells consumer electronics, computer software, and online services. The company's hardware products include the iPhone smartphone, the iPad tablet computer, the Mac personal computer, the iPod portable media player, the Apple Watch smartwatch, the Apple TV digital media player, and the HomePod smart speaker. Apple's software includes the macOS and iOS operating systems, the iTunes media player, the Safari web browser, and the iLife and iWork creativity and productivity suites, as well as professional applications like Final Cut Pro, Logic Pro, and Xcode. Its online services include the iTunes Store, the iOS App Store and Mac App Store, Apple Music, and iCloud. Apple was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976 to develop and sell Wozniak's Apple I personal computer. It was incorporated as Apple Computer, Inc., in January 1977, and said to be a tech unicorn.",
         'question':'Who is frankie liuzzi?',
         'uid':'n2iu2b342'},
       {'context':'The second context is called Secondarial.  The third is not.',
       'question':'What is the second context called?',
       'uid':'zus3bui2'}]


v2_on = True
is_train = False

# Tokenize passages and questions with spaCy, in the same order as `data`.
passages = [reform_text(sample['context']) for sample in data]
passage_tokened = [doc for doc in NLP.pipe(passages, batch_size=1000, n_threads=3)]

question_list = [reform_text(sample['question']) for sample in data]
question_tokened = [question for question in NLP.pipe(question_list, batch_size=1000, n_threads=3)]

# Build the model input features for every sample.
generated_data = []
for idx, sample in enumerate(data):
    feat_dict = feature_func_eval(sample, question_tokened[idx], passage_tokened[idx], vocab, vocab_tag, vocab_ner, is_train, v2_on)
    generated_data.append(feat_dict)

pred_data = BatchGen( None,
                  batch_size=args.batch_size,
                  gpu=False, 
                 is_train=False, 
                 data_json=generated_data)

# Tokens that delimit the snippet printed around an answer. The backward scan
# stops at ']' and the forward scan at '[' (in addition to sentence enders).
backward_stops = {'.', '!', '?', '\n', ']'}
forward_stops = {'.', '!', '?', '\n', '['}

top_k = 1
# Index in `data` / `passage_tokened` of the first sample of the current batch.
# `i` below is only the position inside the batch, so without this offset every
# batch after the first would be matched against the wrong passage.
# Assumes BatchGen yields samples in input order when is_train=False -- TODO confirm.
batch_start = 0
for batch in pred_data:
    start,end,lab = model.predict_eval(batch)

    max_len = model.opt['max_len'] or start.size(1)
    doc_len = start.size(1)
    pos_enc = model.position_encoding(doc_len, max_len)

    for i, r in enumerate(lab):
        passage_doc = passage_tokened[batch_start + i]

        # Score of every (start, end) pair, weighted by the model's position
        # encoding; triu_ zeroes the pairs with end < start.
        scores = torch.ger(start[i], end[i])
        scores = scores * pos_enc
        scores.triu_()
        scores = scores.numpy()

        label_score = float(r)

        for k in range(1, top_k + 1):
            print()
            # k-th largest entry of the flattened score matrix.
            best_idx = np.argpartition(scores, -k, axis=None)[-k]
            best_score = np.partition(scores, -k, axis=None)[-k]
            s_idx, e_idx = np.unravel_index(best_idx, scores.shape)

            # Walk left from the answer to the previous boundary token. The
            # scan includes token 0 so that a boundary token at the very start
            # is not pulled into the snippet.
            beginning_index = 0
            for z in range(int(s_idx) - 1, -1, -1):
                if passage_doc[z].text in backward_stops:
                    beginning_index = z + 1
                    break

            # Walk right from the answer end to the next boundary token (inclusive).
            end_index = len(passage_doc)
            for z in range(int(e_idx), len(passage_doc)):
                if passage_doc[z].text in forward_stops:
                    end_index = z + 1
                    break

            snippet = passage_doc[beginning_index:end_index]

            if label_score > .5:
                print('No Answer :',best_score,label_score)
            else:
                print(passage_doc[s_idx:e_idx+1].text, best_score, label_score)
                print(snippet.text)

    batch_start += len(lab)

In [None]:
# Display one tokenized passage. `i` is not set in this cell: it is whatever
# value the `for i, r in enumerate(lab)` loop of the prediction cell left behind,
# so this only works after that cell has been run.
passage_tokened[i]

In [None]:
# Run the spaCy pipeline on a sample sentence and keep the parsed document in `doc`.
doc = NLP('My name is Donald Trump and I work at Google')

In [None]:
# Print the `ent_type_` and `ent_iob_` attributes of every token in `doc`.
for w in doc:
    print(w.ent_type_, w.ent_iob_)

In [None]:
def match_func(question, context):
    """Compute per-token overlap features between a question and a context.

    Both arguments are iterables of token objects exposing ``.text`` and
    ``.lemma_`` (e.g. spaCy documents).

    For every context token the feature vector is
    ``[freq, match_origin, match_lower, match_lemma]``:

    * ``freq``: relative frequency of the lower-cased token within the context,
    * ``match_origin``: 1 if the exact token text occurs in the question,
    * ``match_lower``: 1 if the lower-cased text occurs in the question,
    * ``match_lemma``: 1 if the lemma occurs among the question lemmas, where
      the placeholder lemma ``'-PRON-'`` is replaced by the lower-cased text.

    Returns:
        A list with one 4-element list of floats per context token.
    """
    def lemma_of(token):
        # '-PRON-' carries no lexical information; fall back to the surface form.
        return token.lemma_ if token.lemma_ != '-PRON-' else token.text.lower()

    counter = Counter(w.text.lower() for w in context)
    total = sum(counter.values())
    freq = [counter[w.text.lower()] / total for w in context]

    question_word = {w.text for w in question}
    question_lower = {w.text.lower() for w in question}
    question_lemma = {lemma_of(w) for w in question}

    # Compare the token *text* with the set of question strings. Testing the
    # token object itself against a set of str never matches, which made this
    # feature constantly 0.
    # NOTE(review): if the trained model's own feature pipeline has the same
    # quirk, it was trained with this column at 0 -- confirm before relying on it.
    match_origin = [1 if w.text in question_word else 0 for w in context]
    match_lower = [1 if w.text.lower() in question_lower else 0 for w in context]
    match_lemma = [1 if lemma_of(w) in question_lemma else 0 for w in context]

    features = np.asarray([freq, match_origin, match_lower, match_lemma], dtype=np.float32).T.tolist()
    return features

In [None]:
# Try match_func on a short question/context pair; the returned feature list is
# shown as the cell output.
match_func(NLP('What is the president?'),NLP('What the president is Donald Trump and he was a man'))