In [1]:
import re
import os
import sys
import random
import string
import logging
import argparse
import json
import torch
import msgpack
import spacy
import pandas as pd
import numpy as np
from shutil import copyfile
from datetime import datetime
from collections import Counter, defaultdict
import pickle
from src.model import DocReaderModel
from src.batcher import load_meta_with_vocab, BatchGen
from my_utils.data_utils import feature_func_eval
from my_utils.tokenizer import Vocabulary, reform_text
from config import set_args
from my_utils.utils import set_environment
from my_utils.log_wrapper import create_logger
from my_utils.squad_eval import evaluate
from my_utils.data_utils import predict_squad, gen_name, load_squad_v2_label, compute_acc
from my_utils.squad_eval_v2 import my_evaluation as evaluate_v2
import sys
# Report whether PyTorch can see a CUDA device in this environment.
print(torch.cuda.is_available())
import math

False


## Setup model and vocab

In [2]:
# Reset argv to a bare script name before set_args() runs
# (presumably set_args() parses sys.argv, which Jupyter fills with kernel flags -- TODO confirm).
sys.argv = ['train.py']
version = 'v2'
model_filepath = 'resource/em_67_f1_69.pt'

args = set_args()
opt = vars(args)
print(args.cuda)

# The meta file supplies the values bound to embedding, opt and vocab below.
meta_path = gen_name(args.data_dir, args.meta, version, suffix='pick')
embedding, opt, vocab = load_meta_with_vocab(opt, meta_path)
print(opt)

# Restore the checkpoint, then hand it the embedding for evaluation.
model = DocReaderModel.load(model_filepath, embedding, gpu=args.cuda)
model.setup_eval_embed(embedding)

# Put the model on the device selected by the configuration.
if not args.cuda:
    model.cpu()
else:
    model.cuda()


def _load_resource_pickle(filename):
    """Unpickle `filename` from the local 'resource' directory."""
    with open(os.path.join('resource', filename), 'rb') as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute arbitrary code; only load trusted resource files.
vocab_tag = _load_resource_pickle('vocab_tag.pick')
vocab_ner = _load_resource_pickle('vocab_ner.pick')

NLP = spacy.load('en')
print('Done loading model and resources')

False
{'v2_on': False, 'log_file': 'san.log', 'data_dir': 'data/', 'meta': 'meta', 'train_data': 'train_data', 'dev_data': 'dev_data', 'dev_gold': 'dev', 'test_data': 'test_data', 'test_gold': 'test', 'covec_path': 'data/MT-LSTM.pt', 'glove': 'data/glove.840B.300d.txt', 'sort_all': False, 'threads': 8, 'vocab_size': 90953, 'covec_on': True, 'embedding_dim': 300, 'fasttext_on': False, 'pos_on': True, 'pos_vocab_size': 54, 'pos_dim': 12, 'ner_on': True, 'ner_vocab_size': 41, 'ner_dim': 8, 'feat_on': True, 'num_features': 4, 'prealign_on': True, 'prealign_head': 1, 'prealign_att_dropout': 0, 'prealign_norm_on': False, 'prealign_proj_on': False, 'prealign_bidi': False, 'prealign_hidden_size': 300, 'prealign_share': True, 'prealign_residual_on': False, 'prealign_scale_on': True, 'prealign_sim_func': 'dotproductproject', 'prealign_activation': 'relu', 'pwnn_on': True, 'pwnn_hidden_size': 300, 'contextual_hidden_size': 300, 'contextual_cell_type': 'lstm', 'contextual_weight_norm_on': False, '

In [None]:
def position_encoding(m, threshold=5):
    """Build an (m, m) float32 tensor of distance-based weights.

    Entry [i][j] is 1.0 everywhere except where j - i > threshold; there it is
    1 / ln(j - i + 1). Entries below the diagonal are never modified and stay 1.0.

    Args:
        m: side length of the square matrix.
        threshold: largest gap j - i that keeps the full weight of 1.0.

    Returns:
        A torch tensor created from the float32 NumPy matrix.
    """
    weights = np.ones((m, m), dtype=np.float32)
    for gap in range(m):
        if gap > threshold:
            # Every cell on this super-diagonal has the same gap j - i,
            # so the weight is computed once and written to the whole diagonal.
            head = np.arange(m - gap)
            weights[head, head + gap] = float(1.0 / math.log(gap + 1))
    return torch.from_numpy(weights)

In [None]:
# Sanity-check the encoding on a 20x20 example (default threshold).
position_encoding(20)


In [None]:
# Batch generator over the dev split (gpu=False, is_train=False).
dev_path = gen_name(args.data_dir, args.dev_data, version)
dev_data = BatchGen(dev_path, batch_size=args.batch_size, gpu=False, is_train=False)

In [None]:
def predict_squad(model, data, v2_on=False, max_batches=1, verbose=True):
    """Collect span (and optionally label) predictions from `model` over `data`.

    This notebook-local definition replaces the `predict_squad` imported from
    `my_utils.data_utils` for the rest of the session. It began as a debugging
    variant that printed the first batch and stopped; those two behaviours are
    now parameters whose defaults keep the original behaviour.

    Args:
        model: object exposing `predict(batch)` that returns a 3-tuple
            `(phrase, spans, scores)`; `phrase` and `scores` are iterated in
            step with the batch's uids.
        data: iterable of batches with a `reset()` method; each batch is a
            mapping that contains a `'uids'` sequence.
        v2_on: when true, also record `scores` per uid.
        max_batches: number of batches to process. `None` means all batches;
            the default of 1 reproduces the original single-batch debug run.
        verbose: when true, print every processed batch (original behaviour).

    Returns:
        `(span_predictions, label_predictions)`, two dicts keyed by uid.
        `label_predictions` is empty unless `v2_on` is true.
    """
    data.reset()
    span_predictions = {}
    label_predictions = {}
    if max_batches is not None and max_batches <= 0:
        return span_predictions, label_predictions

    for batch_no, batch in enumerate(data, start=1):
        phrase, _spans, scores = model.predict(batch)
        if verbose:
            print(batch)
        uids = batch['uids']
        span_predictions.update(zip(uids, phrase))
        if v2_on:
            label_predictions.update(zip(uids, scores))
        # Check the limit after processing so that no extra batch is pulled
        # from `data` once the limit has been reached.
        if max_batches is not None and batch_no >= max_batches:
            break
    return span_predictions, label_predictions

In [None]:
# Run the predict_squad defined in this notebook on the dev batches, collecting label scores as well.
predict_squad(model,dev_data,v2_on=True)

In [3]:
# Hand-written examples for an ad-hoc prediction run; each entry carries a passage
# ('context'), a 'question' and a 'uid'.
data = [{'context':"Apple Inc. is an American multinational technology company headquartered in Cupertino, California, that designs, develops, and sells consumer electronics, computer software, and online services. The company's hardware products include the iPhone smartphone, the iPad tablet computer, the Mac personal computer, the iPod portable media player, the Apple Watch smartwatch, the Apple TV digital media player, and the HomePod smart speaker. Apple's software includes the macOS and iOS operating systems, the iTunes media player, the Safari web browser, and the iLife and iWork creativity and productivity suites, as well as professional applications like Final Cut Pro, Logic Pro, and Xcode. Its online services include the iTunes Store, the iOS App Store and Mac App Store, Apple Music, and iCloud. Apple was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976 to develop and sell Wozniak's Apple I personal computer. It was incorporated as Apple Computer, Inc., in January 1977, and said to be a tech unicorn.",
         'question':'Wha?',
         'uid':'n2iu2b342'},
       {'context':'The second context is called Secondarial.  The third is not.',
       'question':'What is the second context called?',
       'uid':'zus3bui2'}]


# Flags forwarded to feature_func_eval.
v2_on = True
is_train = False

# Clean the raw text with reform_text, then tokenize with spaCy (passages first, then questions).
passages = [reform_text(entry['context']) for entry in data]
passage_tokened = list(NLP.pipe(passages, batch_size=1000, n_threads=3))

question_list = [reform_text(entry['question']) for entry in data]
question_tokened = list(NLP.pipe(question_list, batch_size=1000, n_threads=3))

# One feature dict per example, in the same order as `data`.
generated_data = [
    feature_func_eval(entry, question_doc, passage_doc,
                      vocab, vocab_tag, vocab_ner, is_train, v2_on)
    for entry, question_doc, passage_doc in zip(data, question_tokened, passage_tokened)
]

# Feed the in-memory features to BatchGen via data_json instead of a file path.
pred_data = BatchGen(None,
                     batch_size=args.batch_size,
                     gpu=False,
                     is_train=False,
                     data_json=generated_data)


top_k = 1

# Tokens that delimit the printed snippet. The bracket differs by direction:
# the leftward scan stops after ']' and the rightward scan stops at '['.
LEFT_BOUNDARY_TOKENS = {'.', '!', '?', '\n', ']'}
RIGHT_BOUNDARY_TOKENS = {'.', '!', '?', '\n', '['}


def _snippet_bounds(doc, s_idx, e_idx):
    """Return (begin, end) token indices of the window surrounding an answer span.

    Args:
        doc: indexable token sequence whose items expose `.text`.
        s_idx: index of the first answer token.
        e_idx: index of the last answer token.

    Returns:
        `begin` is one past the nearest left-boundary token before `s_idx`
        (0 if none is found); `end` is one past the first right-boundary token
        at or after `e_idx` (`len(doc)` if none is found).
    """
    begin = 0
    # Walk left down to and including token 0 (the original scan stopped at 1).
    for z in range(s_idx - 1, -1, -1):
        if doc[z].text in LEFT_BOUNDARY_TOKENS:
            begin = z + 1
            break

    end = len(doc)
    for z in range(e_idx, len(doc)):
        if doc[z].text in RIGHT_BOUNDARY_TOKENS:
            end = z + 1
            break
    return begin, end


# Index into `passage_tokened` of the first example of the current batch.
# Assumes pred_data yields examples in the order of generated_data
# -- TODO confirm BatchGen does not reorder when is_train=False.
batch_start = 0
for batch in pred_data:
    start, end, lab = model.predict_eval(batch)

    max_len = model.opt['max_len'] or start.size(1)
    doc_len = start.size(1)
    pos_enc = model.position_encoding(doc_len, max_len)

    for row, label in enumerate(lab):
        # `row` indexes within the batch; `i` indexes the full example list, so
        # the right passage is used when the data spans more than one batch.
        i = batch_start + row
        passage = passage_tokened[i]

        # Outer product of start and end scores, weighted by position; the
        # in-place triu_ zeroes entries below the diagonal (end before start).
        scores = torch.ger(start[row], end[row])
        scores = scores * pos_enc
        scores.triu_()
        scores = scores.numpy()

        label_score = float(label)

        for k in range(1, top_k + 1):
            print()
            # Flat index of the k-th largest score, mapped back to (start, end).
            best_idx = np.argpartition(scores, -k, axis=None)[-k]
            s_idx, e_idx = np.unravel_index(best_idx, scores.shape)
            best_score = scores[s_idx, e_idx]

            if label_score > .5:
                print('No Answer :',best_score,label_score)
            else:
                beginning_index, end_index = _snippet_bounds(passage, s_idx, e_idx)
                snippet = passage[beginning_index:end_index]
                print(passage[s_idx:e_idx+1].text, best_score, label_score)
                print(snippet.text)

    batch_start += start.size(0)




Apple Inc. 0.0051208297 0.38858845829963684
Apple Inc. is an American multinational technology company headquartered in Cupertino, California, that designs, develops, and sells consumer electronics, computer software, and online services.

Secondarial 0.9864363 0.41445234417915344
The second context is called Secondarial.


In [None]:
# Inspect one tokenized passage; `i` is whatever value the prediction loop left behind.
passage_tokened[i]

In [None]:
# Parse a sample sentence so its entity annotations can be inspected.
doc = NLP('My name is Donald Trump and I work at Google')

In [None]:
# Show each token's entity type and IOB tag, one token per line.
for token in doc:
    print(token.ent_type_, token.ent_iob_)

In [None]:
def match_func(question, context):
    """Compute per-token frequency and question-overlap features for `context`.

    Args:
        question: iterable of tokens exposing `.text` and `.lemma_`.
        context: sequence of tokens exposing `.text` and `.lemma_`.

    Returns:
        One `[freq, match_origin, match_lower, match_lemma]` list of floats per
        context token, in order:
          * freq: relative frequency of the token's lower-cased text in `context`
          * match_origin: 1.0 if the exact text occurs in the question
          * match_lower: 1.0 if the lower-cased text occurs in the question
          * match_lemma: 1.0 if the lemma occurs among the question lemmas
            ('-PRON-' lemmas fall back to the lower-cased text)
        An empty context yields an empty list.

    NOTE(review): the exact-match feature used to test the token object itself
    against a set of strings, which never matches. If the model's own feature
    extraction has the same comparison, confirm before swapping this in for
    inference.
    """
    def _lemma_key(tok):
        # spaCy marks pronoun lemmas as '-PRON-'; use the surface form instead.
        return tok.lemma_ if tok.lemma_ != '-PRON-' else tok.text.lower()

    counter = Counter(w.text.lower() for w in context)
    total = sum(counter.values())
    freq = [counter[w.text.lower()] / total for w in context]

    question_word = {w.text for w in question}
    question_lower = {w.text.lower() for w in question}
    question_lemma = {_lemma_key(w) for w in question}

    # Compare the token's text, not the token object, with the question strings.
    match_origin = [1 if w.text in question_word else 0 for w in context]
    match_lower = [1 if w.text.lower() in question_lower else 0 for w in context]
    match_lemma = [1 if _lemma_key(w) in question_lemma else 0 for w in context]

    features = np.asarray([freq, match_origin, match_lower, match_lemma], dtype=np.float32).T.tolist()
    return features

In [None]:
# Try match_func on a toy question/context pair.
match_func(NLP('What is the president?'),NLP('What the president is Donald Trump and he was a man'))