In [None]:
from os.path import join
import re
import json
import pickle

from tqdm import tqdm
from difflib import SequenceMatcher
from collections import Counter

from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor

# Directory holding the pickled objects; joined with TITLES when the titles are loaded.
DIR = './objects'
# File name (inside DIR) of the pickled titles mapping.
TITLES = 'titles_gensim_70.pkl'
# Path of the training-set JSON file.
TRAIN_SET = './train.json'

In [None]:
# Load the pretrained ELMo constituency-parser model archive from its URL.
archive = load_archive(
    "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz")

# Wrap the archive in a constituency-parser predictor; every later cell that
# parses a sentence goes through this `predictor` object.
predictor = Predictor.from_archive(archive, 'constituency-parser')

# Trying the parser on a sample sentence

In [None]:
# Smoke test: parse one example sentence and inspect what the predictor returns.
sentence = "Nikolaj Coster-Waldau worked with the Fox Broadcasting Company."
result = predictor.predict_json({"sentence": sentence})

# Which fields does the prediction expose?
print(result.keys())

# Keep the tag and token sequences under their own names, then show them.
pos_tags, tokens = result['pos_tags'], result['tokens']
print(pos_tags)
print(tokens)

In [None]:
# Inspect the root of the hierplane tree and then each of its direct children.
# Root keys: ['word', 'nodeType', 'attributes', 'link', 'children']
tree_root = result['hierplane_tree']['root']
print(tree_root)
for child in tree_root['children']:
    print(child)

# Calculate document retrieval accuracy

### Load titles

In [None]:
# Load the pickled titles mapping and keep only its values as a list.
# The file is opened in a `with` block so the handle is closed deterministically.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(join(DIR, TITLES), "rb") as titles_f:
    titles = list(pickle.load(titles_f).values())

def title_without_parentheses(title):
    """Return `title` with its '-LRB-...-RRB-' part removed.

    The match is greedy: everything from the first '-LRB-' to the last
    '-RRB-' is dropped. Leading and trailing underscores left behind are
    stripped afterwards.
    """
    without_brackets = re.sub('-LRB-.*-RRB-', '', title)
    return without_brackets.strip('_')

# For every title that changes when its bracketed part is removed, also
# register the shortened variant as a title.
stripped_titles = map(title_without_parentheses, titles)
without_parentheses = [
    stripped
    for original, stripped in zip(titles, stripped_titles)
    if original != stripped
]

titles.extend(without_parentheses)

### Load train_set

In [None]:
# Read the training set. JSON text is UTF-8 by specification, so the encoding
# is stated explicitly instead of relying on the platform default.
with open(TRAIN_SET, 'r', encoding='utf-8') as train_set_f:
    train_set = json.load(train_set_f)

In [None]:
def similar(a, b):
    """Return the difflib similarity ratio (a float in [0, 1]) of `a` and `b`."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

In [None]:
# POS tags that get_NPs does not treat as a phrase boundary: when one of these
# tags is seen, the tag itself is appended to the noun phrase being built.
ignore_list = ['-LRB-', '-RRB-']

In [None]:
def get_NPs(parse_result):
    """Extract candidate noun phrases from a constituency-parser prediction.

    Two passes contribute candidates, which are merged and de-duplicated:

    1. Hierplane tree: runs of consecutive top-level children whose
       ``nodeType`` is 'NP' or 'HYPH' are concatenated word by word.
    2. POS tags: runs of consecutive tokens tagged 'NP', 'NNP' or 'HYPH'
       are concatenated; a tag listed in ``ignore_list`` is appended to the
       current run as the tag itself instead of ending the run.

    Words are joined with '_' and every '_-_' is collapsed to '-'.

    Args:
        parse_result: dict with the keys 'hierplane_tree' (containing 'root',
            whose optional 'children' entries carry 'nodeType' and 'word'),
            'pos_tags' and 'tokens'.

    Returns:
        A list of unique noun-phrase strings, in no particular order.
    """
    # TODO:
    # 1. everything before verb as a NP
    # 2. deal with different encoding

    NPs, NP = set(), []

    # by hierplane_tree
    # A root without a 'children' entry simply contributes no candidates here.
    root = parse_result['hierplane_tree']['root']
    for child in root.get('children', []):
        if child['nodeType'] in ['NP', 'HYPH']:
            NP += child['word'].split()
        elif NP:
            NPs.add("_".join(NP))
            NP = []

    # Flush a phrase that runs up to the last child. NPs is a set, so the
    # joined string must be added with .add(); .append() does not exist on sets.
    if NP:
        NPs.add("_".join(NP))
        NP = []

    # by customised rules
    pos_tags = parse_result['pos_tags']
    tokens = parse_result['tokens']

    for id_, tag in enumerate(pos_tags):
        if tag in ['NP', 'NNP', 'HYPH']:
            NP.append(tokens[id_])
        elif tag in ignore_list:
            NP.append(tag)
        elif NP:
            NPs.add("_".join(NP))
            NP = []

    # Flush a phrase that runs up to the last token, mirroring the tree pass.
    if NP:
        NPs.add("_".join(NP))

    # Normalise hyphenated names, then de-duplicate again because two raw
    # candidates can collapse to the same string.
    return list({re.sub('_-_', '-', phrase) for phrase in NPs})

In [None]:
# Running totals over the whole training set:
#   true_evidence  - number of gold evidence entries
#   total_evidence - number of noun phrases extracted from the claims
#   found_evidence - gold evidence entries that contain at least one extracted NP
found_evidence = total_evidence = true_evidence = 0

for _id, record in tqdm(train_set.items()):
    # The first element of each evidence entry is the string matched against the NPs.
    evidence = [entry[0] for entry in record['evidence']]
    true_evidence += len(evidence)

    parse_result = predictor.predict_json({"sentence": record['claim']})
    NPs = get_NPs(parse_result)
    total_evidence += len(NPs)

    # An evidence entry counts as found when any extracted NP is a substring of it.
    missing = [evi for evi in evidence if not any(NP in evi for NP in NPs)]
    found_evidence += len(evidence) - len(missing)

    # Report claims for which some evidence could not be matched.
    if missing:
        print('In claim:', record['claim'])
        print('    NPs:', NPs)
        print('    missing:', missing)
        print('    POS:', parse_result['pos_tags'])

In [None]:
print(true_evidence, found_evidence, total_evidence)
# found / gold evidence is the share of gold entries that were recovered, i.e.
# recall; found / extracted NPs is a precision-like ratio over the candidates.
# Both fall back to 0.0 when the denominator is zero (e.g. an empty train set).
print("recall:", found_evidence / true_evidence if true_evidence else 0.0)
print("precision:", found_evidence / total_evidence if total_evidence else 0.0)