In [94]:
import requests
import json
import sys
from os.path import join
import pickle
sys.path.append('..')
from utils import *
from collections import defaultdict
import tagme
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt

In [95]:
import os

# Root of the mannheim-nel data directory; the loading cells build their
# paths from it.
data_path = '/home/rohitalyosha/Student_Job/mannheim-nel/data'
# Dataset names. Order matters: the loading cell treats the first two as the
# CoNLL splits (`datasets[:2]`) and the remainder as `raw_<name>.pickle` sets.
datasets = ['conll-train', 'conll-dev', 'msnbc', 'ace2004']
# SECURITY: the TagMe GCUBE token must not live in the notebook. It is read
# from the environment; export TAGME_GCUBE_TOKEN before starting the kernel.
# Any token that was ever committed here should be treated as leaked and
# revoked. The value is None when the variable is unset.
tagme.GCUBE_TOKEN = os.environ.get('TAGME_GCUBE_TOKEN')

In [96]:
# Both dictionaries live under `<data_path>/dicts`; derive the paths from
# `data_path` so that relocating the data only requires one change.
dicts_dir = join(data_path, 'dicts')
# `rd` is used below as `rd.get(ent, ent)` to normalise entity strings
# (presumably redirect title -> canonical title; confirm against the file).
rd = json_load(join(dicts_dir, 'redirects.json'))
# Presumably entity string -> integer id; not used in the cells shown here.
ent2id = json_load(join(dicts_dir, 'ent_dict.json'))

In [97]:
# id2c:     dataset name -> {doc id -> raw text}
# examples: dataset name -> list of gold examples
conll_dir = join(data_path, 'Conll')
id2c_conll = pickle_load(join(conll_dir, 'conll_raw_text.pickle'))
id2c = {
    'conll-train': id2c_conll['train'],
    'conll-dev': id2c_conll['dev'],
}
examples = {}

# Non-CoNLL sets: one pickle holds both the contexts and the examples.
for d_name in datasets[2:]:
    contexts, exs = pickle_load(join(data_path, 'datasets', f'raw_{d_name}.pickle'))
    id2c[d_name] = contexts
    examples[d_name] = exs

# CoNLL splits: the texts were loaded above, so only the examples are kept.
for d_name in datasets[:2]:
    split = d_name.rpartition('-')[2]
    _, exs = pickle_load(join(conll_dir, f"conll-{split}.pickle"))
    examples[d_name] = exs

In [98]:
# gold: dataset -> doc id -> parallel lists of gold mentions, entities and spans.
gold = {}
for name in datasets:
    gold[name] = {}

for dataset, exs in examples.items():
    per_doc = gold[dataset]
    # Each example is (doc id, (mention, entity string, span, <ignored>)).
    for c_id, (mention, ent_str, span, _) in exs:
        entry = per_doc.setdefault(c_id, {'mentions': [], 'ents': [], 'spans': []})
        entry['mentions'].append(mention)
        entry['ents'].append(ent_str)
        entry['spans'].append(span)

In [99]:
def get_response_mention(doc_id, ent_strs, text, user_mentions, user_spans):
    """Ask the local linking service to link the given mentions of one document.

    The arguments are serialised to JSON and POSTed to
    ``http://127.0.0.1:5000/link``.

    Args:
        doc_id: Identifier sent as ``doc_id``.
        ent_strs: Entity strings sent as ``ent_strs``.
        text: Document text sent as ``text``.
        user_mentions: Mention strings sent as ``mentions``.
        user_spans: Mention spans sent as ``spans``.

    Returns:
        Tuple ``(entities, mentions)`` taken from the ``'entities'`` and
        ``'mentions'`` fields of the JSON reply.
    """
    payload = {
        'ent_strs': ent_strs,
        'doc_id': doc_id,
        'text': text,
        'mentions': user_mentions,
        'spans': user_spans,
    }
    reply = requests.post("http://127.0.0.1:5000/link", data=json.dumps(payload))
    parsed = reply.json()
    return parsed['entities'], parsed['mentions']

In [100]:
def get_mention_results(num_text, dataset='conll-dev'):
    """Run the linker on the gold mentions of the first ``num_text`` documents.

    Only the first ``num_text`` entries of ``id2c[dataset]`` are considered;
    documents without gold annotations in ``gold[dataset]`` are skipped, so
    fewer than ``num_text`` results may be returned.

    Args:
        num_text: Maximum number of documents of ``id2c[dataset]`` to look at.
        dataset: Key into the global ``id2c`` and ``gold`` dictionaries.

    Returns:
        Dict mapping doc id to ``{'mentions': ..., 'ents': ...}`` as returned
        by ``get_response_mention``. Documents whose request failed are
        reported on stdout and omitted from the result.
    """
    results = {}
    for doc_id, text in list(id2c[dataset].items())[:num_text]:
        if doc_id not in gold[dataset]:
            continue
        doc_gold = gold[dataset][doc_id]
        user_mentions = doc_gold['mentions']
        user_spans = doc_gold['spans']
        user_ents = doc_gold['ents']
        # Progress output: the gold entities of the document being linked.
        print(user_ents)
        try:
            ents, mentions = get_response_mention(f'{dataset}-{doc_id}', user_ents, text, user_mentions, user_spans)
        except Exception as e:
            # Report the failure and skip the document. Falling through would
            # either hit unbound names or silently reuse the previous
            # document's predictions.
            print(f'Linking failed for {dataset}-{doc_id}: {e!r}')
            print(text, user_mentions)
            continue
        results[doc_id] = {'mentions': mentions, 'ents': ents}

    return results

## Ours

#### Evaluate linking only (gold mentions and spans are given)

In [109]:
# Dataset evaluated by the following cells; it is used as a key into `gold`
# and passed to `get_mention_results`, so it must be one of `datasets`.
DATASET = 'ace2004'

In [110]:
# Link the gold mentions of DATASET and score the predictions against gold.
mention_results = get_mention_results(10000, dataset=DATASET)

# Collect (gold, predicted) entity pairs, both normalised through `rd`.
normalized_pairs = []
for doc_id, doc_pred in mention_results.items():
    if doc_id not in gold[DATASET]:
        print(doc_id, doc_pred)
        continue
    doc_gold_ents = gold[DATASET][doc_id]['ents']
    # zip stops at the shorter list, so unmatched trailing items are not scored.
    normalized_pairs.extend(
        (rd.get(g, g), rd.get(p, p)) for g, p in zip(doc_gold_ents, doc_pred['ents'])
    )

total = len(normalized_pairs)
# Gold entities for which the linker returned no link at all.
no_links = [g for g, p in normalized_pairs if p == 'NO LINK FOUND']
num_no_link = len(no_links)
num_correct = sum(1 for g, p in normalized_pairs if g == p)

['Baghdad', 'Agence_France-Presse', 'Iraq']
['Tallahassee,_Florida', 'Florida', 'Supreme_Court_of_Florida', 'Florida_District_Courts_of_Appeal', 'Electoral_College_(United_States)']
['BBC_News', 'London', 'Supreme_Court_of_the_United_States', 'Ruth_Bader_Ginsburg', 'Supreme_Court_of_Florida', "Sandra_Day_O'Connor", 'Washington,_D.C.']
['Bandar_Seri_Begawan', 'Agence_France-Presse', 'Florida', 'Brunei']
['Washington,_D.C.', 'Supreme_Court_of_the_United_States', 'Joel_Grossman', 'Baltimore', 'Johns_Hopkins_University', 'Laurence_Tribe', 'Al_Gore', 'Florida_Legislature', 'United_States_Congress']
['Xinhua_News_Agency', 'Beijing', 'Weir_Group', 'United_States', 'Jiangsu', 'China', 'Rudong_County', 'Japan', 'South_Korea', 'Taiwan']
['Beirut', 'Agence_France-Presse', 'Lebanon', 'Israel']
['Zeist', 'Holland', 'Agence_France-Presse', 'Soviet_Union', 'Sweden', 'Amsterdam', 'Copenhagen', 'Stockholm', 'Lamin_Khalifah_Fhimah', 'Abdelbaset_al-Megrahi', 'Malta']
['Bandar_Seri_Begawan', 'Agence_Franc

In [111]:
# Correct links, evaluated pairs, accuracy. Prints nan instead of raising
# ZeroDivisionError when nothing was evaluated.
print(num_correct, total, num_correct / total if total else float('nan'))

220 257 0.8560311284046692


## Create training files from candidate files

In [112]:
import glob

In [113]:
# For every dataset, read the per-document candidate files under
# `<data_path>/cands/<dataset>/` and write
# `<data_path>/training_files/<dataset>.pickle` containing
# `(id2c[dataset], [(doc_id, mention, ent_str, cand_strs), ...])`.
cands_root = join(data_path, 'cands')
training_dir = join(data_path, 'training_files')

for dataset in datasets:
    # glob order is filesystem-dependent; sort for a reproducible pickle.
    f_names = sorted(glob.glob(join(cands_root, dataset, '*')))
    # A distinct name is used so the global `examples` dict (dataset -> gold
    # examples) built earlier is not overwritten by this cell.
    cand_examples = []
    for f_name in f_names:
        # The doc id is whatever follows the last '-' in the path.
        # NOTE(review): this assumes every file name contains a '-'; otherwise
        # part of the directory path ends up in the id. Confirm the naming.
        doc_id = f_name.split('-')[-1]
        try:
            doc_id = int(doc_id)
        except ValueError:
            # Non-numeric ids are kept as strings.
            pass
        with open(f_name) as f:
            for line in f:
                # Line format: mention||gold entity||candidate||candidate||...
                parts = line.strip().split('||')
                if len(parts) > 1:
                    mention, ent_str, *cand_strs = parts
                    cand_examples.append((doc_id, mention, ent_str, cand_strs))

    with open(join(training_dir, f'{dataset}.pickle'), 'wb') as f:
        pickle.dump((id2c[dataset], cand_examples), f)

In [115]:
# Candidate coverage: fraction of examples whose (redirect-normalised) gold
# entity appears among the first MAX_CANDS generated candidates.
MAX_CANDS = 128

for dataset in datasets:
    num_covered = 0
    num_examples = 0
    # Local names are used on purpose: unpacking into `id2c` / `examples` or
    # counting in `total` would overwrite the globals that earlier cells
    # depend on and break them when re-run.
    _contexts, cand_examples = pickle_load(join(data_path, 'training_files', f'{dataset}.pickle'))
    for _doc_id, _mention_str, ent_str, cand_gen_strs in cand_examples:
        num_examples += 1
        ent_str = rd.get(ent_str, ent_str)
        if ent_str in cand_gen_strs[:MAX_CANDS]:
            num_covered += 1
    # nan instead of ZeroDivisionError for a dataset without examples.
    coverage = num_covered / num_examples if num_examples else float('nan')
    print(dataset, num_covered, num_examples, coverage)

conll-train 18063 18626 0.9697734349833566
conll-dev 4707 4838 0.9729226953286482
msnbc 649 656 0.989329268292683
ace2004 232 258 0.8992248062015504
