# Annotation tool

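This tool is used to manually annotate predicted links, i.e., the answers returned by the embedding models for head and tail queries. It is currently configured for the FB15k-237 (`fb15k237`) and DBpedia datasets.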

First, specify the paths of the test files

In [24]:
import pickle as pkl
import numpy
import copy
import torch
import json
import ipywidgets as widgets
from ipywidgets import interact, Layout, ButtonStyle
from IPython.display import Markdown
import requests
from time import sleep
import random
import datetime
import os
import urllib
from functools import cmp_to_key

In [25]:
# Dataset to annotate; it selects both the embeddings folder and the benchmark files.
db_name = 'fb15k237'
# Identifier stored next to every annotation made during this session.
annotator = 'J'
# Query selection strategy:
#   0 -> always try to select new (not yet processed) queries first
#   1 -> select queries that were already annotated
mode = 1

# All paths are derived from db_name. main_folder keeps its trailing '/'
# because other cells append file names to it with plain concatenation.
main_folder = './binary-embeddings/{}/'.format(db_name)
annotations_file = '{}annotations/gold-annotations.json'.format(main_folder)
testdata_raw_path = '../benchmarks/{}/test2id.txt'.format(db_name)

In [26]:
testdata_folder = main_folder + 'answers/'


def _answers_path(model, side):
    """Return the path of the pickled answers file of `model` for `side` ('head' or 'tail') queries."""
    return testdata_folder + db_name + '-answers-' + model + '-test-10-' + side + '.pkl'


# One answers file per (model, query side) pair.
testdata_transe_path_head = _answers_path('transe', 'head')
testdata_transe_path_tail = _answers_path('transe', 'tail')
testdata_complex_path_head = _answers_path('complex', 'head')
testdata_complex_path_tail = _answers_path('complex', 'tail')
testdata_rotate_path_head = _answers_path('rotate', 'head')
testdata_rotate_path_tail = _answers_path('rotate', 'tail')

Load the dictionary

In [27]:
# Files that map entity / relation labels to their numeric ids for the selected dataset.
_benchmark_folder = '../benchmarks/' + db_name + '/'
ent_labels_path = _benchmark_folder + 'entity2id.txt'
rel_labels_path = _benchmark_folder + 'relation2id.txt'

In [28]:
def _load_labels(path):
    """Read a label dictionary file and return ``(labels, declared_count)``.

    The first line of the file holds the number of entries. Every following
    line is tab-separated: the label comes first, the numeric id second.
    ``labels`` maps the integer id to its label.

    Blank lines (e.g. an empty line at the end of the file) are skipped
    instead of raising an IndexError.
    """
    labels = {}
    with open(path, 'rt') as f:
        declared_count = int(f.readline())
        for line in f:
            if not line.strip():
                continue
            tkns = line.split('\t')
            # int() ignores the trailing newline that is still attached to the id
            labels[int(tkns[1])] = tkns[0]
    return labels, declared_count


# id -> label dictionaries; the header count is used as a sanity check
ent_labels, nents = _load_labels(ent_labels_path)
assert len(ent_labels) == nents, "unexpected number of entities in " + ent_labels_path
rel_labels, nrels = _load_labels(rel_labels_path)
assert len(rel_labels) == nrels, "unexpected number of relations in " + rel_labels_path

Load the raw test triples

In [29]:
# Set of (head, tail, relation) id triples read from the raw test file.
raw_test_triples = set()
with open(testdata_raw_path, 'rt') as f:
    # The first line holds the number of facts; reading it also skips the header.
    nfacts = int(f.readline())
    for line in f:
        fields = line.split(' ')
        # Column order in the file: head id, tail id, relation id
        raw_test_triples.add((int(fields[0]), int(fields[1]), int(fields[2])))

Load the test files

In [30]:
def _load_pickle(path):
    """Unpickle and return the object stored in the file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load answer files from a trusted source.
    with open(path, 'rb') as fin:
        return pkl.load(fin)


# Answers of each model, separately for head and tail queries.
testdata_transe_head = _load_pickle(testdata_transe_path_head)
testdata_transe_tail = _load_pickle(testdata_transe_path_tail)
testdata_complex_head = _load_pickle(testdata_complex_path_head)
testdata_complex_tail = _load_pickle(testdata_complex_path_tail)
testdata_rotate_head = _load_pickle(testdata_rotate_path_head)
testdata_rotate_tail = _load_pickle(testdata_rotate_path_tail)

Compute all the head and tail queries

In [31]:
def _group_answers_by_query(named_testsets):
    """Group the answers of several methods by query.

    `named_testsets` is a sequence of ``(method_name, testset)`` pairs, where
    every item of a testset is a dict with the keys 'ent', 'rel' and
    'answers_fil'. The result maps ``(ent, rel)`` to a dict
    ``{method_name: answers_fil}``. If a method contains the same query more
    than once, only its first occurrence is kept.
    """
    grouped = {}
    for method_name, testset in named_testsets:
        for record in testset:
            per_method = grouped.setdefault((record['ent'], record['rel']), {})
            if method_name not in per_method:
                per_method[method_name] = record['answers_fil']
    return grouped


queries_tail = _group_answers_by_query([
    ("transe", testdata_transe_tail),
    ("complex", testdata_complex_tail),
    ("rotate", testdata_rotate_tail),
])

queries_head = _group_answers_by_query([
    ("transe", testdata_transe_head),
    ("complex", testdata_complex_head),
    ("rotate", testdata_rotate_head),
])

Copy all the queries into a single list. Also, load all the queries previously annotated.

In [54]:
# Flat list with all the queries: head queries (type 0) first, then tail queries (type 1).
# The 'id' of a query is its position in the list.
queries = []
for query_type, grouped_queries in ((0, queries_head), (1, queries_tail)):
    for (ent_id, rel_id), answers_by_method in grouped_queries.items():
        queries.append({
            'id': len(queries),
            'type': query_type,
            'ent': ent_id,
            'rel': rel_id,
            'answers': answers_by_method,
        })
counter = len(queries)
print("# Queries:", len(queries))

# Queries: 22850


Global data structures

In [55]:
# Output area in which the current query and its answers are rendered.
out = widgets.Output(layout={'padding': '5px', 'border': '1px solid black'})
array_answers = []         # answers (with their annotations) of the query currently shown
valid_annotations = True   # set to False when the current query is skipped
current_query_id = None    # index in `queries` of the query currently shown
processed_queries = {}     # query id -> stored annotation record
n_valid_queries = 0
if os.path.exists(annotations_file):
    print("Loading annotations from file", annotations_file)
    # Use a context manager so that the file handle is closed right after parsing.
    with open(annotations_file, 'rt') as fin:
        loaded_queries = json.load(fin)
    # JSON object keys are always strings: convert them back to integer query ids.
    processed_queries = {int(k): v for k, v in loaded_queries.items()}
    n_valid_queries = sum(1 for v in processed_queries.values() if v['valid_annotations'])
print("# Processed Queries:", len(processed_queries), " # valid:", n_valid_queries)

Loading annotations from file ./binary-embeddings/fb15k237/annotations/gold-annotations.json
# Processed Queries: 260  # valid: 250


### Auxiliary functions

In [40]:
def _annotated_by(record, who):
    """Return True when `who` already annotated (or skipped) the stored query `record`."""
    # Every submit/skip appends the annotator to the 'annotators' list. Checking
    # it first also covers queries whose answers are all test-set answers: such
    # answers carry no per-annotator entry, so looking only at the answers would
    # select the same query over and over.
    if any(entry.get('annotator') == who for entry in record.get('annotators', [])):
        return True
    return any(
        annotation['annotator'] == who
        for annotated_answer in record['annotated_answers']
        for annotation in annotated_answer['checked']
    )


def pick_next_query():
    """Select the next query to annotate and store its id in `current_query_id`.

    With ``mode == 0`` a query that is not in `processed_queries` is selected:
    first with a few random draws, then (if they all fail) with a linear scan.
    With any other mode the first query in `processed_queries` that was not
    annotated by the current `annotator` is selected.

    Returns True when a query was selected, None when there is none left.
    """
    global current_query_id
    if mode == 0:
        if len(processed_queries) >= len(queries):
            return None
        # A few random draws are enough while most of the queries are unprocessed
        for _ in range(10):
            idx = random.randint(0, len(queries) - 1)
            if idx not in processed_queries:
                current_query_id = idx
                return True
        # Fall back to the first ID that is not in processed_queries
        for i in range(len(queries)):
            if i not in processed_queries:
                current_query_id = i
                return True
        raise AssertionError("no unprocessed query found")

    # mode == 1: pick a query not annotated by the current annotator
    for key, record in processed_queries.items():
        if not _annotated_by(record, annotator):
            current_query_id = key
            return True
    return None

In [41]:
def on_change_checkbox(b):
    """Store the new state of an answer checkbox in `array_answers`.

    `b` is the change notification of the checkbox: `b['owner']` is the widget
    and `b['new']` its new value. The position of the answer in
    `array_answers` is parsed from the widget description, which starts with
    "<index>.". Only the entry of the current `annotator` is updated; an
    AssertionError is raised if the answer has no such entry.
    """
    description = b['owner'].description
    answer_idx = int(description[:description.find('.')])
    is_checked = b['new'] is True
    for entry in array_answers[answer_idx]['checked']:
        if entry['annotator'] == annotator:
            entry['checked'] = is_checked
            break
    else:
        # The loop ended without finding the entry of the current annotator
        assert False

In [42]:
def dump_on_file():
    """Write `processed_queries` as JSON to `annotations_file`.

    If the file already exists it is first renamed to
    ``<annotations_file>-<current timestamp>``, so that every previous version
    is kept as a backup.
    """
    # First check if the file exists and, if so, keep it as a backup
    if os.path.exists(annotations_file):
        now = str(datetime.datetime.now())
        old_file = annotations_file + '-' + now
        os.rename(annotations_file, old_file)
    # The context manager guarantees that the content is flushed and the file
    # is closed before the function returns.
    with open(annotations_file, 'wt') as fout:
        json.dump(processed_queries, fout, indent = 6)

In [43]:
def on_click_button(b):
    """Callback of the 'Submit' button: store the current annotations and show the next query.

    The answers in `array_answers` are stored in `processed_queries` under
    `current_query_id`, together with the `valid_annotations` flag and the
    list of annotators (extended with the current one), and everything is
    dumped on file. Then the next query is selected and printed in `out`.

    Nothing happens if no query is currently selected. When there are no more
    queries, `current_query_id` is reset to None so that further clicks do not
    store the last query again.

    `b` is the clicked button (unused).
    """
    global processed_queries
    global current_query_id
    global valid_annotations

    # No query is displayed (nothing was selected or everything is done)
    if current_query_id is None:
        return

    out.clear_output()
    # Store the annotation
    query = queries[current_query_id]
    if current_query_id in processed_queries:
        annotators = processed_queries[current_query_id]['annotators']
    else:
        annotators = []
    annotators.append({'annotator' : annotator, 'date' : str(datetime.datetime.now()) })
    processed_queries[current_query_id] = {'query' : query, 'valid_annotations' : valid_annotations, 'annotated_answers' : array_answers, 'annotators' : annotators }
    dump_on_file()

    # Move to the next query
    with out:
        ok = pick_next_query()
        if ok is not None:
            query = queries[current_query_id]
            print_query_answers(query['id'], query['type'], query['ent'], query['rel'], query['answers'])
        else:
            # Forget the query just stored, otherwise another click would submit it again
            current_query_id = None
            print("No more queries to annotate.")

In [44]:
def on_click_skip_button(b):
    """Callback of the 'Skip' button: store the current query as not validly annotated.

    `b` is the clicked button; it is only forwarded to `on_click_button`.
    """
    global valid_annotations
    # on_click_button() saves this flag together with the query and then moves
    # to the next query.
    valid_annotations = False
    on_click_button(b)

In [47]:
def retrieve_wikidata_label(e):
    """Look up on Wikidata the entity whose `wdt:P646` value is `e`.

    The Wikidata SPARQL endpoint is queried and the first result is used.

    Returns a pair ``(label, entity URI)``. The lookup is best effort: if the
    request fails, times out, or returns no (or an unexpected) result, the
    pair ``('None', 'None')`` is returned instead of raising.
    """
    query = (
        'PREFIX wd: <http://www.wikidata.org/entity/> '
        'PREFIX wdt: <http://www.wikidata.org/prop/direct/> '
        "SELECT ?x ?xLabel WHERE { ?x wdt:P646 \"" + e + "\"; "
        "SERVICE wikibase:label { bd:serviceParam wikibase:language \"[AUTO_LANGUAGE],en\". } }"
    )
    try:
        # The timeout prevents a stalled connection from blocking the notebook forever
        r = requests.get(
            'https://query.wikidata.org/bigdata/namespace/wdq/sparql',
            params = {'format': 'json', 'query': query},
            timeout = 30,
        )
        if r:  # a response is falsy when the HTTP status signals an error
            bindings = r.json()['results']['bindings']
            # Take the first
            binding = bindings[0]
            value = binding['x']['value']
            lbl = binding['xLabel']['value']
            return lbl, value
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Network problem, invalid JSON, or no/unexpected results: fall through
        # to the placeholder. Unlike a bare `except:`, this does not swallow
        # KeyboardInterrupt, so the kernel can still be interrupted.
        pass
    return 'None', 'None'

In [52]:
def print_query_answers(query_id, typ, ent, rel, answers):
    """Display a query, one checkbox per candidate answer, and the known test-set answers.

    Parameters:
        query_id: id of the query; used in the header and to look up previous
            annotations in `processed_queries`.
        typ: 0 for a HEAD query (printed as "? rel ent"), 1 for a TAIL query
            (printed as "ent rel ?").
        ent, rel: numeric ids of the entity and of the relation of the query;
            they index `ent_labels` and `rel_labels`.
        answers: dict that maps a method name to its list of answers; every
            answer is a dict with (at least) the key 'entity_id'.

    Side effects:
        - `valid_annotations` is reset to True;
        - `array_answers` is rebuilt with the deduplicated answers of this query;
        - text, Markdown and checkbox widgets are displayed;
        - if `db_name` is 'fb15k237', Wikidata is queried for the label of the
          query entity and of every answer, sleeping one second before each
          answer lookup.

    Raises an AssertionError if the raw test set contains no answer for the query.
    """
    global processed_queries
    global array_answers
    global valid_annotations
    # A new query is about to be shown: clear the "skipped" flag
    valid_annotations = True
    # Statistics on the queries processed so far
    n_skipped = 0
    n_ok = 0
    n_annotated_answers = 0
    n_tail_queries = 0
    n_head_queries = 0
    for _, q in processed_queries.items():
        if q['valid_annotations']:
            n_ok += 1
            n_annotated_answers += len(q['annotated_answers'])
            if q['query']['type'] == 1:
                n_tail_queries += 1
            else:
                n_head_queries += 1
        else:
            n_skipped += 1
    print("Processed queries: {} Skipped: {} Ok: {} Head: {} Tail: {}".format(len(processed_queries), n_skipped, n_ok, n_head_queries, n_tail_queries))
    # NOTE(review): "Annnotated" is a typo in the displayed message
    print("Annnotated answers: {}\n".format(n_annotated_answers))
    typ_str = 'HEAD'
    if typ == 1:
        typ_str = 'TAIL'
    display(Markdown("***Query #{} Type {}***".format(query_id, typ_str)))
    # Only for fb15k237 the entity label is resolved via Wikidata; otherwise the
    # label stored in ent_labels is used as it is.
    if db_name == 'fb15k237':
        lbl, link_wikidata = retrieve_wikidata_label(ent_labels[ent])
        ent_str = '[' + lbl + ' ' + link_wikidata + ' (' + ent_labels[ent] + ')]'
    else:
        lbl, link_wikidata = (ent_labels[ent], "")
        ent_str = lbl
    if typ == 0:
        print("?", rel_labels[rel], ent_str)
    else:
        print(ent_str, rel_labels[rel], "?")

    previous_annotations = None
    array_answers = []
    # In mode 1 start from a copy of the stored answers, so that the annotations
    # of the other annotators are preserved.
    if mode == 1 and query_id in processed_queries:
        # NOTE(review): there is no space between "by" and the list in this message
        print("\nThis query was previously annotated by" + str(processed_queries[query_id]['annotators']))
        previous_annotations = processed_queries[query_id]['annotated_answers']
        array_answers = copy.deepcopy(previous_annotations)

    # Search link for the label of the query entity
    lbl_google = urllib.parse.urlencode({"q" : lbl})
    google_link = "https://www.google.com/search?hl=en&" + lbl_google
    display(Markdown("***Search on Google:*** {}".format(google_link)))
    print("\nAnswers (striked answers are the ones that are in the testset):")
    # Merge the answers of all the methods: an entity returned by several
    # methods gets one entry that lists all of them.
    for method, answers_method in answers.items():
        for i, answer in enumerate(answers_method):
            a = answer['entity_id']
            # Is the entity already in array_answers?
            found = False
            for j, array_answer in enumerate(array_answers):
                if array_answer['entity_id'] == a:
                    found = True
                    # add the method if not already there
                    method_found = False
                    for m in array_answer['methods']:
                        if m == method:
                            method_found = True
                            break
                    if not method_found:
                        array_answer['methods'].append(method)
                    # Add an entry with the current annotator if it does not exist
                    # (only for answers that can be annotated, i.e. 'enabled')
                    annotator_found = False
                    for c in array_answer['checked']:
                        if c['annotator'] == annotator:
                            annotator_found = True
                            break
                    if not annotator_found and array_answer['enabled']:
                        array_answer['checked'].append({'annotator' : annotator, 'checked' : False })
                    break
            if not found:
                # Is the answer known to be true? Triples are stored as (head, tail, relation)
                found = False
                if typ == 0 and (a, ent, rel) in raw_test_triples:
                    found = True
                if typ == 1 and (ent, a, rel) in raw_test_triples:
                    found = True
                if found:
                    # Test-set answers are marked as checked by 'Testset' and cannot be edited
                    array_answers.append({'entity_id' : a, 'checked' : [{'annotator' : 'Testset', 'checked' : True }], 'methods': [method], 'enabled' : False})
                else:
                    # If mode==1, I'm hiding the previous annotations so that two different annotators may annotate with different values
                    array_answers.append({'entity_id' : a, 'checked' : [{'annotator' : annotator, 'checked' : False }], 'methods': [method], 'enabled' : True})

    # One checkbox per answer. The description starts with "<index>. " because
    # on_change_checkbox() parses the index from it.
    for i, a in enumerate(array_answers):
        if db_name == 'fb15k237':
            sleep(1) # Some sleeping is necessary for wikidata
            lbl, link_wikidata = retrieve_wikidata_label(ent_labels[a['entity_id']])
        else:
            lbl, link_wikidata = (ent_labels[a['entity_id']],'')
        a_str = lbl
        desc = "{}. {} ({}) methods={}".format(i, a_str, a['entity_id'], a['methods'])
        if a['enabled'] == False:
            # Test-set answer: shown struck through, ticked and not editable
            assert(a['checked'][0]['annotator'] == 'Testset')
            assert(a['checked'][0]['checked'] == True)
            box = widgets.Checkbox(True, id=i, description="<strike>" + desc + "</strike>", layout=Layout(width='2000px', height='20px'), indent=False, disabled=True)
        else:
            # Editable answers always start unticked, whatever the other annotators selected
            box = widgets.Checkbox(False, id=i, description=desc, layout=Layout(width='2000px', height='20px'), indent=False)
        box.observe(on_change_checkbox, names="value")
        display(box)
        if a['enabled'] == True:
            # Search links to help the annotator to verify the answer
            lbl_google = urllib.parse.urlencode({"q" : lbl})
            google_link = "https://www.google.com/search?hl=en&" + lbl_google
            lbl_wikipedia = urllib.parse.urlencode({"search" : lbl})
            wikipedia_link = "https://en.wikipedia.org/w/index.php?" + lbl_wikipedia
            display(Markdown("&ensp;&ensp;&ensp;{} {}".format(google_link, wikipedia_link)))

    print("\n")
    display(Markdown("***Known answers from the testset:***"))
    # Scan all the raw test triples (head, tail, relation) for the answers of this query
    known_answers = []
    for triple in raw_test_triples:
        if triple[2] == rel:
            if typ == 0 and triple[1] == ent:
                known_answers.append(triple[0])
            if typ == 1 and triple[0] == ent:
                known_answers.append(triple[1])
    # The check below requires that the test set contains at least one answer for the query
    assert(len(known_answers) > 0)
    for known_answer in known_answers:
        lbl, link_wikidata = (ent_labels[known_answer],'')
        a_str = lbl
        desc = "{} ({})".format(a_str, known_answer)
        print(desc)

### Start the annotation process

In [53]:
# Show the first query inside the output area
out.clear_output()
ok = pick_next_query()
with out:
    if ok is not None:
        query = queries[current_query_id]
        print_query_answers(query['id'], query['type'], query['ent'], query['rel'], query['answers'])

# 'Submit' stores the annotations, 'Skip' stores the query as not validly annotated.
# The font weight is passed to the browser as a CSS value: 'bf' is not a valid
# one (it is silently ignored), 'bold' is.
b = widgets.Button(description='Submit', style=ButtonStyle(font_weight='bold'))
b.on_click(on_click_button)
b_skip = widgets.Button(description='Skip', style=ButtonStyle(font_weight='bold'))
b_skip.on_click(on_click_skip_button)
display(out)
display(b)
display(b_skip)

Output(layout=Layout(border='1px solid black', padding='5px'), outputs=({'name': 'stdout', 'text': 'Processed …

Button(description='Submit', style=ButtonStyle(font_weight='bf'))

Button(description='Skip', style=ButtonStyle(font_weight='bf'))