In [1]:
import sys
import pandas
import trident
import collections
import elasticsearch
import numpy as np

## Params

In [5]:
csv_triples = '/export/scratch1/home/urbani/results/22mar-t2dv2-dbpedia2014/triples/gold/extracted_triples-GOLD.csv' # Example file
trident_dbpedia = '/export/scratch1/home/urbani/data2/dbpedia2014'
embeddings_dbpedia = '/export/scratch1/home/urbani/data2/dbpedia_embeddings/model-500'
elastic_endpoint = 'localhost:9999'
prefix_dbpedia = 'http://dbpedia.org/resource/'

## Load various inputs

In [6]:
tables = pandas.read_csv(csv_triples)
nrows = len(tables.index)
print(tables.columns.values)

['URL' 'Table' 'Column Index' 'Subject URI' 'Subject Value'
 'Subject Confidence' 'Predicate URI' 'Predicate Value'
 'Predicate Confidence' 'Data Type' 'Object Value' 'Object Value in KB'
 'Object Value matches KB' 'Object Value Similarity']


In [7]:
db = trident.Db(trident_dbpedia)
es = elasticsearch.Elasticsearch()

In [8]:
label_id = db.lookup_relid('<http://www.w3.org/2000/01/rdf-schema#label>')
print(label_id)

15


In [9]:
emb = trident.Emb(embeddings_dbpedia)

## Function to disambiguate the triples

In [35]:
TableResults = collections.namedtuple('TableResults', 'redundant novel errors numeric')
#redundant= number of triples which are already in the KB (and we can verify this easily)
#novel= triples which we can correctly disambiguate and are not in the KB
#errors=triples which are not in the KB but for which we cannot extract a suitable entity (e.g., the KB does not contain suitable labels, etc.)

In [11]:
def getCandidatesFromElasticSearch(entity_label):
    results = []
    try:
        out = es.search(index="dbpedia2014part_intl_labels_labeldocs", body={"query": {"match": {'label': entity_label}}})
        # I only get the top 10 hits at most. They should be enough
        out = out['hits']['hits']
        for hit in out:
            source = hit['_source']
            ids = source['ids']
            for ent in ids:
                full_ent =  '<' + prefix_dbpedia + ent + '>'
                id_ent = db.lookup_id(full_ent)
                if id_ent is not None:
                    results.append(id_ent)
                else:
                    pass                
    except Exception as e:
        print("Failed getting candidates from the KB for", entity_label, " error", str(e))
    if len(results) == 0:
        return None
    else:
        return results

In [41]:
def isNumeric(label):
    if type(label) == float:
        return True
    return label.isdigit()

In [47]:
def processTriples(triples):
    redundant = 0 
    novel = 0 
    errors = 0
    numeric = 0
    
    for triple in triples:
        # Get the ID for the subject
        subject_str = triple[0]
        subject_id = db.lookup_id('<' + subject_str + '>')
        if subject_id is None:
            print("Warning, I cannot find an ID for the resource", subject_str)
            continue
        # Get the ID for the predicate
        predicate_str = triple[1]
        predicate_id = db.lookup_relid('<' + predicate_str + '>')
        if predicate_id is None:
            print("Warning, I cannot find an ID for the resource", predicate_str)
            continue        
        # Get the label of the object
        object_label = triple[2]
        
        # Before we continue, we check whether the KB has already an object for the give s,p
        found_obj = False
        found_errors = False
        is_numeric = False
        
        objects_kb = db.o(subject_id, predicate_id)
        if len(objects_kb) > 0:            
            # Get the labels of the objects
            for object_kb in objects_kb:
                object_kb_ent = db.lookup_str(object_kb)
                if object_kb_ent is not None:
                    object_kb_label = db.o(object_kb, label_id)
                    if len(object_kb_label) > 0:                        
                        found_label = db.lookup_str(object_kb_label[0])
                        if found_label is not None:
                            # TODO: check the two strings are sufficiently similar
                            #print(found_label, object_label)
                            found_obj = True
                            break
                    else:
                        #print("Cannot get the label for", object_kb_ent)
                        found_errors = True
                        break                    
            
        if not found_errors and not found_obj:            
            #First check whether the string is not a numeral or a date
            if isNumeric(object_label):
                is_numeric = True
            else:            
                # If it is not, get some potential candidate from elastic search
                candidates = getCandidatesFromElasticSearch(object_label)
                if candidates is not None:
                    top_cand_before_ranking = candidates[0]
                    # Rank the top-k candidates
                    candidates = rank(subject_id, predicate_id, candidates)
                    top_cand_after_ranking = candidates[0]
                    chosen_candidate = db.lookup_str(top_cand_after_ranking)
                    if top_cand_before_ranking != top_cand_after_ranking:
                        print(subject_str, predicate_str)
                        print(db.lookup_str(top_cand_before_ranking), db.lookup_str(top_cand_after_ranking))
                else:
                    found_errors = True
        
        # Increase the counters
        if found_errors:
            errors += 1
        elif is_numeric:
            numeric += 1
        elif not found_obj:
            novel += 1
        else:
            redundant += 1
    return TableResults(novel=novel, redundant=redundant, errors=errors, numeric=numeric)

In [37]:
def rank(subj, pred, candidates):
    rankedList = []
    # Get the embedding of the subject
    emb_s = emb.get_e(subj)
    # Get the embedding of the predicate
    emb_r = emb.get_r(pred)
    emb_sum = emb_s + emb_r
    k = 0
    for candidate in candidates:
        # Get the embedding of the candidate
        emb_cand = emb.get_e(candidate)
        # Measure the similarity
        sim = np.sum(np.abs(emb_sum - emb_cand))
        #print(sim, candidate, db.lookup_str(candidate))
        rankedList.append((sim, candidate))
        # Only consider the top-10 candidates
        k += 1        
        if k == 10:
            break
    # Sort the list
    rankedList.sort()
    #if rankedList[0][1] != candidates[0]:
    #    print(db.lookup_str(subj), db.lookup_relstr(pred))
    #    print("was", db.lookup_str(candidates[0]), "now", db.lookup_str(rankedList[0][1]))
    candidates = [can for (sim, can) in rankedList]            
    return candidates

## Disambiguate the objects

In [48]:
tidx = 0
for tname, table in tables.groupby('Table'):
    print("Processing table", tidx, tname)
    triples = []
    for rdix, row in table.iterrows():         
        triple = (row['Subject URI'], row['Predicate URI'], row['Object Value'])
        triples.append(triple)        
    print(processTriples(triples))
    tidx += 1
    if tidx >= 10:
        break # For now, just disambiguate one table

Processing table 0 10151359_0_8168779773862259178.csv
TableResults(redundant=238, novel=2, errors=6, numeric=0)
Processing table 1 10630177_0_4831842476649004753.csv
http://dbpedia.org/resource/Princess_Tower http://dbpedia.org/ontology/location
<http://dbpedia.org/resource/Dubai_Municipality> <http://dbpedia.org/resource/Dubai>
http://dbpedia.org/resource/Al_Yaqoub_Tower http://dbpedia.org/ontology/location
<http://dbpedia.org/resource/Dubai_Municipality> <http://dbpedia.org/resource/Dubai>
http://dbpedia.org/resource/Eureka_Tower http://dbpedia.org/ontology/location
<http://dbpedia.org/resource/Division_of_Melbourne> <http://dbpedia.org/resource/Melbourne_City_Centre>
http://dbpedia.org/resource/China_World_Trade_Center_Tower_III http://dbpedia.org/ontology/location
<http://dbpedia.org/resource/SS_City_of_Peking> <http://dbpedia.org/resource/Beijing>
http://dbpedia.org/resource/Princess_Tower http://dbpedia.org/ontology/location
<http://dbpedia.org/resource/Dubai_Municipality> <http:

TableResults(redundant=0, novel=0, errors=430, numeric=0)
Processing table 4 11599512_1_280388135214354946.csv
http://dbpedia.org/resource/Sierra_Leone http://dbpedia.org/ontology/region
<http://dbpedia.org/resource/Air_Afrique> <http://dbpedia.org/resource/Africa>
http://dbpedia.org/resource/Barbados http://dbpedia.org/ontology/region
<http://dbpedia.org/resource/Centrale_(Croydon)> <http://dbpedia.org/resource/Central_Administrative_Okrug>
http://dbpedia.org/resource/Tanzania http://dbpedia.org/ontology/region
<http://dbpedia.org/resource/Air_Afrique> <http://dbpedia.org/resource/Africa>
http://dbpedia.org/resource/Guyana http://dbpedia.org/ontology/region
<http://dbpedia.org/resource/Arrondissements_of_the_Corse-du-Sud_department> <http://dbpedia.org/resource/South_Australia>
http://dbpedia.org/resource/England http://dbpedia.org/ontology/region
<http://dbpedia.org/resource/Category:Europe_(band)_members> <http://dbpedia.org/resource/Northern_Europe>
http://dbpedia.org/resource/Malt

http://dbpedia.org/resource/Pakistan http://dbpedia.org/ontology/region
<http://dbpedia.org/resource/Asia_(Roman_province)> <http://dbpedia.org/resource/Asia>
http://dbpedia.org/resource/Djibouti http://dbpedia.org/ontology/region
<http://dbpedia.org/resource/Air_Afrique> <http://dbpedia.org/resource/Africa>
http://dbpedia.org/resource/Maldives http://dbpedia.org/ontology/region
<http://dbpedia.org/resource/Asia_(Roman_province)> <http://dbpedia.org/resource/Asia>
http://dbpedia.org/resource/The_Gambia http://dbpedia.org/ontology/region
<http://dbpedia.org/resource/Air_Afrique> <http://dbpedia.org/resource/Africa>
http://dbpedia.org/resource/Malaysia http://dbpedia.org/ontology/region
<http://dbpedia.org/resource/Asia_(Roman_province)> <http://dbpedia.org/resource/Asia>
http://dbpedia.org/resource/Bulgaria http://dbpedia.org/ontology/region
<http://dbpedia.org/resource/Category:Europe_(band)_members> <http://dbpedia.org/resource/East-Central_Europe>
http://dbpedia.org/resource/Ivory_Co

http://dbpedia.org/resource/Gabon http://dbpedia.org/ontology/region
<http://dbpedia.org/resource/Air_Afrique> <http://dbpedia.org/resource/Africa>
http://dbpedia.org/resource/Herm http://dbpedia.org/ontology/region
<http://dbpedia.org/resource/Category:Europe_(band)_members> <http://dbpedia.org/resource/Northern_Europe>
http://dbpedia.org/resource/Cuba http://dbpedia.org/ontology/region
<http://dbpedia.org/resource/Centrale_(Croydon)> <http://dbpedia.org/resource/Centrale_Region,_Togo>
http://dbpedia.org/resource/New_Caledonia http://dbpedia.org/ontology/region
<http://dbpedia.org/resource/Anie> <http://dbpedia.org/resource/Oceanside,_California>
http://dbpedia.org/resource/Western_Sahara http://dbpedia.org/ontology/region
<http://dbpedia.org/resource/Air_Afrique> <http://dbpedia.org/resource/Africa>
http://dbpedia.org/resource/Jersey http://dbpedia.org/ontology/region
<http://dbpedia.org/resource/Category:Europe_(band)_members> <http://dbpedia.org/resource/Northern_Europe>
http://dbp

http://dbpedia.org/resource/Greenland http://dbpedia.org/ontology/capital
<http://dbpedia.org/resource/Nuuk_(album)> <http://dbpedia.org/resource/Nuuk>
http://dbpedia.org/resource/Macau http://dbpedia.org/ontology/capital
<http://dbpedia.org/resource/Category:Macao> <http://dbpedia.org/resource/Macau>
http://dbpedia.org/resource/French_Guiana http://dbpedia.org/ontology/capital
<http://dbpedia.org/resource/Cayenne_pepper> <http://dbpedia.org/resource/Category:Cayenne>
http://dbpedia.org/resource/Switzerland http://dbpedia.org/ontology/capital
<http://dbpedia.org/resource/Bern,_Idaho> <http://dbpedia.org/resource/Bern>
http://dbpedia.org/resource/Guyana http://dbpedia.org/ontology/timeZone
<http://dbpedia.org/resource/4.0> <http://dbpedia.org/resource/EMV>
http://dbpedia.org/resource/Suriname http://dbpedia.org/ontology/timeZone
<http://dbpedia.org/resource/3.0_(professional_wrestling)> <http://dbpedia.org/resource/BMW_E9>
http://dbpedia.org/resource/Chile http://dbpedia.org/ontology/ti

http://dbpedia.org/resource/Final_Fight http://dbpedia.org/ontology/releaseDate
<http://dbpedia.org/resource/UTC+04:00> <http://dbpedia.org/resource/Category:Wikipedia_Signpost_archives_2007-04>
http://dbpedia.org/resource/Street_Fighter_II_Turbo:_Hyper_Fighting http://dbpedia.org/ontology/releaseDate
<http://dbpedia.org/resource/July_2007> <http://dbpedia.org/resource/UTC+07:00>
http://dbpedia.org/resource/Gradius_III http://dbpedia.org/ontology/releaseDate
<http://dbpedia.org/resource/UTC+09:00> <http://dbpedia.org/resource/Quoted-printable>
http://dbpedia.org/resource/Super_Mario_RPG:_Legend_of_the_Seven_Stars http://dbpedia.org/ontology/releaseDate
<http://dbpedia.org/resource/UTC+08:00> <http://dbpedia.org/resource/OO_gauge>
http://dbpedia.org/resource/Donkey_Kong_Country http://dbpedia.org/ontology/releaseDate
<http://dbpedia.org/resource/Midnight> <http://dbpedia.org/resource/UTC+12:00>
http://dbpedia.org/resource/Donkey_Kong_Country_3:_Dixie_Kong's_Double_Trouble! http://dbpedi