# Entity label index

- Which entity label index has the best recall?

In [245]:
# Imported here so this cell does not depend on a later cell having run first.
import trident

# Location of the Trident knowledge base and the URI namespace of DBpedia resources.
DB_PATH = 'data/dbpedia/2014_part/'
URI_PREFIX = 'http://dbpedia.org/resource/'
g = trident.Db(DB_PATH)
# Term ids of the rdf:type and rdfs:label predicates.
type_id = g.lookup_id('<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>')
label_id = g.lookup_id('<http://www.w3.org/2000/01/rdf-schema#label>')
# Identity comparison: `!= None` can be fooled by objects overriding __ne__.
assert label_id is not None, 'rdfs:label predicate not found in the Trident KB'
print('Trident KB has %d terms, %d triples' % (g.n_terms(), g.n_triples()))
def try_lookup(uri):
    """Return the Trident term id of ``uri``, or -1 when the lookup finds nothing.

    The URI is unquoted twice and re-quoted with ``util.quote`` before being
    wrapped in angle brackets and looked up in ``g``.
    """
    uri = util.quote(util.unquote(util.unquote(uri)))
    term_id = g.lookup_id('<%s>' % uri)
    # Only a missing term maps to -1. The previous `... or -1` also replaced a
    # legitimate id of 0 (falsy) with -1.
    # NOTE(review): assumes lookup_id returns None for unknown terms -- confirm.
    return -1 if term_id is None else term_id

Trident KB has 30727783 terms, 99968667 triples


In [1]:
import pandas as pd
import os
gold_path = 't2k/data/v1/'
# The property gold standard is needed to find the gold subject column of each table.
gold_prop_fname = os.path.join(gold_path, 'gs_property.csv')
gs_property = pd.read_csv(
    gold_prop_fname,
    header=None,
    names=['col', 'uri', 'gold'],
)

# Column ids look like "<table>.csv~Col<index>"; split them into their two parts.
split_col_ids = (col_id.split('.csv~Col') for col_id in gs_property['col'])
table_names, column_indexes = zip(*split_col_ids)
gs_property['table'] = table_names
gs_property['columnIndex'] = column_indexes
gs_property['columnIndex'] = gs_property['columnIndex'].map(int)

# The column annotated with rdfs:label is taken as the key column of its table.
is_label_col = gs_property['uri'] == 'http://www.w3.org/2000/01/rdf-schema#label'
key_cols = gs_property[is_label_col][['table', 'columnIndex']]
key_cols = key_cols.set_index('table')['columnIndex'].to_dict()
print(f'Loaded {len(key_cols)} key columns')

Loaded 233 key columns


In [2]:
# Instance gold standard: one (row id, entity URI, gold flag) record per line.
gs_instance_fname = os.path.join(gold_path, 'gs_instance.csv')
gs_instance = pd.read_csv(
    gs_instance_fname,
    header=None,
    names=['row', 'uri', 'gold'],
)
n_instances = len(gs_instance)
print('%d instances' % n_instances)

26124 instances


In [3]:
import sys, urllib.parse
REDIR = 'data/transitive_redirects_en.ttl'
# Maps the unquoted source URI of every redirect to its unquoted target URI.
redir = {}
# Context manager closes the (large) file deterministically; encoding is explicit
# so the result does not depend on the platform's default locale.
with open(REDIR, encoding='utf-8') as redirect_file:
    for i, line in enumerate(redirect_file):
        if 0 == i % 100000:
            print('Loading redirects line %d    ' % i, end='\r', file=sys.stderr)
        line = line.strip()
        # Skip blank lines and `#` comment lines instead of parsing them as triples.
        if not line or line.startswith('#'):
            continue
        # Drop the trailing " ." terminator, then split into subject, predicate, object.
        source, _, target = line[:-2].split(None, 2)
        # Strip the surrounding angle brackets and undo percent-encoding.
        source, target = urllib.parse.unquote(source[1:-1]), urllib.parse.unquote(target[1:-1])
        redir[source] = target

Loading redirects line 7600000    

In [4]:
import csv
import glob
import html
import os
import re

### LOAD TABLE CELLS ###
# Paths of all table files to load.
table_fnames = glob.glob('data_t2k/v1/tables/*.csv')
# Filled by the loading loop: row id -> cell values, and table name -> per-column numeric flag.
rowcontents = {}
table_col_isnumeric = {}
# Word-token pattern (only referenced by the commented-out tokenizer variant).
token_pattern = re.compile(r"(?u)\b\w+\b")
def tokenize(s):
    """Split ``s`` on whitespace and return the pieces as a tuple."""
    tokens = s.split()
    return tuple(tokens)
#     return tuple(token_pattern.findall(s.lower()))
def splitcells(cells):
    """Parse a T2KMatch-serialized cell string into a list of sets of token tuples.

    HTML entities are unescaped, '}' is rewritten to '{' so a single quote
    character delimits quoted parts, and every 'NULL' is removed. The string is
    then read as one '|'-delimited CSV record; each field is split on '|' again
    and every piece is tokenized.
    """
    # T2KMatch table serialization reader
    normalized = html.unescape(cells).replace('}', '{').replace('NULL', '')
    record = next(csv.reader([normalized], delimiter='|', quotechar='{'))
    parsed = []
    for field in record:
        parsed.append({tokenize(value) for value in field.split('|')})
    return parsed
# Characters counted as "numeric": digits, minus sign and dot.
NUM = re.compile('[-0-9.]')
def isnumeric(col):
    """Return True when, averaged over the non-empty cells of ``col``, more than
    half of each cell's characters match NUM. An all-empty column is not numeric.
    """
    nonempty = [cell for cell in col if cell]
    if not nonempty:
        return False
    ratios = [len(NUM.findall(cell)) / len(cell) for cell in nonempty]
    return (sum(ratios) / len(nonempty)) > 0.5
# Raw string: '\s' in a plain literal is an invalid escape sequence.
WHITESPACE = re.compile(r'\s+')
for fname in table_fnames:
    name = os.path.basename(fname)
    # newline='' is what the csv module expects; the context manager closes the file.
    with open(fname, 'r', newline='') as table_file:
        table = list(csv.reader(table_file))
    if not table:
        # An empty file has no header row to unpack and contributes no rows.
        continue
    header, *rows = table
    # Collapse every whitespace run inside a cell to a single space.
    rows = [[WHITESPACE.sub(' ', c) for c in r] for r in rows]
    # Parse each cell once and reuse the result for both outputs below.
    parsed_rows = [[splitcells(cell) for cell in row] for row in rows]
    # Per column: all tokens of every cell joined into one string, used only for
    # the numeric-column test.
    cols = zip(*[
        [' '.join(token for values in cell for value in values for token in value) for cell in row]
        for row in parsed_rows
    ])
    table_col_isnumeric[name] = {i: isnumeric(col) for i, col in enumerate(cols)}
    for i, row in enumerate(parsed_rows):
        # Per row: for each cell, the list of its values as space-joined strings.
        rowcontents['%s~Row%d' % (name, i)] = [
            [' '.join(value) for values in cell for value in values] for cell in row
        ]
print(f'Loaded {len(rowcontents)} rows')

Loaded 28333 rows


In [5]:
# Get all search queries
# Each query is a (key text, remaining text, gold URI, table) tuple; the set removes duplicates.
queries = set()
for row_id, uri in zip(gs_instance['row'], gs_instance['uri']):
    # Follow a redirect for the gold URI when one is known.
    gold_uri = redir.get(uri, uri)
    table = row_id.split('.csv~Row')[0]
    numeric = table_col_isnumeric.get('%s.csv' % table, {})
    keycol = key_cols.get(table, 0)
    key_values, rest_values = [], []
    for c, cell in enumerate(rowcontents.get(row_id, '')):
        # Columns flagged as numeric contribute to neither part of the query.
        if numeric.get(c, False):
            continue
        bucket = key_values if keycol == c else rest_values
        bucket.extend(cell)
    key = ' '.join(key_values)
    rest = ' '.join(rest_values)
    queries.add((key, rest, gold_uri, table))
print(f'Loaded {len(queries)} queries')

Loaded 26047 queries


# Label index with links

In [258]:
from elasticsearch import Elasticsearch
import logging, random, util
es = Elasticsearch()
# Only let CRITICAL messages from the Elasticsearch client logger through.
es_logger = logging.getLogger('elasticsearch')
es_logger.setLevel(logging.CRITICAL)

# Index queried by the search functions; alternative names are kept commented out.
# index='dbpedia2014_part_labeldocs_nosurface'
# index='dbpedia2014_part_labeldocs_surface'
# index='dbpedia2014_part_linklabeldocs_utf8fix'
# index='dbpedia_2014_labels_intl_labeldocs'
index = 'dbpedia2014_part_subjectdocs'
typ = 'linklabeldoc'

def query(key, rest):
    """Build the Elasticsearch request body for a label search on ``key``.

    Documents must have ``p`` equal to the global ``label_id``; a plain match and
    a phrase match (slop 1) on ``label`` are optional scoring clauses.
    ``rest`` is accepted but not used.
    """
    # Variants with a custom analyzer, kept for reference:
    #     {"match":{"label": {"query":key, "analyzer": "myanalyzer" }}},
    #     {"match_phrase":{"label": {"query":key, "slop":1, "analyzer": "myanalyzer"}}},
    label_predicate = {"term": {"p": label_id}}
    loose_match = {"match": {"label": {"query": key}}}
    phrase_match = {"match_phrase": {"label": {"query": key, "slop": 1}}}
    bool_clause = {
        "must": [label_predicate],
        "should": [loose_match, phrase_match],
    }
    return {"query": {"bool": bool_clause}}

def search(key, rest, size=50, cutoff=0.7):
    """Run the label query for ``key`` and return the list of raw hits.

    At most ``size`` hits are requested; an empty list is returned when the
    response carries no hits. ``cutoff`` is accepted but not used.
    """
    body = query(key, rest)
    body['size'] = size
    response = es.search(index=index, body=body)
    return response.get('hits', {}).get('hits', [])

# generic
# key, rest = 'green lake','green lake county wisconsin united states'
# key, rest = 'edward jones','investment firm' 
# key,rest = 'medical physics',''

# only works with redir & disambig
key, rest = 'singer museum', 'netherlands'
# key, rest = 'south tyneside district hospital', ''

# messy
# key,rest = 'sth manchester univ. hospital', '' # bad
# key,rest = 'south manchester university hospital', '' # good
# key, rest = 'air madagascar',''
# key, rest = 'invitrogn', ''
# key, rest = 'ado fm', ''

# Print one line per (hit, entity): rank, ES score, token Jaccard between the
# label and the key, Trident id, URI local name and the matched label.
hits = search(key, rest, size=100)
for i, hit in enumerate(hits):
#     print(f'{i:2d} {hit["_score"]:.2f}', hit['_source'])
#     print(f'{i:2d} {hit["_score"]:.2f}', '%30s %30s' % (hit['_source']['entity'], hit['_source']['label']))
    for entity in hit['_source']['ids']:
        if isinstance(entity, int):
            # Integer ids are resolved through `g`, the KB handle opened at the top
            # of the notebook (`db` is only created in a later cell). The local
            # name is what follows the last '/' inside the angle brackets.
            s, entity = entity, g.lookup_str(entity)[1:-1].split('/')[-1]
        else:
            # String ids are treated as URI local names and looked up in the KB.
            s, entity = try_lookup(URI_PREFIX + entity), entity
        label = hit['_source']['label']
        uripart = util.unquote(util.unquote(entity))
        tokenjacc = util.tokenjaccard(label, key)
        print(f'{i:2d} {hit["_score"]:5.2f} {tokenjacc:.2f} {s:9d} {uripart[:30]:>30s} {label:30s}')

 0 33.45 1.00  28017146                   Singer_Laren singer museum                 
 1  9.40 0.25  20756129                  Danny_Worsnop danny worsnop (singer)        
 1  9.40 0.25  17855406              Asking_Alexandria danny worsnop (singer)        
 2  9.40 0.33  21470306                    Eric_Singer eric singer                   
 2  9.40 0.33  21470307     Eric_Singer_(graphologist) eric singer                   
 3  9.40 0.33  23668387         Joseph_Singer_(bishop) james singer                  
 3  9.40 0.33  23340094          James_Singer_(comics) james singer                  
 4  9.40 0.17  19278178 Category:Don_Moen_(singer)_liv don moen (singer) live albums 
 5  9.40 0.25  19280744    Category:Duffy_(singer)_EPs duffy (singer) eps            
 6  9.40 0.25  19256618   Category:Danish_singer_stubs danish singer stubs           
 7  9.40 0.17  19257622 Category:David_Ball_(country_s david ball (country singer) albums
 8  9.40 0.25  19284508 Category:Eamon_(singer)_al

## Attribute query

In [266]:
def att_query(att, subject_id):
    """Build the Elasticsearch request body for an attribute search.

    Results are filtered to documents whose ``ids`` contain ``subject_id``; a
    plain match and a phrase match (slop 1) of ``att`` on ``label`` are optional
    scoring clauses.
    """
    subject_filter = {"term": {"ids": subject_id}}
    loose_match = {"match": {"label": {"query": att}}}
    phrase_match = {"match_phrase": {"label": {"query": att, "slop": 1}}}
    bool_clause = {
        "filter": [subject_filter],
        "should": [loose_match, phrase_match],
    }
    return {"query": {"bool": bool_clause}}

def att_search(att, subject_id, size=50, cutoff=0.7):
    """Yield ``(token_jaccard, p, label)`` for hits of the attribute query.

    At most ``size`` hits are requested. Hits whose label has a falsy token
    Jaccard with ``att`` are skipped. A missing ``label`` defaults to '' and a
    missing ``p`` to -1. ``cutoff`` is accepted but not used.
    """
    body = att_query(att, subject_id)
    body['size'] = size
    response = es.search(index=index, body=body)
    for hit in response.get('hits', {}).get('hits', []):
        source = hit.get('_source', {})
        label = source.get('label', '')
        p = source.get('p', -1)
        tokenjacc = util.tokenjaccard(label, att)
        if not tokenjacc:
            continue
        yield tokenjacc, p, label

# Example: attributes of subject 28017146 whose label overlaps with 'netherlands'.
hits = att_search('netherlands', 28017146)
for tokenjacc, p, label in hits:
    # Resolve the predicate through `g`, the KB handle opened at the top of the
    # notebook; `db` is only created in a later cell.
    print(f'{tokenjacc:5.2f}  {label[:40]:40s} {g.lookup_str(p)}')

 0.14  art museums and galleries in the netherl <http://purl.org/dc/terms/subject>


In [268]:
# Display the term id currently bound to label_id.
label_id

2

# Entity properties

In [167]:
DB_PATH = './data/dbpedia/2014_part/'
import trident
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.parse` is loaded.
import urllib.parse
db = trident.Db(DB_PATH)
type_id = db.lookup_id('<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>')
label_id = db.lookup_id('<http://www.w3.org/2000/01/rdf-schema#label>')

# uripart = 'Atmospheric_Environment'
# uripart = 'Schloss_Wei%C3%9Fenstein'
uripart = 'Newnans_Lake'
# Characters left unescaped when quoting a URI local name.
SAFE = ",()':&!/+*%"
def quote(uri):
    """Percent-encode ``uri``, leaving the characters in SAFE untouched."""
    return urllib.parse.quote(uri, SAFE)
uripart = quote(uripart)
s = db.lookup_id(f'<http://dbpedia.org/resource/{uripart}>')
# Explicit None check so that a term id of 0 is not reported as missing.
# NOTE(review): assumes lookup_id returns None for unknown terms -- confirm.
if s is not None:
    # Print only the rdfs:label statements of the entity.
    for p, o in db.po(s):
        if p == label_id:
            print(f'{db.lookup_str(p):50s} {db.lookup_str(o):50s}')
else:
    print('not found')

<http://www.w3.org/2000/01/rdf-schema#label>       "Newnans Lake"@en                                 


In [199]:
# Shell escape: list the lines of the T2KMatch surface-form file that contain "rthersee".
!grep "rthersee" ../T2KMatch/data/surfaceforms.txt

w c 3 b 6 rthersee	lake w%c3%b6rther	woerthersee	worthersee
hypo arena	w\u00f6rtherseestadion	klagenfurt
p c 3 b 6 rtschach am w c 3 b 6 rthersee	p%c3%b6rtschach	poertschach	portschach
w c 3 b 6 rthersee stadion	w%c3%b6rtherseestadion	waidmannsdorf	wortherseestadion
ein schlo c 3 9 f am w c 3 b 6 rthersee	ein schloss am worthersee
velden am w c 3 b 6 rther see	velden am w%c3%b6rthersee	velden am woerther see	velden am woerthersee
