In [1]:
import rdflib
import pandas as pd
import numpy as np
import csv
import sklearn.metrics


In [2]:
# Namespace helpers: attribute or item access (WD.Q428160, WDT["P58"]) expands to the full URIRef.
# Wikidata entities (Q-ids).
WD = rdflib.Namespace('http://www.wikidata.org/entity/')
# Wikidata direct properties (P-ids).
WDT = rdflib.Namespace('http://www.wikidata.org/prop/direct/')
# ddis.ch/atai vocabulary.
DDIS = rdflib.Namespace('http://ddis.ch/atai/')
# RDFS vocabulary bundled with rdflib; RDFS.label is used for entity labels.
RDFS = rdflib.namespace.RDFS
# schema.org vocabulary.
SCHEMA = rdflib.Namespace('http://schema.org/')

In [3]:
# Sanity check: indexing a Namespace with a local name yields the full URIRef.
WDT["P58"]

rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P58')

In [4]:
# Parse the knowledge graph into an in-memory rdflib Graph.
# NOTE(review): every other data file in this notebook is read from '../data/', but this path is 'data/' — confirm which is right.
# NOTE(review): the file has an .nt extension yet is parsed with format='turtle' — presumably intentional; confirm.
graph = rdflib.Graph().parse('data/14_graph.nt', format='turtle')

In [5]:
def _load_id_map(path):
    """Read a tab-separated file of (index, URI) rows into forward and inverse lookups.

    Args:
        path: Location of the ``.del`` file; every row holds an integer index
            in the first column and a URI in the second.

    Returns:
        A ``(uri2id, id2uri)`` pair. ``uri2id`` maps ``rdflib.term.URIRef`` to
        ``int``; ``id2uri`` is its inverse.
    """
    # newline='' leaves line-ending handling to the csv module, as the csv docs recommend.
    with open(path, 'r', newline='') as ifile:
        uri2id = {rdflib.term.URIRef(uri): int(idx) for idx, uri in csv.reader(ifile, delimiter='\t')}
    id2uri = {idx: uri for uri, idx in uri2id.items()}
    return uri2id, id2uri


# Embedding matrices; rows are addressed through the integer ids loaded below.
entity_emb = np.load('../data/entity_embeds.npy')
relation_emb = np.load('../data/relation_embeds.npy')

# URI <-> row-index lookups for entities and relations.
ent2id, id2ent = _load_id_map('../data/entity_ids.del')
rel2id, id2rel = _load_id_map('../data/relation_ids.del')

In [6]:
# Label lookups in both directions, built from every rdfs:label triple in the graph.
# When an entity has several labels (or a label is shared), the pair iterated last wins.
ent2lbl = dict((subj, str(label)) for subj, label in graph.subject_objects(RDFS.label))
lbl2ent = dict(zip(ent2lbl.values(), ent2lbl.keys()))

In [7]:
# What does the graph record as the occupation (wdt:P106) of Jean Van Hamme?
occupation_query = '''
    prefix wdt: <http://www.wikidata.org/prop/direct/>
    prefix wd: <http://www.wikidata.org/entity/>
    
    SELECT ?obj ?lbl WHERE {
        ?ent rdfs:label "Jean Van Hamme"@en .
        ?ent wdt:P106 ?obj .
        ?obj rdfs:label ?lbl .
    }
    '''
professions = set(graph.query(occupation_query))
# Show the result as {Q-id: label}, stripping the entity namespace from each URI.
{uri[len(WD):]: str(label) for uri, label in professions}

{'Q329737': 'butcher'}

In [8]:
# TransE treats a relation as a translation: for a true triple, head + relation lands near the tail.
head = entity_emb[ent2id[WD.Q428160]]  # embedding of "Jean Van Hamme"
pred = relation_emb[rel2id[WDT.P106]]  # embedding of the "occupation" relation
lhs = head + pred                      # predicted position of the tail entity

# Distance from the predicted position to *every* entity embedding (smaller = more plausible).
dist = sklearn.metrics.pairwise_distances(lhs[np.newaxis, :], entity_emb).ravel()

# Entity indices ordered from most to least plausible.
most_likely = dist.argsort()
# Rank of each entity (0 = best): the inverse of the ordering above, so dist is not sorted twice.
ranks = most_likely.argsort()

In [9]:
# Ten closest candidate occupations according to the embeddings, best first.
pd.DataFrame(
    [
        (id2ent[idx][len(WD):], ent2lbl[id2ent[idx]], dist[idx], rank)
        for rank, idx in enumerate(most_likely[:10], start=1)
    ],
    columns=('Entity', 'Label', 'Score', 'Rank'),
)

Unnamed: 0,Entity,Label,Score,Rank
0,Q36180,writer,3223.151367,1
1,Q33999,actor,3382.129883,2
2,Q6625963,novelist,3397.58252,3
3,Q1930187,journalist,3492.735352,4
4,Q4610556,model,3549.961182,5
5,Q639669,musician,3616.397949,6
6,Q67311526,Obalky knih.cz,3628.230957,7
7,Q150,French,3667.630615,8
8,Q1028181,painter,3694.66748,9
9,Q245068,comedian,3744.303955,10


# Recovering

The Masked Gang: Cyprus (Q7750525) has no screenwriter (P58) statement in the graph:

In [7]:
# All screenwriter (wdt:P58) objects stored for wd:Q7750525; an empty set means the triple is missing.
set(graph.objects(WD.Q7750525, WDT.P58))

set()

In [12]:
# Let's see if we can recover the missing screenwriter from the embeddings...

# head entity: "The Masked Gang: Cyprus" (wd:Q7750525)
head = entity_emb[ent2id[WD.Q7750525]]
# relation: wdt:P58 (screenwriter)
pred = relation_emb[rel2id[WDT.P58]]
# combine according to the TransE scoring function
lhs = head + pred
# compute distance to *any* entity
dist = sklearn.metrics.pairwise_distances(lhs.reshape(1, -1), entity_emb).reshape(-1)
# keep only the single most plausible tail (index of the smallest distance)
most_likely = dist.argsort()[0]
print(most_likely)
# map the row index back to its entity URI
entity = id2ent[most_likely]
str(entity)
# NOTE: most_likely is a single index here, not a ranking. To show a top-10
# table like the occupation example, keep the full dist.argsort() array and
# slice it instead.

102605


'http://www.wikidata.org/entity/Q5058838'

# Entity Similarity

In [31]:
# Which entities lie closest to "Titanic" (wd:Q44578) in embedding space?
ent = ent2id[WD.Q44578]
# Distance from the query embedding to every entity embedding; the query itself scores 0.
dist = sklearn.metrics.pairwise_distances(entity_emb[ent].reshape(1, -1), entity_emb).reshape(-1)
# Smaller distance means more similar, so order ascending.
most_likely = dist.argsort()

pd.DataFrame(
    [
        (
            id2ent[idx][len(WD):],  # qid
            ent2lbl[id2ent[idx]],   # label
            dist[idx],              # score (distance to the query entity)
            rank,                   # rank, starting at 1
        )
        for rank, idx in enumerate(most_likely[:15], start=1)
    ],
    columns=('Entity', 'Label', 'Score', 'Rank'),
)

Unnamed: 0,Entity,Label,Score,Rank
0,Q44578,Titanic,0.0,1
1,Q24871,Avatar,2266.246094,2
2,Q170564,Terminator 2: Judgment Day,2386.475342,3
3,Q46717,Pirates of the Caribbean: The Curse of the Bla...,2396.940674,4
4,Q128518,Gladiator,2398.77124,5
5,Q162255,The Terminator,2409.545898,6
6,Q104814,Aliens,2429.442627,7
7,Q103569,Alien,2432.364014,8
8,Q193573,Moulin Rouge!,2442.012939,9
9,Q463313,Romeo + Juliet,2445.643555,10
