# Embeddings for the DDIS Movie Graph

## Setup

In [None]:
# imports
import csv
import numpy as np
import os
import rdflib
import pandas as pd
from sklearn.metrics import pairwise_distances

In [None]:
# Namespace shortcuts: indexing or attribute access on a Namespace yields a full URI,
# e.g. WD['Q132863'] -> http://www.wikidata.org/entity/Q132863

# vocabularies shipped with / independent of Wikidata
RDFS = rdflib.namespace.RDFS
SCHEMA = rdflib.Namespace('http://schema.org/')

# dataset-specific namespace of the DDIS movie graph
DDIS = rdflib.Namespace('http://ddis.ch/atai/')

# Wikidata: entities (Q-ids) and direct properties (P-ids)
WD = rdflib.Namespace('http://www.wikidata.org/entity/')
WDT = rdflib.Namespace('http://www.wikidata.org/prop/direct/')

## Load the data

In [None]:
# Build an empty in-memory graph, then fill it from the dump on disk.
# NOTE(review): the file has an .nt extension but is read with the Turtle parser;
# N-Triples is a subset of Turtle, so this is expected to work - confirm before changing.
graph = rdflib.Graph()
graph.parse(os.path.join('data', 'ddis-movie-graph.nt'), format='turtle')

In [None]:
# Paths of the id dictionaries (read in the next cell); they translate
# entity / relation URIs into the integer ids used to index the arrays below.
entity_file = os.path.join('data', 'entity_ids.del')
relation_file = os.path.join('data', 'relation_ids.del')

# Pre-trained embedding arrays, indexed by those integer ids.
entity_emb = np.load(os.path.join('data', 'entity_embeds.npy'))
relation_emb = np.load(os.path.join('data', 'relation_embeds.npy'))

In [None]:
# Build URI <-> integer-id lookups from the tab-separated dictionary files.
# Every row is read as `<id>\t<uri>`.

# entities: URI -> id, then the inverse id -> URI
with open(entity_file, 'r') as ifile:
    ent2id = {
        rdflib.term.URIRef(uri): int(index)
        for index, uri in csv.reader(ifile, delimiter='\t')
    }
id2ent = dict(zip(ent2id.values(), ent2id.keys()))

# relations: URI -> id, then the inverse id -> URI
with open(relation_file, 'r') as ifile:
    rel2id = {
        rdflib.term.URIRef(uri): int(index)
        for index, uri in csv.reader(ifile, delimiter='\t')
    }
id2rel = dict(zip(rel2id.values(), rel2id.keys()))

In [None]:
# entity URI -> label string, taken from every rdfs:label triple in the graph
# (if an entity has several labels, the one iterated last is kept)
ent2lbl = {
    subject: str(label)
    for subject, label in graph.subject_objects(RDFS.label)
}

# label string -> entity URI (if several entities share a label, the last one wins)
lbl2ent = dict(zip(ent2lbl.values(), ent2lbl.keys()))

## Find similar movies from embeddings

In [None]:
# Exercise template: list the 20 entities whose embeddings are closest to a chosen movie.
# Every `...` below is a placeholder that has to be filled in before the cell can run.

# Find the Wikidata ID for the movie (https://www.wikidata.org/wiki/Q132863 is the ID for "Finding Nemo")
# Choose your own movie and put its Q-id in place of '...'
movie = WD['...']

# Look up the integer id of the movie in the entity dictionary
# (ent2id maps entity URIs to the ids used to index entity_emb)
movie_id = ent2id[...]

# we compare the embedding of the query entity to all other entity embeddings:
# reshape(1, -1) turns the query vector into a one-row matrix, and flatten()
# turns the one-row result into a 1-D array of cosine distances
distances = pairwise_distances(entity_emb[movie_id].reshape(1, -1), entity_emb, metric='cosine').flatten()

# and sort them by distance
# (the result is sliced and iterated as entity indices below, so it should hold
# indices into `distances` - presumably ordered from closest to farthest)
most_likely = ...

# we print rank, entity ID, entity label, and distance
for rank, idx in enumerate(most_likely[:20]):
    rank = rank + 1  # enumerate counts from 0; ranks are displayed 1-based
    ent = ... # entity URI for index idx, eg: http://www.wikidata.org/entity/Q132863
    q_id = ent.split('/')[-1] # to convert 'http://www.wikidata.org/entity/Q132863' to 'Q132863'
    lbl = ... # human-readable label of ent, eg: 'Finding Nemo'
    dist = ... # distance of this entity to the query, eg: 0.0

    print(f'{rank:2d}. {dist:.3f} {q_id:10s} {lbl}')

In [None]:
# Exercise template: predict the genre of `movie` (chosen in the previous cell) by
# translating its embedding with the "genre" relation embedding and ranking all entities.
# Every `...` below is a placeholder that has to be filled in before the cell can run.
movie_emb = entity_emb[ent2id[movie]]

# Find the predicate (relation) of the genre (https://www.wikidata.org/wiki/Property:P136 is the ID for "genre")
genre = WDT['...']
genre_emb = relation_emb[rel2id[genre]]

# combine according to the TransE scoring function
# (TransE models a plausible triple as head + relation ≈ tail)
lhs = ... + ...

# compute distance to *any* entity
# (no metric is passed here, so scikit-learn's default metric is used rather than
# the cosine metric of the previous cell; reshape(-1) yields a 1-D array)
distances = pairwise_distances(lhs.reshape(1, -1), entity_emb).reshape(-1)

# find most plausible tails
# (sliced and iterated as entity indices below - presumably ordered from closest to farthest)
most_likely = ...

# show most likely entities
for rank, idx in enumerate(most_likely[:20]):
    rank = rank + 1  # enumerate counts from 0; ranks are displayed 1-based
    ent = ... # entity URI for index idx, eg: http://www.wikidata.org/entity/Q157443
    q_id = ent.split('/')[-1] # keep only the trailing Q-id of the URI
    lbl = ... # human-readable label of ent, eg: 'comedy film'
    dist = ... # distance of this entity to lhs, eg: 3000.0

    print(f'{rank:2d}. {dist:.3f} {q_id:10s} {lbl}')