In [9]:
import re

import chromadb
import requests
from fastembed import TextEmbedding


In [6]:


# SPARQL: every Wikidata property with its English label and description.
# The label service is used in manual mode, so the output variable names
# (?propertyLabel, ?propertyDesc) are bound explicitly inside the SERVICE block.
query = """SELECT ?property ?propertyLabel ?propertyDesc WHERE {
    ?property a wikibase:Property .
    SERVICE wikibase:label {
      bd:serviceParam wikibase:language "en" .
      ?property schema:description ?propertyDesc .
      ?property rdfs:label ?propertyLabel .
   }
 }"""

url = 'https://query.wikidata.org/sparql'

# The timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get(url, params = {'query': query, 'format': 'json'}, timeout = 60)

# Surface HTTP errors here with their status code instead of letting them show
# up as a confusing JSON decode failure on the next line.
r.raise_for_status()

data = r.json()


In [14]:
# Look at the description cell of the first result row to see how a single
# SPARQL JSON value is laid out before extracting fields in bulk.
data["results"]["bindings"][0]["propertyDesc"]

{'xml:lang': 'en',
 'type': 'literal',
 'value': 'head of the executive power of this town, city, municipality, state, country, or other governmental body'}

In [None]:
# Flatten the SPARQL bindings into three parallel lists that share one index:
# property URI, English label, English description.
props, labels, descs = [], [], []

for binding in data["results"]["bindings"]:
    has_desc = "propertyDesc" in binding
    if not has_desc:
        # Show rows that came back without a description; they are kept with
        # an empty description so the three lists stay aligned.
        print(binding)
    descs.append(binding["propertyDesc"]["value"] if has_desc else "")

    props.append(binding["property"]["value"])
    labels.append(binding["propertyLabel"]["value"])


In [12]:
# Build the embedding model with no arguments, i.e. whatever fastembed uses by
# default. The output below suggests model files are fetched at this point.
model = TextEmbedding()

Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 41896.49it/s]




In [25]:
def prepare_embeddings(props, labels, descs, embedding_model=None):
    """Embed one descriptive prompt per Wikidata property.

    Args:
        props: Property identifiers. Not used when building the prompts; the
            parameter is kept so existing calls keep working.
        labels: Property labels.
        descs: Property descriptions, paired with ``labels`` by position
            (pairs are formed with ``zip``, so surplus items in the longer
            sequence are ignored).
        embedding_model: Object exposing ``embed(list_of_str)``. When omitted,
            the notebook-level ``model`` is used.

    Returns:
        Whatever ``embedding_model.embed`` returns for the prompts, unchanged.
    """
    if embedding_model is None:
        # Fall back to the model created earlier in the notebook.
        embedding_model = model

    emb_prompts = [
        f"Wikidata property: {label} ({desc})" for label, desc in zip(labels, descs)
    ]

    return embedding_model.embed(emb_prompts)

# Embed every property prompt; list() collects the result so it can be indexed
# and iterated more than once in the cells below.
embeddings = list(prepare_embeddings(props, labels, descs))

    

In [5]:
# Chroma client whose data is stored on disk under ../props_vector_db
# (a relative path, so it depends on the notebook's working directory).
client = chromadb.PersistentClient(path="../props_vector_db")

In [17]:
# get_or_create keeps this cell re-runnable: create_collection fails once the
# collection already exists in the persisted database, which previously meant
# swapping to get_collection by hand.
colection = client.get_or_create_collection("wikidata_properties")

In [34]:
# Display the first embedding as a plain Python list of floats (long output).
embeddings[0].tolist()

[-0.006174369249492884,
 0.005588064435869455,
 0.06050939857959747,
 -0.054765526205301285,
 0.037459246814250946,
 0.01666261814534664,
 0.1484055370092392,
 -0.04147284850478172,
 -0.022853853181004524,
 0.061576809734106064,
 -0.03567548468708992,
 -0.06317933648824692,
 -0.03275105729699135,
 -0.003169655567035079,
 -0.029477577656507492,
 0.007349770050495863,
 -0.03480827435851097,
 0.017788385972380638,
 0.02960040606558323,
 0.05679330229759216,
 0.10352617502212524,
 0.022318484261631966,
 -0.025733938440680504,
 0.014975679107010365,
 0.023815834894776344,
 -0.024014851078391075,
 -0.011209270916879177,
 -0.010946623049676418,
 -0.03097260184586048,
 -0.06991128623485565,
 0.0193684883415699,
 -0.05373602360486984,
 -0.024300523102283478,
 0.03814404457807541,
 -8.181569910448161e-07,
 -0.02765263244509697,
 -0.01216935645788908,
 0.04901999980211258,
 -0.03336353227496147,
 -0.001970768440514803,
 0.04831106960773468,
 0.038648203015327454,
 0.02383076399564743,
 -0.0180182

In [35]:
# Convert each embedding to a plain list of floats before handing it to Chroma.
# The hasattr guard keeps the cell re-runnable: on a second run the items are
# already lists, which have no .tolist().
embeddings = [
    embedding.tolist() if hasattr(embedding, "tolist") else list(embedding)
    for embedding in embeddings
]

In [45]:
# upsert rather than add, so re-running the cell refreshes the stored entries
# for ids that are already present instead of trying to insert them again.
colection.upsert(
    ids=props,
    embeddings=embeddings,
    # Same prompt text that was embedded, stored alongside each vector.
    documents=[
        f"Wikidata property: {label} ({desc})" for label, desc in zip(labels, descs)
    ],
    # "prop" holds the last path segment of the property URI so queries can
    # filter on it.
    metadatas=[
        {"label": label, "desc": desc, "prop": prop.split("/")[-1]}
        for label, desc, prop in zip(labels, descs, props)
    ],
)

In [6]:
import datasets

# Load the dataset from the local directory ../simple_questions
# (it must already have been saved there; nothing is fetched by this line).
dataset = datasets.load_from_disk("../simple_questions")

In [7]:
# For the turn at index 2 of the first record, show the question text, its
# relations, and the entities mentioned in it.
tuple(
    dataset[0]["turns"][2][field]
    for field in ("utterance", "relations", "entities_in_utterance")
)

('What were the positions for which Daniel Ermentrout were appointed ?',
 ['P39'],
 ['Q5217081'])

In [10]:
def find_properties_for_entity(entity_id):
    """Return the ids of the direct-claim properties used on a Wikidata entity.

    Args:
        entity_id: Wikidata entity id such as "Q5217081".

    Returns:
        List of property ids (e.g. "P39"), each listed once.

    Raises:
        ValueError: If ``entity_id`` is not a plain Wikidata id.
        requests.HTTPError: If the query service answers with an error status.
    """
    # entity_id is spliced into the SPARQL text, so accept nothing but a plain
    # id (a letter followed by digits, optionally with a "-X123" sub-id suffix).
    if not re.fullmatch(r"[A-Z][0-9]+(-[A-Z][0-9]+)?", entity_id):
        raise ValueError(f"not a valid Wikidata entity id: {entity_id!r}")

    # DISTINCT: the pattern matches once per statement on the entity, so a
    # property with several values would otherwise be returned several times.
    # Labels are not requested because only the property id is used.
    query = (
        "SELECT DISTINCT ?p WHERE { wd:"
        + entity_id
        + " ?a ?b . ?p wikibase:directClaim ?a . }"
    )

    url = "https://query.wikidata.org/sparql"
    r = requests.get(url, params={"query": query, "format": "json"}, timeout=60)
    # Report HTTP failures directly rather than as a JSON decode error.
    r.raise_for_status()
    data = r.json()

    # Each ?p value is a URI; keep only its last path segment (the property id).
    return [
        binding["p"]["value"].split("/")[-1]
        for binding in data["results"]["bindings"]
    ]


# Properties used on Q5217081, the entity listed for the sample question
# displayed earlier; used below to narrow the vector search.
restricted_props = find_properties_for_entity("Q5217081")

In [18]:
# Natural-language question to match against the indexed property prompts.
query = "What were the positions for which Daniel Ermentrout were appointed ?"

# A one-item batch goes in; take the first (only) vector out as a plain list.
query_embedding = next(iter(model.embed([query]))).tolist()

# Nearest-neighbour search limited to properties found on the entity.
result = colection.query(
    query_embeddings=query_embedding,
    where={"prop": {"$in": restricted_props}},
)

result

{'ids': [['http://www.wikidata.org/entity/P39',
   'http://www.wikidata.org/entity/P106',
   'http://www.wikidata.org/entity/P69',
   'http://www.wikidata.org/entity/P937',
   'http://www.wikidata.org/entity/P8462',
   'http://www.wikidata.org/entity/P1157',
   'http://www.wikidata.org/entity/P214',
   'http://www.wikidata.org/entity/P569',
   'http://www.wikidata.org/entity/P102',
   'http://www.wikidata.org/entity/P2163']],
 'distances': [[0.9757838249206543,
   1.0132941007614136,
   1.0224051475524902,
   1.03183114528656,
   1.0387098789215088,
   1.0429688692092896,
   1.0550448894500732,
   1.088775873184204,
   1.1193854808807373,
   1.119476079940796]],
 'metadatas': [[{'desc': 'subject currently or formerly holds the object position or public office',
    'label': 'position held',
    'prop': 'P39'},
   {'desc': 'occupation of a person; see also "field of work" (Property:P101), "position held" (Property:P39)',
    'label': 'occupation',
    'prop': 'P106'},
   {'desc': 'educa