In [2]:
!pip install datasets
!pip install fastembed



In [3]:
from datasets import load_from_disk

# Load the dialog dataset stored in the local "filtered_first_1000" directory
# (the path is relative to the notebook's working directory).
dataset = load_from_disk("filtered_first_1000")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
"Which party is Daniel Ermentrout affiliated with ?".replace("Daniel Ermentrout", "[9]")

'Which party is [9] affiliated with ?'

In [5]:
def mask_entities(dialog):
    """Add a ``masked_utterance`` to every USER turn of ``dialog``.

    Each entity mention listed in the turn's
    ``verbalized_entities_in_utterance`` is replaced by ``[i]``, where ``i`` is
    the position of that mention in the list (the first position, if the same
    mention is listed more than once). Turns whose speaker is not ``"USER"``
    are left untouched.

    All mentions are substituted in a single pass over the original utterance,
    so a placeholder that has just been inserted is never rewritten by a later
    entity (an entity called ``"0"`` cannot corrupt ``"[0]"``). When mentions
    overlap, the longest one wins; empty mentions are ignored.

    Args:
        dialog: Mapping with a ``"turns"`` list. Every turn has a ``"speaker"``
            key; USER turns also have ``"utterance"`` and
            ``"verbalized_entities_in_utterance"``.

    Returns:
        The same ``dialog`` object, modified in place.
    """
    import re  # local import: the surrounding cell has no import block

    for turn in dialog["turns"]:
        if turn["speaker"] != "USER":
            continue

        # Map each distinct, non-empty mention to the index of its first occurrence.
        placeholder_index = {}
        for i, entity in enumerate(turn["verbalized_entities_in_utterance"]):
            if entity:
                placeholder_index.setdefault(entity, i)

        masked_utterance = turn["utterance"]
        if placeholder_index:
            # Longest first, so the alternation prefers "New York City" over
            # "New York" when both start at the same position.
            mentions = sorted(placeholder_index, key=len, reverse=True)
            pattern = re.compile("|".join(re.escape(mention) for mention in mentions))
            masked_utterance = pattern.sub(
                lambda match: f"[{placeholder_index[match.group(0)]}]",
                masked_utterance,
            )

        turn["masked_utterance"] = masked_utterance
    return dialog


In [7]:
def get_entry(text):
    """Return the dialog turn referenced by an annotation line.

    Args:
        text: Three whitespace-separated fields, ``"<label> <dialog id> <turn>"``,
            for example ``"Yes QA_713:112 20"``. The first field is ignored.

    Returns:
        ``dataset[row]["turns"][int(<turn>)]``, where ``row`` is the position of
        ``<dialog id>`` in ``dataset["id"]``. Note that this is a single turn,
        not the whole dialog.

    Raises:
        ValueError: If ``text`` does not consist of exactly three fields.
    """
    # split() with no argument tolerates repeated spaces, tabs and a trailing
    # newline, which split(" ") would turn into extra (or polluted) fields.
    _, id_, turn = text.split()

    # Position of the dialog whose id matches the second field.
    real_index = dataset["id"].index(id_)

    return dataset[real_index]["turns"][int(turn)]


In [8]:
# Entity ids attached to the first turn of the first dialog.
dataset[0]["turns"][0]["entities_in_utterance"]

['Q5217081']

In [9]:
def find_properties_for_entity(entity_id):
    """Build the SPARQL query that lists the properties used on a Wikidata entity.

    The query selects ``?p`` (the property) and ``?pLabel`` (its English label)
    for the direct claims of ``wd:<entity_id>``. It has no ``DISTINCT``, so the
    same property can appear more than once in the results.

    Args:
        entity_id: Wikidata entity id without prefix, e.g. ``"Q5217081"``. It is
            inserted into the query verbatim.

    Returns:
        The query as a single-line string.
    """
    query_parts = (
        'SELECT ?pLabel ?p WHERE { wd:',
        entity_id,
        ' ?a ?b.  ?p wikibase:directClaim ?a .'
        ' SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }}',
    )
    return "".join(query_parts)

# Preview the generated query text for the example entity.
find_properties_for_entity("Q5217081")

'SELECT ?pLabel ?p WHERE { wd:Q5217081 ?a ?b.  ?p wikibase:directClaim ?a . SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }}'

In [10]:
# Ask the Wikidata SPARQL endpoint which properties are used on the example entity.
import requests

query = find_properties_for_entity("Q5217081")

url = 'https://query.wikidata.org/sparql'
# The timeout stops the cell from hanging forever on a stalled connection;
# raise_for_status() surfaces HTTP errors (e.g. rate limiting) directly instead
# of as a confusing JSON decode failure on the error page.
r = requests.get(url, params={'query': query, 'format': 'json'}, timeout=60)
r.raise_for_status()

data = r.json()

data

{'head': {'vars': ['pLabel', 'p']},
 'results': {'bindings': [{'p': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/P509'},
    'pLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'cause of death'}},
   {'p': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/P535'},
    'pLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'Find a Grave memorial ID'}},
   {'p': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/P569'},
    'pLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'date of birth'}},
   {'p': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/P570'},
    'pLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'date of death'}},
   {'p': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/P570'},
    'pLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'date of death'}},
   {'p': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/P646'},
    'pLabel': {'xml:lang': 'en', 'type': 'literal', 'value':

In [11]:
# One line per result row: the property's label followed by its id, which is
# the last path segment of the property URI.
for binding in data["results"]["bindings"]:
    label = binding["pLabel"]["value"]
    property_id = binding["p"]["value"].rsplit("/", 1)[-1]
    print(label, property_id)
    

cause of death P509
Find a Grave memorial ID P535
date of birth P569
date of death P570
date of death P570
Freebase ID P646
Open Library ID P648
family name P734
given name P735
work location P937
US Congress Bio ID P1157
languages spoken, written or signed P1412
name in native language P1559
FAST ID P2163
Geni.com profile ID P2600
Prabook ID P3368
WorldCat Identities ID (superseded) P7859
Political Graveyard politician ID P8462
United States House of Representatives ID P11500
image P18
place of birth P19
sex or gender P21
country of citizenship P27
instance of P31
position held P39
position held P39
educated at P69
member of political party P102
occupation P106
occupation P106
place of burial P119
ISNI P213
VIAF ID P214
Library of Congress authority ID P244


In [12]:
from fastembed import TextEmbedding

In [13]:
# Create the fastembed text-embedding model with its default settings; it is
# shared by all embedding calls below.
embedding_model = TextEmbedding()

Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 19826.02it/s]


In [14]:
def create_embeddings(property_names):
    """Embed property labels with the shared ``embedding_model``.

    Every label is first wrapped in the prompt ``"Wikidata property name: <label>"``.

    Args:
        property_names: Iterable of property labels.

    Returns:
        Whatever ``embedding_model.embed`` returns for the list of prompts.
    """
    prompts = []
    for label in property_names:
        prompts.append(f"Wikidata property name: {label}")
    return embedding_model.embed(prompts)


# Property labels in the order the query returned them; `data` is the parsed
# JSON response of the property-listing request.
property_names = [binding["pLabel"]["value"] for binding in data["results"]["bindings"]]
# list() materialises the result of embed() so it can be indexed and reused.
property_embeddings = list(create_embeddings(property_names))

In [15]:
# Property ids (e.g. "P509"), parallel to property_names: the id is the last
# path segment of each property's URI.
property_ids = [
    row["p"]["value"].rsplit("/", 1)[-1]
    for row in data["results"]["bindings"]
]
property_ids

['P509',
 'P535',
 'P569',
 'P570',
 'P570',
 'P646',
 'P648',
 'P734',
 'P735',
 'P937',
 'P1157',
 'P1412',
 'P1559',
 'P2163',
 'P2600',
 'P3368',
 'P7859',
 'P8462',
 'P11500',
 'P18',
 'P19',
 'P21',
 'P27',
 'P31',
 'P39',
 'P39',
 'P69',
 'P102',
 'P106',
 'P106',
 'P119',
 'P213',
 'P214',
 'P244']

In [16]:
# The question asked in the first turn of the first dialog.
dataset[0]["turns"][0]["utterance"]

'Which party is Daniel Ermentrout affiliated with ?'

In [17]:
# Embed the question from the first turn. Unlike the property labels, it is
# embedded as-is, without the "Wikidata property name: " prefix. embed() gets a
# one-element list, so the first item of the result is the question's embedding.
query_embedding = list(
    embedding_model.embed(["Which party is Daniel Ermentrout affiliated with ?"])
)[0]

In [18]:
import numpy as np


def print_top_k(query_embedding, embeddings, property_names, property_ids, k=5):
    """Print the ``k`` properties whose embeddings score highest against the query.

    The score is the dot product of each embedding with ``query_embedding``.
    That equals cosine similarity only if the vectors have unit length, which
    is assumed here and not checked.

    Args:
        query_embedding: 1-D vector for the query.
        embeddings: Sequence of vectors, one per property, with the same
            dimension as ``query_embedding``.
        property_names: Labels, parallel to ``embeddings``.
        property_ids: Ids, parallel to ``embeddings``.
        k: Maximum number of lines to print. Fewer are printed when there are
            fewer embeddings; nothing is printed for ``k <= 0``.

    Returns:
        None. One ``"Rank <n>: <name> <id>"`` line is printed per result.
    """
    if len(embeddings) == 0:
        # Nothing to rank; np.dot would fail on an empty operand.
        return

    scores = np.dot(embeddings, query_embedding)
    # Indices of the scores from highest to lowest.
    ranking = np.argsort(scores)[::-1]
    # Slicing (instead of range(k)) caps the output at the number of available
    # embeddings; max() keeps a negative k from slicing from the end.
    for rank, idx in enumerate(ranking[: max(k, 0)], start=1):
        print(f"Rank {rank}: {property_names[idx]} {property_ids[idx]}")


# Rank the example entity's properties against the embedded question (default k=5).
print_top_k(query_embedding, property_embeddings, property_names, property_ids)

Rank 1: member of political party P102
Rank 2: Political Graveyard politician ID P8462
Rank 3: US Congress Bio ID P1157
Rank 4: VIAF ID P214
Rank 5: United States House of Representatives ID P11500


In [19]:
def get_property_value_query(property_id, entity_id):
    """Build the SPARQL query that fetches the values of one property of an entity.

    Args:
        property_id: Wikidata property id without prefix, e.g. ``"P102"``.
        entity_id: Wikidata entity id without prefix, e.g. ``"Q5217081"``.

    Returns:
        A single-line query selecting ``?value`` and its English ``?valueLabel``.
        Both ids are inserted into the query verbatim.
    """
    triple = f"wd:{entity_id} wdt:{property_id} ?value ."
    label_service = 'SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }'
    return f"SELECT ?value ?valueLabel WHERE {{ {triple} {label_service} }}"

# Fetch the values of property P102 for the example entity.
property_query = get_property_value_query("P102", "Q5217081")

# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() reports HTTP errors directly instead of as a JSON decode
# failure on the error page.
r = requests.get(url, params={'query': property_query, 'format': 'json'}, timeout=60)
r.raise_for_status()

# NOTE: this rebinds `data`, which until now held the property-listing
# response. Re-run the property-listing request before re-running any earlier
# cell that reads `pLabel` / `p` bindings from `data`.
data = r.json()

data

{'head': {'vars': ['value', 'valueLabel']},
 'results': {'bindings': [{'value': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q29552'},
    'valueLabel': {'xml:lang': 'en',
     'type': 'literal',
     'value': 'Democratic Party'}}]}}

In [20]:
# One line per result row: the value's label followed by its id, which is the
# last path segment of the value URI.
for binding in data["results"]["bindings"]:
    value_label = binding["valueLabel"]["value"]
    value_id = binding["value"]["value"].rsplit("/", 1)[-1]
    print(value_label, value_id)

Democratic Party Q29552


In [21]:
# The dataset's second turn of the same dialog, for comparison with the value
# retrieved from Wikidata above.
dataset[0]["turns"][1]["utterance"], dataset[0]["turns"][1]["entities_in_utterance"]

('Democratic Party', ['Q29552'])