In [1]:
%load_ext lab_black

In [2]:
from typing import List, Dict, Tuple

In [3]:
from SPARQLWrapper import SPARQLWrapper2

# Exploratory cell: connect to the YAGO SPARQL endpoint and try out a few
# hand-written queries. Only `query_uri` is actually sent at the bottom; the
# other query strings are defined but not executed in this cell.
# sparql = SPARQLWrapper2("http://dbpedia.org/sparql")
sparql = SPARQLWrapper2("https://yago-knowledge.org/sparql/query")

# Every distinct predicate used by any triple, sorted by predicate.
query_p = """
    SELECT DISTINCT ?p WHERE {
     ?s ?p ?obj . 
    }  ORDER BY ?p
"""

# Up to 100 (subject, object) rows for the yago:capital predicate.
query = """PREFIX yago: <http://yago-knowledge.org/resource/>
    SELECT * WHERE {
        ?s yago:capital ?obj .
    } 
    LIMIT 100"""

# All rdfs:label values of yago:United_Arab_Emirates together with their
# language tag (the English-only filter is commented out inside the query).
query_l = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX yago: <http://yago-knowledge.org/resource/>

SELECT ?label ?lang
WHERE {
  yago:United_Arab_Emirates rdfs:label ?label . 
  BIND(LANG(?label) AS ?lang)
#   FILTER (LANG(?label) = 'en')
} ORDER BY ?label"""

# Strip the yago: namespace from a fixed entity URI, leaving the local name.
query_uri = """PREFIX yago: <http://yago-knowledge.org/resource/>

SELECT (strafter(str(?entityURI), str(yago:)) AS ?yagoEntity)
WHERE {
  BIND(<http://yago-knowledge.org/resource/United_Arab_Emirates> AS ?entityURI)
}"""
# Send the selected query to the endpoint.
sparql.setQuery(query_uri)


# Print each result row (a dict mapping variable name -> bound value).
for result in sparql.query().bindings:
    print(result)

{'yagoEntity': Value(literal:'United_Arab_Emirates')}


In [4]:
##################################
# 1. Get all relevant predicates #
##################################
from SPARQLWrapper import SPARQLWrapper2

# Re-create the endpoint client and the "all distinct predicates" query so
# this cell does not depend on which query the previous cell left configured.
# sparql = SPARQLWrapper2("http://dbpedia.org/sparql")
sparql = SPARQLWrapper2("https://yago-knowledge.org/sparql/query")

# Every distinct predicate used by any triple, sorted by predicate.
query_p = """
    SELECT DISTINCT ?p WHERE {
     ?s ?p ?obj . 
    }  ORDER BY ?p
"""

# Configure the client with the predicate query; it is executed further down
# by sparql.query().
sparql.setQuery(query_p)


def extract_name_from_uri(uri):
    """Split a predicate URI into a short knowledge-base name and a relation.

    The URI must look like ``http://[www.]<domain>.org/<relation>``; the
    relation is everything after the first ``.org/`` and may itself contain
    slashes or ``#``.

    Returns:
        A ``(kb_name, relation)`` tuple, where ``kb_name`` is one of
        ``"schema"``, ``"yago"`` or ``"w3"``.

    Raises:
        ValueError: if the URI does not have the expected shape.
        KeyError: if the domain is not one of the three known ones.
    """
    import re

    uri_match = re.match(r"http://(?:www\.)?([^\/]+)\.org\/(.+)$", uri)
    if uri_match is None:
        raise ValueError(
            f"Could not find match containing kb_domain and relation for uri {uri}."
        )
    kb_domain, relation = uri_match.group(1), uri_match.group(2)

    # Map the host part of the URI onto the short name used for the KB.
    kb_name = {"schema": "schema", "yago-knowledge": "yago", "w3": "w3"}[kb_domain]
    return kb_name, relation


from collections import namedtuple

# One predicate of the knowledge graph: its full URI, the short name of the
# knowledge base it belongs to, and the relation part of the URI.
Predicate = namedtuple("Predicate", ["uri", "kb_name", "relation"])

# Relations that are dropped even when their knowledge base is kept.
ineligible_relations = ["schema#fromClass", "schema#fromProperty", "logo", "image"]
relevant_preds: List[Predicate] = []

# Run the predicate query and keep everything that is neither a w3 predicate
# nor one of the explicitly excluded relations.
for result in sparql.query().bindings:
    uri = result["p"].value
    kb_name, relation = extract_name_from_uri(uri)
    if kb_name == "w3" or relation in ineligible_relations:
        continue
    relevant_preds.append(Predicate(uri, kb_name, relation))

print(relevant_preds)
print([pred.uri for pred in relevant_preds])

[Predicate(uri='http://schema.org/about', kb_name='schema', relation='about'), Predicate(uri='http://schema.org/actor', kb_name='schema', relation='actor'), Predicate(uri='http://schema.org/address', kb_name='schema', relation='address'), Predicate(uri='http://schema.org/administrates', kb_name='schema', relation='administrates'), Predicate(uri='http://schema.org/affiliation', kb_name='schema', relation='affiliation'), Predicate(uri='http://schema.org/alternateName', kb_name='schema', relation='alternateName'), Predicate(uri='http://schema.org/alumniOf', kb_name='schema', relation='alumniOf'), Predicate(uri='http://schema.org/area', kb_name='schema', relation='area'), Predicate(uri='http://schema.org/author', kb_name='schema', relation='author'), Predicate(uri='http://schema.org/award', kb_name='schema', relation='award'), Predicate(uri='http://schema.org/birthDate', kb_name='schema', relation='birthDate'), Predicate(uri='http://schema.org/birthPlace', kb_name='schema', relation='birth

In [5]:
#######################################################################
# 2. Extract all subject-object pairs for each pred in relevant preds #
#######################################################################
from time import sleep
from urllib.error import HTTPError
from SPARQLWrapper.SPARQLExceptions import EndPointInternalError
import os
import json
from tqdm import tqdm
import time


def get_so_pairs_for_pred(pred: Predicate) -> List[Tuple[str, str]]:
    """Fetch up to 1000 randomly ordered (subject, object) pairs for a predicate.

    Subjects and objects are reported by their English ``rdfs:label`` when one
    exists and by their raw value otherwise. An object whose label is the
    literal "Generic instance" is reported by the English label of its type
    when that is available.

    The query is sent through the module-level ``sparql`` client, and the
    elapsed wall-clock time is always printed.

    Args:
        pred: predicate whose ``uri`` is substituted into the query.

    Returns:
        A list of ``(subject, object)`` string pairs, or ``None`` when the
        endpoint answered with an ``HTTPError`` / ``EndPointInternalError``.

    Raises:
        Any other exception raised while querying is propagated to the caller
        (it is no longer silently discarded).
    """
    start_time = time.time()
    print(f"Querying {pred.uri}.")

    # The rdf:/rdfs: prefixes are declared explicitly so the query does not
    # depend on the endpoint predefining them.
    query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT  ?output_s ?output_obj WHERE {{
  ?s <{}> ?obj . # which predicate to use
  OPTIONAL {{ 
    ?s rdfs:label ?s_label .
  	FILTER (LANG(?s_label) = 'en')
  }} # Get the label (name) for the subject if it exists
  BIND(COALESCE(?s_label, ?s) AS ?output_s) # if the label does not exist, stick with the URI
  
  OPTIONAL {{
    ?obj rdfs:label ?obj_label . 
    FILTER (LANG(?obj_label) = 'en')
  }} # Get the label (name) for the object if it exists
  OPTIONAL {{
    ?obj rdf:type ?obj_type . 
    ?obj_type rdfs:label ?obj_type_name .
    FILTER (LANG(?obj_type_name) = 'en')
  }} # get the name of the type for the object if the type exists
  BIND(COALESCE(IF(STR(?obj_label) != "Generic instance", ?obj_label, ?obj_type_name), ?obj) AS ?output_obj) 
  # if the label exists, go with the label, but if it's "Generic instance", then go with the type; if that does not exist, then stick with the OG object.
  BIND(RAND() AS ?sortKey)
}}
ORDER BY ?sortKey
LIMIT 1000
""".format(
        pred.uri
    )

    try:
        sparql.setQuery(query)
        so_pairs = [
            (row["output_s"].value, row["output_obj"].value)
            for row in sparql.query().bindings
        ]
    except (HTTPError, EndPointInternalError):
        print(f"HTTPerror for uri {pred.uri}. Skipping.")
        so_pairs = None
    finally:
        # Only report timing here. Returning from `finally` would swallow any
        # in-flight exception (including KeyboardInterrupt).
        time_elapsed = time.time() - start_time
        print(f"Time elapsed: {time_elapsed}")
    return so_pairs


# JSON cache of the sampled pairs, keyed by predicate URI.
# (An earlier, non-randomized cache lived in "yago_pred_uri_to_so_pairs.json".)
PRED_URI_TO_SO_PAIRS_PATH = "yago_pred_uri_to_so_pairs_randomized_1k.json"

# Start from whatever has already been cached on disk, if anything.
pred_uri_to_so_pairs = dict()
if os.path.exists(PRED_URI_TO_SO_PAIRS_PATH):
    with open(PRED_URI_TO_SO_PAIRS_PATH) as f:
        pred_uri_to_so_pairs = json.load(f)

# Only query the endpoint for predicates that are not in the cache yet.
missing_preds = [p for p in relevant_preds if p.uri not in pred_uri_to_so_pairs]
print(missing_preds)

# Predicates whose query failed come back as None and are left out, so they
# are retried on the next run.
missing_pred_to_so_pairs: Dict[Predicate, List[Tuple[str, str]]] = {
    pred: so_pairs
    for pred, so_pairs in (
        (p, get_so_pairs_for_pred(p)) for p in tqdm(missing_preds)
    )
    if so_pairs is not None
}
missing_pred_uri_to_so_pairs = {
    pred.uri: so_pairs for pred, so_pairs in missing_pred_to_so_pairs.items()
}

# Cached entries take precedence over freshly queried ones on key clashes.
pred_uri_to_so_pairs = {**missing_pred_uri_to_so_pairs, **pred_uri_to_so_pairs}

print(len(pred_uri_to_so_pairs), len(relevant_preds))

[Predicate(uri='http://schema.org/actor', kb_name='schema', relation='actor'), Predicate(uri='http://schema.org/alternateName', kb_name='schema', relation='alternateName'), Predicate(uri='http://schema.org/alumniOf', kb_name='schema', relation='alumniOf'), Predicate(uri='http://schema.org/birthDate', kb_name='schema', relation='birthDate'), Predicate(uri='http://schema.org/birthPlace', kb_name='schema', relation='birthPlace'), Predicate(uri='http://schema.org/deathDate', kb_name='schema', relation='deathDate'), Predicate(uri='http://schema.org/deathPlace', kb_name='schema', relation='deathPlace'), Predicate(uri='http://schema.org/gender', kb_name='schema', relation='gender'), Predicate(uri='http://schema.org/geo', kb_name='schema', relation='geo'), Predicate(uri='http://schema.org/knowsLanguage', kb_name='schema', relation='knowsLanguage'), Predicate(uri='http://schema.org/location', kb_name='schema', relation='location'), Predicate(uri='http://schema.org/mainEntityOfPage', kb_name='sc

  0%|          | 0/16 [00:00<?, ?it/s]

Querying http://schema.org/actor.


  6%|▋         | 1/16 [01:00<15:10, 60.70s/it]

HTTPerror for uri http://schema.org/actor. Skipping.
Time elapsed: 60.69924974441528
Querying http://schema.org/alternateName.


 12%|█▎        | 2/16 [02:01<14:09, 60.65s/it]

HTTPerror for uri http://schema.org/alternateName. Skipping.
Time elapsed: 60.61720538139343
Querying http://schema.org/alumniOf.


 19%|█▉        | 3/16 [03:02<13:08, 60.67s/it]

HTTPerror for uri http://schema.org/alumniOf. Skipping.
Time elapsed: 60.68319296836853
Querying http://schema.org/birthDate.


 25%|██▌       | 4/16 [04:02<12:07, 60.66s/it]

HTTPerror for uri http://schema.org/birthDate. Skipping.
Time elapsed: 60.651930809020996
Querying http://schema.org/birthPlace.


 31%|███▏      | 5/16 [05:03<11:06, 60.62s/it]

HTTPerror for uri http://schema.org/birthPlace. Skipping.
Time elapsed: 60.5497932434082
Querying http://schema.org/deathDate.


 38%|███▊      | 6/16 [06:03<10:05, 60.60s/it]

HTTPerror for uri http://schema.org/deathDate. Skipping.
Time elapsed: 60.553993225097656
Querying http://schema.org/deathPlace.


 44%|████▍     | 7/16 [07:04<09:05, 60.58s/it]

HTTPerror for uri http://schema.org/deathPlace. Skipping.
Time elapsed: 60.53104758262634
Querying http://schema.org/gender.


 50%|█████     | 8/16 [08:04<08:04, 60.58s/it]

HTTPerror for uri http://schema.org/gender. Skipping.
Time elapsed: 60.59235453605652
Querying http://schema.org/geo.


 56%|█████▋    | 9/16 [09:05<07:03, 60.57s/it]

HTTPerror for uri http://schema.org/geo. Skipping.
Time elapsed: 60.54056763648987
Querying http://schema.org/knowsLanguage.


 62%|██████▎   | 10/16 [10:06<06:03, 60.58s/it]

HTTPerror for uri http://schema.org/knowsLanguage. Skipping.
Time elapsed: 60.58627128601074
Querying http://schema.org/location.


 69%|██████▉   | 11/16 [11:06<05:02, 60.59s/it]

HTTPerror for uri http://schema.org/location. Skipping.
Time elapsed: 60.628783226013184
Querying http://schema.org/mainEntityOfPage.


 75%|███████▌  | 12/16 [12:07<04:02, 60.62s/it]

HTTPerror for uri http://schema.org/mainEntityOfPage. Skipping.
Time elapsed: 60.681299686431885
Querying http://schema.org/memberOf.


 81%|████████▏ | 13/16 [13:07<03:01, 60.59s/it]

HTTPerror for uri http://schema.org/memberOf. Skipping.
Time elapsed: 60.52875280380249
Querying http://schema.org/nationality.


 88%|████████▊ | 14/16 [13:35<01:41, 50.58s/it]

Time elapsed: 27.434287071228027
Querying http://schema.org/parentTaxon.


In [17]:
# Peek at the first five predicates, showing at most five pairs for each.
{uri: pairs[:5] for uri, pairs in list(pred_uri_to_so_pairs.items())[:5]}

{'http://schema.org/about': [['A Greek–English Lexicon', 'Ancient Greek'],
  ['Diccionario Griego-Español', 'Ancient Greek'],
  ['Etymologicum Magnum', 'Ancient Greek'],
  ['Ilkeät sisarpuolet', 'Helsinki'],
  ['Ruma kreivitär', 'Helsinki']],
 'http://schema.org/address': [['Paramount', '218 Yu Yuan Lu, Shanghai'],
  ['House at the Stone Bell', 'Staroměstské náměstí 605/13'],
  ['House at the Stone Bell', 'Týnská 605/1'],
  ['Caransebeș', 'Piaţa Revoluţiei Nr. 1, Caransebeş, Judeţul Caraş-Severin'],
  ['Centre Dürrenmatt Neuchâtel', 'Ch. du Pertuis-de-Sault 74']],
 'http://schema.org/administrates': [["People's Republic of China", 'Beijing'],
  ['Districts under Central Government Jurisdiction', 'Dushanbe'],
  ['Brandenburg', 'Potsdam'],
  ['Potsdam District', 'Potsdam'],
  ['Darmstadt Government Region', 'Frankfurt am Main']],
 'http://schema.org/affiliation': [['Elizabeth Blackburn', 'Yale University'],
  ['Mildred Allen', 'Yale University'],
  ['Robert J. Shiller', 'Yale University'

In [7]:
# Write the predicate-URI -> pairs mapping to the JSON cache file, replacing
# any previous contents of that file.
with open(PRED_URI_TO_SO_PAIRS_PATH, "w") as fp:
    json.dump(pred_uri_to_so_pairs, fp)

In [15]:
def augment_pred_uri_to_so_pairs_with_reverse(pred_uri_to_so_pairs):
    """Return a new mapping that also contains the reversed relations.

    Every key ``k`` of the input is kept with its original list of pairs, and
    a key ``"reverse-<k>"`` is added whose pairs are the same ones with the
    two elements swapped (as tuples). The input mapping is not modified.
    """
    augmented = dict(pred_uri_to_so_pairs)
    reversed_entries = {}
    for pred_uri, so_pairs in pred_uri_to_so_pairs.items():
        reversed_entries[f"reverse-{pred_uri}"] = [
            (obj, subj) for (subj, obj) in so_pairs
        ]
    augmented.update(reversed_entries)
    return augmented

# Forward mapping plus one "reverse-<uri>" entry per predicate, whose pairs
# are the forward pairs with subject and object swapped.
pred_uri_to_so_pairs_with_reverse = augment_pred_uri_to_so_pairs_with_reverse(
    pred_uri_to_so_pairs
)

In [12]:
# Predicate URIs the author marked as irrelevant.
# NOTE(review): judging by the names these are mostly dates, codes, counts and
# other literal-valued properties — confirm the selection criterion.
irrelevant_preds = [
    "http://schema.org/about",
    "http://schema.org/address",
    "http://schema.org/area",
    "http://schema.org/dateCreated",
    "http://schema.org/demonym",
    "http://schema.org/dissolutionDate",
    "http://schema.org/duns",
    "http://schema.org/duration",
    "http://schema.org/elevation",
    "http://schema.org/endDate",
    "http://schema.org/geo",
    "http://schema.org/gtin",
    "http://schema.org/humanDevelopmentIndex",
    "http://schema.org/iataCode",
    "http://schema.org/icaoCode",
    "http://schema.org/isbn",
    "http://schema.org/iswcCode",
    "http://schema.org/leiCode",
    "http://schema.org/logo",
    "http://schema.org/motto",
    "http://schema.org/numberOfEmployees",
    "http://schema.org/numberOfEpisodes",
    "http://schema.org/numberOfPages",
    "http://schema.org/numberOfSeasons",
    "http://schema.org/populationNumber",
    "http://schema.org/postalCode",
    "http://schema.org/startDate",
    "http://schema.org/unemploymentRate",
    "http://schema.org/url",
    "http://yago-knowledge.org/resource/distanceFromEarth",
    "http://yago-knowledge.org/resource/length",
    "http://yago-knowledge.org/resource/luminosity",
    "http://yago-knowledge.org/resource/mass",
    "http://yago-knowledge.org/resource/parallax",
    "http://yago-knowledge.org/resource/radialVelocity",
    "http://yago-knowledge.org/resource/sportNumber",
    "http://yago-knowledge.org/resource/studentsCount",
    "http://yago-knowledge.org/schema#fromClass",
    "http://yago-knowledge.org/schema#fromProperty",
]
# NOTE(review): presumably predicates whose objects mostly resolve to the
# placeholder label "Generic instance" — confirm against the sampled pairs.
generic_instances = [
    "http://schema.org/knowsLanguage",
    "http://schema.org/inLanguage",
    "http://yago-knowledge.org/resource/academicDegree",
    "http://yago-knowledge.org/resource/beliefSystem",
]

# Show the first five pairs of every predicate, first as one printed line and
# then again as the cell's rich output, along with the sizes of the exclusion
# list and of the full mapping.
# The exclusion filter is currently disabled; to enable it, add
#     if uri not in irrelevant_preds + generic_instances
# to the comprehensions below (and wrap the first one in len(...) to count).
print({uri: pairs[:5] for uri, pairs in pred_uri_to_so_pairs.items()})
print(len(irrelevant_preds))
print(len(pred_uri_to_so_pairs))
{uri: pairs[:5] for uri, pairs in pred_uri_to_so_pairs.items()}

{'http://schema.org/about': [['Axis leaders of World War II', 'Axis Powers'], ['Popoli', 'Generic instance'], ['Le Pape', 'Generic instance'], ['Alias', 'Central Intelligence Agency'], ['United Nations Security Council Resolution 2018', 'Economic Community of West African States']], 'http://schema.org/actor': [['More Than Life at Stake', 'Feliks Żukowski'], ['Czterej pancerni i pies', 'Franciszek Pieczka'], ['The Intouchables', 'François Cluzet'], ['More Than Life at Stake', 'August Kowalczyk'], ['More Than Life at Stake', 'Barbara Brylska']], 'http://schema.org/address': [], 'http://schema.org/administrates': [['Angola', 'Cabinda Province'], ['Antarctica', 'French Southern and Antarctic Lands'], ['Bonn', 'Bad Godesberg'], ['Arica y Parinacota Region', 'Arica Province'], ['Altona', 'Blankenese']], 'http://schema.org/affiliation': [['Ban Ki-moon', 'Generic instance'], ['Gwyneth Paltrow', 'Goop'], ['Alan Clark', 'SABMiller'], ['Alvin Eliot Roth', 'Stanford University'], ['Shantanu Naraye

{'http://schema.org/about': [['Axis leaders of World War II', 'Axis Powers'],
  ['Popoli', 'Generic instance'],
  ['Le Pape', 'Generic instance'],
  ['Alias', 'Central Intelligence Agency'],
  ['United Nations Security Council Resolution 2018',
   'Economic Community of West African States']],
 'http://schema.org/actor': [['More Than Life at Stake', 'Feliks Żukowski'],
  ['Czterej pancerni i pies', 'Franciszek Pieczka'],
  ['The Intouchables', 'François Cluzet'],
  ['More Than Life at Stake', 'August Kowalczyk'],
  ['More Than Life at Stake', 'Barbara Brylska']],
 'http://schema.org/address': [],
 'http://schema.org/administrates': [['Angola', 'Cabinda Province'],
  ['Antarctica', 'French Southern and Antarctic Lands'],
  ['Bonn', 'Bad Godesberg'],
  ['Arica y Parinacota Region', 'Arica Province'],
  ['Altona', 'Blankenese']],
 'http://schema.org/affiliation': [['Ban Ki-moon', 'Generic instance'],
  ['Gwyneth Paltrow', 'Goop'],
  ['Alan Clark', 'SABMiller'],
  ['Alvin Eliot Roth', 'Sta

In [31]:
############################################
# 3. Construct queries for each predicate. #
############################################
# This is hardcoded, probably.
# Templates that have actually been written so far.
#   "closed": yes/no question templates using the named {entity} and {answer}
#             slots; every one ends with the "\nA:" answer cue.
#   "open":   a question template and a statement template, each with a single
#             positional {} slot.
# NOTE(review): "movies" is not a predicate URI; it looks like the reverse
# direction of http://schema.org/actor — confirm which key it should use.
_written_query_templates = {
    "http://schema.org/about": {
        "closed": [
            "Q: Is the work '{entity}' about {answer}?\nA:",
            "Q: Is {answer} the subject of the work '{entity}'?\nA:",
        ],
        "open": ["Q: What is the work '{}' about?\nA:", "The work '{}' is about"],
    },
    "http://schema.org/actor": {
        "closed": [
            "Q: Is {entity} an actor or actress in the film '{answer}'?\nA:",
            "Q: Does the movie '{answer}' feature the actor or actress {entity}?\nA:",
        ],
        "open": [
            "Q: What is a movie that {} acts in?\nA:",
            "{} acts in the movie",
        ],
    },
    "movies": {
        "closed": [
            "Q: Is {answer} an actor or actress in the movie '{entity}'?\nA:",
            "Q: Does the movie '{entity}' feature the actor or actress {answer}?\nA:",
        ],
        "open": [
            "Q: Who is an actor or actress in the movie '{}'?\nA:",
            "One actor or actress in the movie '{}' is",
        ],
    },
}

# Predicates whose templates still have to be written. Each one receives the
# same stub entry below; move a URI into `_written_query_templates` once its
# templates exist.
_unwritten_pred_uris = [
    "http://schema.org/address",
    "http://schema.org/administrates",
    "http://schema.org/affiliation",
    "http://schema.org/alternateName",
    "http://schema.org/alumniOf",
    "http://schema.org/area",
    "http://schema.org/author",
    "http://schema.org/award",
    "http://schema.org/birthDate",
    "http://schema.org/birthPlace",
    "http://schema.org/children",
    "http://schema.org/contentLocation",
    "http://schema.org/dateCreated",
    "http://schema.org/deathDate",
    "http://schema.org/deathPlace",
    "http://schema.org/demonym",
    "http://schema.org/director",
    "http://schema.org/dissolutionDate",
    "http://schema.org/duns",
    "http://schema.org/duration",
    "http://schema.org/editor",
    "http://schema.org/elevation",
    "http://schema.org/endDate",
    "http://schema.org/founder",
    "http://schema.org/gender",
    "http://schema.org/geo",
    "http://schema.org/gtin",
    "http://schema.org/highestPoint",
    "http://schema.org/homeLocation",
    "http://schema.org/humanDevelopmentIndex",
    "http://schema.org/iataCode",
    "http://schema.org/icaoCode",
    "http://schema.org/illustrator",
    "http://schema.org/inLanguage",
    "http://schema.org/influencedBy",
    "http://schema.org/isbn",
    "http://schema.org/iswcCode",
    "http://schema.org/knowsLanguage",
    "http://schema.org/leader",
    "http://schema.org/leiCode",
    "http://schema.org/location",
    "http://schema.org/locationCreated",
    "http://schema.org/lowestPoint",
    "http://schema.org/lyricist",
    "http://schema.org/mainEntityOfPage",
    "http://schema.org/manufacturer",
    "http://schema.org/material",
    "http://schema.org/memberOf",
    "http://schema.org/motto",
    "http://schema.org/musicBy",
    "http://schema.org/nationality",
    "http://schema.org/neighbors",
    "http://schema.org/numberOfEmployees",
    "http://schema.org/numberOfEpisodes",
    "http://schema.org/numberOfPages",
    "http://schema.org/numberOfSeasons",
    "http://schema.org/officialLanguage",
    "http://schema.org/organizer",
    "http://schema.org/ownedBy",
    "http://schema.org/owns",
    "http://schema.org/parentTaxon",
    "http://schema.org/performer",
    "http://schema.org/populationNumber",
    "http://schema.org/postalCode",
    "http://schema.org/productionCompany",
    "http://schema.org/publisher",
    "http://schema.org/recordLabel",
    "http://schema.org/sameAs",
    "http://schema.org/sponsor",
    "http://schema.org/spouse",
    "http://schema.org/startDate",
    "http://schema.org/superEvent",
    "http://schema.org/unemploymentRate",
    "http://schema.org/url",
    "http://schema.org/worksFor",
    "http://yago-knowledge.org/resource/academicDegree",
    "http://yago-knowledge.org/resource/appearsIn",
    "http://yago-knowledge.org/resource/beliefSystem",
    "http://yago-knowledge.org/resource/candidateIn",
    "http://yago-knowledge.org/resource/capital",
    "http://yago-knowledge.org/resource/conferredBy",
    "http://yago-knowledge.org/resource/consumes",
    "http://yago-knowledge.org/resource/director",
    "http://yago-knowledge.org/resource/distanceFromEarth",
    "http://yago-knowledge.org/resource/doctoralAdvisor",
    "http://yago-knowledge.org/resource/flowsInto",
    "http://yago-knowledge.org/resource/follows",
    "http://yago-knowledge.org/resource/length",
    "http://yago-knowledge.org/resource/luminosity",
    "http://yago-knowledge.org/resource/mass",
    "http://yago-knowledge.org/resource/notableWork",
    "http://yago-knowledge.org/resource/parallax",
    "http://yago-knowledge.org/resource/parentBody",
    "http://yago-knowledge.org/resource/participant",
    "http://yago-knowledge.org/resource/playsIn",
    "http://yago-knowledge.org/resource/radialVelocity",
    "http://yago-knowledge.org/resource/replaces",
    "http://yago-knowledge.org/resource/sportNumber",
    "http://yago-knowledge.org/resource/studentOf",
    "http://yago-knowledge.org/resource/studentsCount",
    "http://yago-knowledge.org/resource/terminus",
]

# Full draft mapping: written templates first, then one independent stub entry
# per unwritten predicate (a fresh dict each, so editing one stub in place
# never affects another).
pred_uri_to_query_templates = {
    **_written_query_templates,
    **{
        uri: {"closed": [], "open": ["Q:\nA:", "is"]}
        for uri in _unwritten_pred_uris
    },
}

# Open query forms keyed by prefixed predicate name: a question form followed
# by a statement form, each with one positional {} slot.
pred_to_query_forms: Dict[str, List[str]] = {
    "yago:studentOf": ["Who is {} a student of?", "{} is a student of"],
}

In [19]:
########################################################################################################
# 4. Construct queries containing entities, corresponding answers, query forms, and context templates. #
########################################################################################################
from yago_questions import yago_topic_to_qfs

def _build_qec_entry(query_forms, so_pairs):
    """Assemble the query/entity/context record for one topic.

    Args:
        query_forms: the topic's query forms; ``query_forms["open"][-1]`` is
            used as the base of the context template.
        so_pairs: sequence of (subject, object) pairs for the topic.

    Returns:
        A dict with the query forms, the subjects as ``"entities"``, the
        objects as ``"answers"`` (parallel tuples, both empty when there are
        no pairs), and a one-element ``"context_templates"`` list.
    """
    # Transpose the pairs once; an empty pair list has no columns to index.
    columns = list(zip(*so_pairs))
    entities, answers = (columns[0], columns[1]) if columns else ((), ())
    return {
        "query_forms": query_forms,
        "entities": entities,
        "answers": answers,
        "context_templates": [query_forms["open"][-1] + " {answer}"],
    }


# Only topics that have both query forms and sampled pairs are kept.
keys = set(yago_topic_to_qfs).intersection(set(pred_uri_to_so_pairs_with_reverse))
yago_qec = {
    k: _build_qec_entry(yago_topic_to_qfs[k], pred_uri_to_so_pairs_with_reverse[k])
    for k in keys
}


In [None]:
# --> datasets.py build


def get_entities(topic: str = "yago:studentOf"):
    """Return the entities recorded in ``yago_qec`` for ``topic``.

    Args:
        topic: key into the module-level ``yago_qec`` mapping.
            NOTE(review): the keys of ``yago_qec`` built in this notebook
            appear to be full predicate URIs (optionally prefixed with
            "reverse-"), so the default probably needs to become e.g.
            "http://yago-knowledge.org/resource/studentOf" — confirm.

    Raises:
        KeyError: if ``topic`` is not a key of ``yago_qec``.
    """
    return yago_qec[topic]["entities"]


def get_contexts(topic: str = "yago:studentOf"):
    """Return one filled-in context sentence per (entity, answer) pair.

    Every template in ``yago_qec[topic]["context_templates"]`` is formatted
    once for each entity together with the answer at the same position. The
    entity is supplied both positionally and as ``entity=`` so templates may
    use either ``{}`` or ``{entity}``; the answer fills ``{answer}``.

    Args:
        topic: key into the module-level ``yago_qec`` mapping.
            NOTE(review): the keys of ``yago_qec`` built in this notebook
            appear to be full predicate URIs, so the default probably needs to
            become a full URI — confirm.

    Raises:
        KeyError: if ``topic`` is not a key of ``yago_qec``.
    """
    entry = yago_qec[topic]
    # "context_templates" is a list, so each template is formatted separately
    # (calling .format on the list itself raises AttributeError).
    return [
        template.format(entity, entity=entity, answer=answer)
        for template in entry["context_templates"]
        for entity, answer in zip(entry["entities"], entry["answers"])
    ]

def get_queries():
    """Placeholder that is not implemented yet; it currently returns None.

    TODO: implement alongside get_entities / get_contexts.
    """
    pass