In [1]:
import json
import os
import re
import time
from collections import namedtuple
from time import sleep
from typing import Dict, List, Optional, Tuple
from urllib.error import HTTPError

from SPARQLWrapper import SPARQLWrapper2
from SPARQLWrapper.SPARQLExceptions import EndPointInternalError
from tqdm import tqdm

from preprocessing.utils import extract_name_from_yago_uri

In [2]:
# Record describing one knowledge-base predicate: its full URI plus the
# (kb_name, relation) parts derived from that URI.
Predicate = namedtuple("Predicate", "uri kb_name relation")

# All data files are read from / written to this directory.
DATA_ROOT = "."
# Cache of predicate URI -> list of (subject, object) pairs.
PRED_URI_TO_SO_PAIRS_PATH = os.path.join(
    DATA_ROOT, "yago_pred_uri_to_so_pairs_randomized_1k.json"
)
# Final output: queries, entities, answers and context templates per predicate.
YAGO_QEC_PATH = os.path.join(DATA_ROOT, "yago_qec.json")
# When True, predicates absent from the loaded pairs are queried again.
TRY_QUERYING_MISSING_PREDS = False

In [3]:
##################################
# 1. Get all relevant predicates #
##################################
# Client shared by every query issued in this notebook.
sparql = SPARQLWrapper2("https://yago-knowledge.org/sparql/query")

# Lists every distinct predicate that appears in at least one triple.
query_p = """
    SELECT DISTINCT ?p WHERE {
     ?s ?p ?obj . 
    }  ORDER BY ?p
"""
sparql.setQuery(query_p)

# Relations that are dropped regardless of which knowledge base they come from.
ineligible_relations = ["schema#fromClass", "schema#fromProperty", "logo", "image"]

relevant_preds: List[Predicate] = []
for result in sparql.query().bindings:
    uri = result["p"].value
    kb_name, relation = extract_name_from_yago_uri(uri)
    # Skip predicates from the "w3" knowledge base and the ineligible relations.
    if kb_name == "w3" or relation in ineligible_relations:
        continue
    relevant_preds.append(Predicate(uri, kb_name, relation))

In [4]:
#######################################################################
# 2. Extract all subject-object pairs for each pred in relevant preds #
#######################################################################
def get_so_pairs_for_pred(pred: Predicate) -> Optional[List[Tuple[str, str]]]:
    """Fetch up to 1000 randomly ordered subject-object pairs for one predicate.

    The query reports each subject by its English ``rdfs:label`` when one
    exists and by its URI otherwise. Each object is reported by its English
    label, by the English name of its type when that label is
    "Generic instance", and by the raw object value as a last resort.

    The query is sent through the module-level ``sparql`` client. Progress and
    elapsed time are printed.

    Args:
        pred: Predicate whose ``uri`` is substituted into the SPARQL query.

    Returns:
        A list of ``(subject, object)`` string pairs, or ``None`` when the
        endpoint raised ``HTTPError`` or ``EndPointInternalError``.

    Raises:
        Any other exception raised while querying is propagated to the caller.
    """
    start_time = time.time()
    print(f"Querying {pred.uri}.")

    query = """
SELECT DISTINCT  ?output_s ?output_obj WHERE {{
  ?s <{}> ?obj . # which predicate to use
  OPTIONAL {{ 
    ?s rdfs:label ?s_label .
  	FILTER (LANG(?s_label) = 'en')
  }} # Get the label (name) for the subject if it exists
  BIND(COALESCE(?s_label, ?s) AS ?output_s) # if the label does not exist, stick with the URI
  
  OPTIONAL {{
    ?obj rdfs:label ?obj_label . 
    FILTER (LANG(?obj_label) = 'en')
  }} # Get the label (name) for the object if it exists
  OPTIONAL {{
    ?obj rdf:type ?obj_type . 
    ?obj_type rdfs:label ?obj_type_name .
    FILTER (LANG(?obj_type_name) = 'en')
  }} # get the name of the type for the object if the type exists
  BIND(COALESCE(IF(STR(?obj_label) != "Generic instance", ?obj_label, ?obj_type_name), ?obj) AS ?output_obj) 
  # if the label exists, go with the label, but if it's "Generic instance", then go with the type; if that does not exist, then stick with the OG object.
  BIND(RAND() AS ?sortKey)
}}
ORDER BY ?sortKey
LIMIT 1000
""".format(
        pred.uri
    )

    try:
        sparql.setQuery(query)
        return [
            (result["output_s"].value, result["output_obj"].value)
            for result in sparql.query().bindings
        ]
    except (HTTPError, EndPointInternalError):
        print(f"HTTPerror for uri {pred.uri}. Skipping.")
        return None
    finally:
        # Only report timing here. Returning from ``finally`` would discard any
        # in-flight exception (including KeyboardInterrupt), so it must not return.
        time_elapsed = time.time() - start_time
        print(f"Time elapsed: {time_elapsed}")

In [5]:
# Reuse the cached subject-object pairs when present; otherwise query every
# relevant predicate.
if os.path.exists(PRED_URI_TO_SO_PAIRS_PATH):
    # The cache is written below as UTF-8 with ensure_ascii=False, so it must be
    # decoded as UTF-8 rather than with the platform's default encoding.
    with open(PRED_URI_TO_SO_PAIRS_PATH, encoding="utf-8") as f:
        pred_uri_to_so_pairs = json.load(f)
else:
    queried_pairs = {pred: get_so_pairs_for_pred(pred) for pred in tqdm(relevant_preds)}
    # get_so_pairs_for_pred returns None for predicates whose query failed.
    pred_to_so_pairs: Dict[Predicate, List[Tuple[str, str]]] = {
        pred: pairs for pred, pairs in queried_pairs.items() if pairs is not None
    }
    pred_uri_to_so_pairs = {pred.uri: pairs for pred, pairs in pred_to_so_pairs.items()}

if TRY_QUERYING_MISSING_PREDS:
    # Retry the predicates that have no entry yet (e.g. earlier failed queries).
    missing_preds = [p for p in relevant_preds if p.uri not in pred_uri_to_so_pairs]
    print("Missing preds:", missing_preds)
    requeried_pairs = {pred: get_so_pairs_for_pred(pred) for pred in tqdm(missing_preds)}
    missing_pred_to_so_pairs: Dict[Predicate, List[Tuple[str, str]]] = {
        pred: pairs for pred, pairs in requeried_pairs.items() if pairs is not None
    }
    missing_pred_uri_to_so_pairs = {
        pred.uri: pairs for pred, pairs in missing_pred_to_so_pairs.items()
    }

    # Entries that already existed take precedence over freshly queried ones.
    pred_uri_to_so_pairs = {**missing_pred_uri_to_so_pairs, **pred_uri_to_so_pairs}

with open(PRED_URI_TO_SO_PAIRS_PATH, "w", encoding='utf-8') as fp:
    json.dump(pred_uri_to_so_pairs, fp, ensure_ascii=False, indent=4)

def augment_pred_uri_to_so_pairs_with_reverse(pred_uri_to_so_pairs):
    """Return a copy of the mapping extended with reversed-pair entries.

    For every key ``k`` the result also holds ``"reverse-k"``, whose value is
    the same list of pairs with each pair's two elements swapped. The input
    mapping is not modified.
    """
    augmented = dict(pred_uri_to_so_pairs)
    for uri, so_pairs in pred_uri_to_so_pairs.items():
        flipped = []
        for first, second in so_pairs:
            flipped.append((second, first))
        augmented[f"reverse-{uri}"] = flipped
    return augmented

# Extend the mapping with a "reverse-<uri>" entry (swapped pairs) per predicate.
pred_uri_to_so_pairs_with_reverse = augment_pred_uri_to_so_pairs_with_reverse(pred_uri_to_so_pairs)

In [6]:
########################################################################################################
# 4. Construct queries containing entities, corresponding answers, query forms, and context templates. #
########################################################################################################
from yago_questions import yago_topic_to_qfs

# Only predicates that have both query forms and subject-object pairs are usable.
keys = set(yago_topic_to_qfs).intersection(set(pred_uri_to_so_pairs_with_reverse))

yago_qec = {}
# Iterate in sorted order so the written JSON is identical from run to run
# (set iteration order over strings is not stable across interpreter sessions).
for k in sorted(keys):
    so_pairs = pred_uri_to_so_pairs_with_reverse[k]
    if not so_pairs:
        # Unzipping an empty list yields no columns, so there are no
        # entities/answers to extract for this predicate.
        print(f"No subject-object pairs for {k}. Skipping.")
        continue
    # Unzip once: column 0 holds the entities, column 1 the answers.
    columns = list(zip(*so_pairs))
    yago_qec[k] = {
        "query_forms": yago_topic_to_qfs[k],
        "entities": columns[0],
        "answers": columns[1],
        # The last "open" query form is a statement prefix; completing it with
        # the answer produces the context sentence.
        "context_templates": [yago_topic_to_qfs[k]["open"][-1] + " {answer}.\n"],
    }

with open(YAGO_QEC_PATH, "w", encoding='utf-8') as fp:
    json.dump(yago_qec, fp, ensure_ascii=False, indent=4)

In [7]:
# Reload the file that was just written. It was saved as UTF-8 with
# ensure_ascii=False, so decode it as UTF-8 explicitly instead of relying on the
# platform's default encoding.
with open(YAGO_QEC_PATH, "r", encoding="utf-8") as fp:
    yago_qec_reloaded = json.load(fp)

In [8]:
# Display the reloaded dictionary to inspect its contents.
yago_qec_reloaded

{'http://schema.org/author': {'query_forms': {'closed': ["Q: Is {answer} an author of '{entity}'?\nA:",
    "Q: Was '{entity}' authored by {answer}?\nA:"],
   'open': ['Q: Who authored {entity}?\nA:', '{entity} was authored by']},
  'entities': ['Dolph',
   'Dr. Nick',
   'Nelson Muntz',
   'Lenny Leonard',
   'Agnes Skinner',
   "Bart Simpson's Guide to Life",
   'Dr. Hibbert',
   'Kang and Kodos',
   'Futurama',
   'Bart Simpson',
   'Leela',
   'Lisa Simpson',
   'Maggie Simpson',
   'Marge Simpson',
   'Philip J. Fry',
   'Milhouse Van Houten',
   'Troy McClure',
   'The Simpsons Uncensored Family Album',
   'Captain Horatio McCallister',
   'Disenchantment',
   'The Simpsons',
   'Homer Simpson',
   'Ralph Wiggum',
   'Principal Skinner',
   'Otto Mann',
   'The Simpsons episode guides',
   'Groundskeeper Willie',
   'Moe Szyslak',
   'Chief Wiggum',
   'Edna Krabappel',
   'Ned Flanders',
   'Van Houten family',
   'Mr. Burns',
   'Bleeding Gums Murphy',
   'Grampa Simpson',
   "

In [9]:
# Sanity check: pull out an entity containing a non-ASCII character and confirm
# it was reloaded as a regular str.
icao_entities = yago_qec_reloaded['http://schema.org/icaoCode']["entities"]
s = [name for name in icao_entities if "Egilssta" in name][0]
print(s, type(s))

Egilsstaðir Airport <class 'str'>
