In [23]:
from SPARQLWrapper import SPARQLWrapper, JSON
import re
import pandas as pd
from tqdm import tqdm
from rdflib import Graph, URIRef, Literal, Namespace, RDF, RDFS

# -------------------------------
# 1. Setup DBpedia SPARQL endpoint
# -------------------------------
DBPEDIA_SPARQL = "https://dbpedia.org/sparql"
sparql = SPARQLWrapper(DBPEDIA_SPARQL)
sparql.setReturnFormat(JSON)

# Namespace shortcuts
DCT = Namespace("http://purl.org/dc/terms/")
SKOS = Namespace("http://www.w3.org/2004/02/skos/core#")
EX = Namespace("http://example.org/kg/")  # for our custom triples


# -------------------------------
# 2. Function: Get categories matching "Visitor_attractions_in_*"
# -------------------------------
def get_visitor_attraction_categories(limit=10):
    """
    Retrieve categories that start with Category:Visitor_attractions_in_
    For demo, limit results.
    """
    query = f"""
    SELECT DISTINCT ?category WHERE {{
      ?category a skos:Concept .
      FILTER regex(str(?category), "^http://dbpedia.org/resource/Category:Tourist_attractions_in_")
    }} LIMIT {limit}
    """
    sparql.setQuery(query)
    results = sparql.query().convert()
    return [r['category']['value'] for r in results['results']['bindings']]


# -------------------------------
# 3. Function: Get POIs for a given category
# -------------------------------
def get_pois_for_category(category_uri):
    query = f"""
    SELECT DISTINCT ?POI ?category WHERE {{
      ?POI <http://purl.org/dc/terms/subject> ?category .
      ?category <http://www.w3.org/2004/02/skos/core#broader> <{category_uri}> .
    }}
    """
    sparql.setQuery(query)
    results = sparql.query().convert()
    pois = []
    for r in results['results']['bindings']:
        poi = r['POI']['value']
        cat = r['category']['value']
        pois.append((poi, cat))
    return pois


# -------------------------------
# 4. Regex-based syntactic verification
# -------------------------------
CATEGORY_REGEX = re.compile(r"http://dbpedia\.org/resource/Category:(.+?)_(in|of)_(.+)")

def parse_category_uri(category_uri):
    m = CATEGORY_REGEX.match(category_uri)
    if m:
        type_str = m.group(1).replace("_", " ")
        location_str = m.group(3).replace("_", " ")
        return type_str, location_str
    return None, None



In [36]:
# -------------------------------
# 5. Main pipeline
# -------------------------------
all_records = []

categories = get_visitor_attraction_categories(limit=100)  # adjust limit
print(f"Found {len(categories)} categories")
print(categories)


Found 100 categories
['http://dbpedia.org/resource/Category:Tourist_attractions_in_Amsterdam', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Anaheim,_California', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Bergen_County,_New_Jersey', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Caddo_County,_Oklahoma', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Canada', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_County_Donegal', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Cumberland_County,_Nova_Scotia', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Dakota_County,_Minnesota', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Detroit', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Duval_County,_Florida', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Fairfax_County,_Virginia', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Fremont_Coun

In [37]:

for cat_uri in tqdm(categories, desc="Processing categories"):
    pairs = get_pois_for_category(cat_uri)
    for poi_uri, category_uri in pairs:
        type_str, location_str = parse_category_uri(category_uri)
        if type_str and location_str:
            all_records.append({
                "POI": poi_uri,
                "Category": category_uri,
                "Type": type_str,
                "Location": location_str
            })

df = pd.DataFrame(all_records)

Processing categories: 100%|██████████| 100/100 [00:10<00:00,  9.51it/s]


In [None]:
print(df[df["Location"] == "Bergen County, New Jersey"].head())

                                                   POI  \
241        http://dbpedia.org/resource/Baylor_Massacre   
242  http://dbpedia.org/resource/Holy_Cross_Cemeter...   
243  http://dbpedia.org/resource/Riverside_Cemetery...   
244  http://dbpedia.org/resource/George_Washington_...   
245    http://dbpedia.org/resource/Gethsemane_Cemetery   

                                              Category        Type  \
241  http://dbpedia.org/resource/Category:Cemeterie...  Cemeteries   
242  http://dbpedia.org/resource/Category:Cemeterie...  Cemeteries   
243  http://dbpedia.org/resource/Category:Cemeterie...  Cemeteries   
244  http://dbpedia.org/resource/Category:Cemeterie...  Cemeteries   
245  http://dbpedia.org/resource/Category:Cemeterie...  Cemeteries   

                      Location  
241  Bergen County, New Jersey  
242  Bergen County, New Jersey  
243  Bergen County, New Jersey  
244  Bergen County, New Jersey  
245  Bergen County, New Jersey  


In [40]:

# -------------------------------
# 6. Build a mini Knowledge Graph (RDF) locally
# -------------------------------
g = Graph()
g.bind("ex", EX)
g.bind("rdfs", RDFS)

for _, row in df.iterrows():
    poi_uri = URIRef(row["POI"])
    type_uri = EX[row["Type"].replace(" ", "_")]
    location_uri = EX[row["Location"].replace(" ", "_")]

    # Store relationships
    g.add((poi_uri, RDF.type, type_uri))
    g.add((poi_uri, EX.hasLocation, location_uri))

    # Add hierarchy (type subclassOf Visitor_attractions)
    g.add((type_uri, RDFS.subClassOf, EX.Visitor_attractions))

print(f"\nGenerated RDF triples: {len(g)}")

# Save graph to file
g.serialize(destination="visitor_attractions_kg.ttl", format="turtle")
print("Graph saved to visitor_attractions_kg.ttl")



Generated RDF triples: 10051
Graph saved to visitor_attractions_kg.ttl


In [41]:
g.parse("visitor_attractions_kg.ttl", format="turtle")
# Print a few example triples
print(f"Total triples: {len(g)}\n")
for i, triple in enumerate(g):
    print(triple)
    if i == 9:  # show first 10 triples only
        break


Total triples: 10051

(rdflib.term.URIRef('http://dbpedia.org/resource/Baylor_Massacre'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://example.org/kg/Cemeteries'))
(rdflib.term.URIRef('http://dbpedia.org/resource/Ambarrukmo_Plaza'), rdflib.term.URIRef('http://example.org/kg/hasLocation'), rdflib.term.URIRef('http://example.org/kg/Indonesia'))
(rdflib.term.URIRef('http://dbpedia.org/resource/List_of_monastic_houses_in_Bedfordshire'), rdflib.term.URIRef('http://example.org/kg/hasLocation'), rdflib.term.URIRef('http://example.org/kg/Bedfordshire'))
(rdflib.term.URIRef('http://dbpedia.org/resource/Lake_Ambussel'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://example.org/kg/Lakes'))
(rdflib.term.URIRef('http://dbpedia.org/resource/Adams_Park'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://example.org/kg/Sports_venues'))
(rdflib.term.U

In [42]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

DBPEDIA_SPARQL = "https://dbpedia.org/sparql"
sparql = SPARQLWrapper(DBPEDIA_SPARQL)
sparql.setReturnFormat(JSON)

# Example: Get mappings for all POIs we already discovered
def get_wikidata_mapping(poi_uri):
    query = f"""
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    SELECT DISTINCT ?wikidata WHERE {{
      <{poi_uri}> owl:sameAs ?wikidata .
      FILTER(STRSTARTS(STR(?wikidata), "http://www.wikidata.org/entity/"))
    }}
    """
    sparql.setQuery(query)
    results = sparql.query().convert()
    return [r["wikidata"]["value"] for r in results["results"]["bindings"]]

# Example test
poi = "http://dbpedia.org/resource/Palau_Reial_Major"
print(get_wikidata_mapping(poi))


['http://www.wikidata.org/entity/Q1116546']


In [None]:
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
wd_sparql = SPARQLWrapper(WIKIDATA_SPARQL)
wd_sparql.setReturnFormat(JSON)

def get_wikidata_types(wd_entity_uri):
    query = f"""
    SELECT DISTINCT ?type ?typeLabel WHERE {{
      <{wd_entity_uri}> wdt:P31 ?type .
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    wd_sparql.setQuery(query)
    results = wd_sparql.query().convert()
    return [(r["type"]["value"], r["typeLabel"]["value"]) for r in results["results"]["bindings"]]

# Example test
print(get_wikidata_types("http://www.wikidata.org/entity/Q1116546"))


[('http://www.wikidata.org/entity/Q16560', 'palace')]


In [47]:
def get_superclasses(wd_type_uri, depth=3):
    query = f"""
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wikibase: <http://wikiba.se/ontology#>
    PREFIX bd: <http://www.bigdata.com/rdf#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT DISTINCT ?super ?superLabel WHERE {{
      <{wd_type_uri}> (wdt:P279{{1,{depth}}}) ?super .
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    wd_sparql.setQuery(query)
    wd_sparql.setReturnFormat(JSON)
    results = wd_sparql.query().convert()
    return [(r["super"]["value"], r["superLabel"]["value"]) for r in results["results"]["bindings"]]

# Example: traverse 3 levels up from museum (Q33506)
print(get_superclasses("http://www.wikidata.org/entity/Q1116546"))


QueryBadFormed: QueryBadFormed: A bad request has been sent to the endpoint: probably the SPARQL query is badly formed. 

Response:
b'SPARQL-QUERY: queryStr=\n    PREFIX wdt: <http://www.wikidata.org/prop/direct/>\n    PREFIX wikibase: <http://wikiba.se/ontology#>\n    PREFIX bd: <http://www.bigdata.com/rdf#>\n    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n\n    SELECT DISTINCT ?super ?superLabel WHERE {\n      <http://www.wikidata.org/entity/Q1116546> (wdt:P279{1,3}) ?super .\n      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }\n    }\n    \njava.util.concurrent.ExecutionException: org.openrdf.query.MalformedQueryException: Encountered " "{" "{ "" at line 8, column 58.\nWas expecting one of:\n    ")" ...\n    "+" ...\n    "*" ...\n    "?" ...\n    "/" ...\n    "|" ...\n    \n\tat java.util.concurrent.FutureTask.report(FutureTask.java:122)\n\tat java.util.concurrent.FutureTask.get(FutureTask.java:206)\n\tat com.bigdata.rdf.sail.webapp.BigdataServlet.submitApiTask(BigdataServlet.java:292)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doSparqlQuery(QueryServlet.java:678)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doGet(QueryServlet.java:290)\n\tat com.bigdata.rdf.sail.webapp.RESTServlet.doGet(RESTServlet.java:240)\n\tat com.bigdata.rdf.sail.webapp.MultiTenancyServlet.doGet(MultiTenancyServlet.java:273)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:687)\n\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:790)\n\tat org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:865)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1655)\n\tat org.wikidata.query.rdf.blazegraph.throttling.ThrottlingFilter.doFilter(ThrottlingFilter.java:322)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1642)\n\tat org.wikidata.query.rdf.blazegraph.throttling.SystemOverloadFilter.doFilter(SystemOverloadFilter.java:84)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1642)\n\tat ch.qos.logback.classic.helpers.MDCInsertingServletFilter.doFilter(MDCInsertingServletFilter.java:50)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1642)\n\tat org.wikidata.query.rdf.blazegraph.filters.QueryEventSenderFilter.doFilter(QueryEventSenderFilter.java:125)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1642)\n\tat org.wikidata.query.rdf.blazegraph.filters.ClientIPFilter.doFilter(ClientIPFilter.java:43)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1642)\n\tat org.wikidata.query.rdf.blazegraph.filters.JWTIdentityFilter.doFilter(JWTIdentityFilter.java:66)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1642)\n\tat org.wikidata.query.rdf.blazegraph.filters.RealAgentFilter.doFilter(RealAgentFilter.java:33)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1642)\n\tat org.wikidata.query.rdf.blazegraph.filters.RequestConcurrencyFilter.doFilter(RequestConcurrencyFilter.java:50)\n\tat org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1634)\n\tat org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:533)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:146)\n\tat org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:548)\n\tat org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:132)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:257)\n\tat org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:1595)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextHandle(ScopedHandler.java:255)\n\tat org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1340)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:203)\n\tat org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:473)\n\tat org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:1564)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:201)\n\tat org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1242)\n\tat org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:144)\n\tat org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:220)\n\tat org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:126)\n\tat org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:132)\n\tat org.eclipse.jetty.server.Server.handle(Server.java:503)\n\tat org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:364)\n\tat org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:260)\n\tat org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:305)\n\tat org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:103)\n\tat org.eclipse.jetty.io.ChannelEndPoint$2.run(ChannelEndPoint.java:118)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:333)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:310)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:168)\n\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:126)\n\tat org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:366)\n\tat org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:765)\n\tat org.eclipse.jetty.util.thread.QueuedThreadPool$2.run(QueuedThreadPool.java:683)\n\tat java.lang.Thread.run(Thread.java:750)\nCaused by: org.openrdf.query.MalformedQueryException: Encountered " "{" "{ "" at line 8, column 58.\nWas expecting one of:\n    ")" ...\n    "+" ...\n    "*" ...\n    "?" ...\n    "/" ...\n    "|" ...\n    \n\tat com.bigdata.rdf.sail.sparql.Bigdata2ASTSPARQLParser.parseQuery2(Bigdata2ASTSPARQLParser.java:400)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet$SparqlQueryTask.call(QueryServlet.java:741)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet$SparqlQueryTask.call(QueryServlet.java:695)\n\tat com.bigdata.rdf.task.ApiTaskForIndexManager.call(ApiTaskForIndexManager.java:68)\n\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\t... 1 more\nCaused by: com.bigdata.rdf.sail.sparql.ast.ParseException: Encountered " "{" "{ "" at line 8, column 58.\nWas expecting one of:\n    ")" ...\n    "+" ...\n    "*" ...\n    "?" ...\n    "/" ...\n    "|" ...\n    \n\tat com.bigdata.rdf.sail.sparql.ast.SyntaxTreeBuilder.generateParseException(SyntaxTreeBuilder.java:9722)\n\tat com.bigdata.rdf.sail.sparql.ast.SyntaxTreeBuilder.jj_consume_token(SyntaxTreeBuilder.java:9589)\n\tat com.bigdata.rdf.sail.sparql.ast.SyntaxTreeBuilder.PathPrimary(SyntaxTreeBuilder.java:3245)\n\tat com.bigdata.rdf.sail.sparql.ast.SyntaxTreeBuilder.PathElt(SyntaxTreeBuilder.java:3184)\n\tat com.bigdata.rdf.sail.sparql.ast.SyntaxTreeBuilder.PathSequence(SyntaxTreeBuilder.java:3134)\n\tat com.bigdata.rdf.sail.sparql.ast.SyntaxTreeBuilder.PathAlternative(SyntaxTreeBuilder.java:3093)\n\tat com.bigdata.rdf.sail.sparql.ast.SyntaxTreeBuilder.Path(SyntaxTreeBuilder.java:3084)\n\tat com.bigdata.rdf.sail.sparql.ast.SyntaxTreeBuilder.VerbPath(SyntaxTreeBuilder.java:3080)\n\tat com.bigdata.rdf.sail.sparql.ast.SyntaxTreeBuilder.PropertyListPath(SyntaxTreeBuilder.java:2981)\n\tat com.bigdata.rdf.sail.sparql.ast.SyntaxTreeBuilder.TriplesSameSubjectPath(SyntaxTreeBuilder.java:2919)\n\tat com.bigdata.rdf.sail.sparql.ast.SyntaxTreeBuilder.TriplesBlock(SyntaxTreeBuilder.java:2312)\n\tat com.bigdata.rdf.sail.sparql.ast.SyntaxTreeBuilder.BasicGraphPattern(SyntaxTreeBuilder.java:2097)\n\tat com.bigdata.rdf.sail.sparql.ast.SyntaxTreeBuilder.GraphPattern(SyntaxTreeBuilder.java:2034)\n\tat com.bigdata.rdf.sail.sparql.ast.SyntaxTreeBuilder.GroupGraphPattern(SyntaxTreeBuilder.java:1969)\n\tat com.bigdata.rdf.sail.sparql.ast.SyntaxTreeBuilder.WhereClause(SyntaxTreeBuilder.java:1013)\n\tat com.bigdata.rdf.sail.sparql.ast.SyntaxTreeBuilder.SelectQuery(SyntaxTreeBuilder.java:377)\n\tat com.bigdata.rdf.sail.sparql.ast.SyntaxTreeBuilder.Query(SyntaxTreeBuilder.java:328)\n\tat com.bigdata.rdf.sail.sparql.ast.SyntaxTreeBuilder.QueryContainer(SyntaxTreeBuilder.java:216)\n\tat com.bigdata.rdf.sail.sparql.ast.SyntaxTreeBuilder.parseQuery(SyntaxTreeBuilder.java:32)\n\tat com.bigdata.rdf.sail.sparql.Bigdata2ASTSPARQLParser.parseQuery2(Bigdata2ASTSPARQLParser.java:336)\n\t... 7 more\n'