In [3]:
from SPARQLWrapper import SPARQLWrapper, JSON
import re
import pandas as pd
from tqdm import tqdm
from rdflib import Graph, URIRef, Literal, Namespace, RDF, RDFS

# -------------------------------
# 1. Setup DBpedia SPARQL endpoint
# -------------------------------
DBPEDIA_SPARQL = "https://dbpedia.org/sparql"
sparql = SPARQLWrapper(DBPEDIA_SPARQL)
sparql.setReturnFormat(JSON)

# Namespace shortcuts
DCT = Namespace("http://purl.org/dc/terms/")
SKOS = Namespace("http://www.w3.org/2004/02/skos/core#")
EX = Namespace("http://example.org/kg/")  # for our custom triples


# -------------------------------
# 2. Function: Get categories matching "Visitor_attractions_in_*"
# -------------------------------
def get_visitor_attraction_categories(limit=10):
    """
    Retrieve categories that start with Category:Visitor_attractions_in_
    For demo, limit results.
    """
    query = f"""
    SELECT DISTINCT ?category WHERE {{
      ?category a skos:Concept .
      FILTER regex(str(?category), "^http://dbpedia.org/resource/Category:Tourist_attractions_in_")
    }} LIMIT {limit}
    """
    sparql.setQuery(query)
    results = sparql.query().convert()
    return [r['category']['value'] for r in results['results']['bindings']]


# -------------------------------
# 3. Function: Get POIs for a given category
# -------------------------------
def get_pois_for_category(category_uri):
    query = f"""
    SELECT DISTINCT ?POI ?category WHERE {{
      ?POI <http://purl.org/dc/terms/subject> ?category .
      ?category <http://www.w3.org/2004/02/skos/core#broader> <{category_uri}> .
    }}
    """
    sparql.setQuery(query)
    results = sparql.query().convert()
    pois = []
    for r in results['results']['bindings']:
        poi = r['POI']['value']
        cat = r['category']['value']
        pois.append((poi, cat))
    return pois


# -------------------------------
# 4. Regex-based syntactic verification
# -------------------------------
CATEGORY_REGEX = re.compile(r"http://dbpedia\.org/resource/Category:(.+?)_(in|of)_(.+)")

def parse_category_uri(category_uri):
    m = CATEGORY_REGEX.match(category_uri)
    if m:
        type_str = m.group(1).replace("_", " ")
        location_str = m.group(3).replace("_", " ")
        return type_str, location_str
    return None, None



In [4]:
# -------------------------------
# 5. Main pipeline
# -------------------------------
all_records = []

categories = get_visitor_attraction_categories(limit=100)  # adjust limit
print(f"Found {len(categories)} categories")
print(categories)


Found 100 categories
['http://dbpedia.org/resource/Category:Tourist_attractions_in_Amsterdam', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Anaheim,_California', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Bergen_County,_New_Jersey', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Caddo_County,_Oklahoma', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Canada', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_County_Donegal', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Cumberland_County,_Nova_Scotia', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Dakota_County,_Minnesota', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Detroit', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Duval_County,_Florida', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Fairfax_County,_Virginia', 'http://dbpedia.org/resource/Category:Tourist_attractions_in_Fremont_Coun

In [5]:

for cat_uri in tqdm(categories, desc="Processing categories"):
    pairs = get_pois_for_category(cat_uri)
    for poi_uri, category_uri in pairs:
        type_str, location_str = parse_category_uri(category_uri)
        if type_str and location_str:
            all_records.append({
                "POI": poi_uri,
                "Category": category_uri,
                "Type": type_str,
                "Location": location_str
            })

df = pd.DataFrame(all_records)

Processing categories: 100%|██████████| 100/100 [00:21<00:00,  4.63it/s]


In [6]:
print(df[df["Location"] == "Bergen County, New Jersey"].head())

                                                   POI  \
241        http://dbpedia.org/resource/Baylor_Massacre   
242  http://dbpedia.org/resource/Holy_Cross_Cemeter...   
243  http://dbpedia.org/resource/Riverside_Cemetery...   
244  http://dbpedia.org/resource/George_Washington_...   
245    http://dbpedia.org/resource/Gethsemane_Cemetery   

                                              Category        Type  \
241  http://dbpedia.org/resource/Category:Cemeterie...  Cemeteries   
242  http://dbpedia.org/resource/Category:Cemeterie...  Cemeteries   
243  http://dbpedia.org/resource/Category:Cemeterie...  Cemeteries   
244  http://dbpedia.org/resource/Category:Cemeterie...  Cemeteries   
245  http://dbpedia.org/resource/Category:Cemeterie...  Cemeteries   

                      Location  
241  Bergen County, New Jersey  
242  Bergen County, New Jersey  
243  Bergen County, New Jersey  
244  Bergen County, New Jersey  
245  Bergen County, New Jersey  


In [7]:

# -------------------------------
# 6. Build a mini Knowledge Graph (RDF) locally
# -------------------------------
g = Graph()
g.bind("ex", EX)
g.bind("rdfs", RDFS)

for _, row in df.iterrows():
    poi_uri = URIRef(row["POI"])
    type_uri = EX[row["Type"].replace(" ", "_")]
    location_uri = EX[row["Location"].replace(" ", "_")]

    # Store relationships
    g.add((poi_uri, RDF.type, type_uri))
    g.add((poi_uri, EX.hasLocation, location_uri))

    # Add hierarchy (type subclassOf Visitor_attractions)
    g.add((type_uri, RDFS.subClassOf, EX.Visitor_attractions))

print(f"\nGenerated RDF triples: {len(g)}")

# Save graph to file
g.serialize(destination="visitor_attractions_kg.ttl", format="turtle")
print("Graph saved to visitor_attractions_kg.ttl")



Generated RDF triples: 10051
Graph saved to visitor_attractions_kg.ttl


In [8]:
g.parse("visitor_attractions_kg.ttl", format="turtle")
# Print a few example triples
print(f"Total triples: {len(g)}\n")
for i, triple in enumerate(g):
    print(triple)
    if i == 9:  # show first 10 triples only
        break


Total triples: 10051

(rdflib.term.URIRef("http://dbpedia.org/resource/St_Thomas'_Church,_Prague"), rdflib.term.URIRef('http://example.org/kg/hasLocation'), rdflib.term.URIRef('http://example.org/kg/Prague'))
(rdflib.term.URIRef('http://dbpedia.org/resource/Corum_(Montpellier)'), rdflib.term.URIRef('http://example.org/kg/hasLocation'), rdflib.term.URIRef('http://example.org/kg/Montpellier'))
(rdflib.term.URIRef('http://dbpedia.org/resource/NRCA_Stadium'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://example.org/kg/Sports_venues'))
(rdflib.term.URIRef('http://dbpedia.org/resource/Herndon_ArtSpace'), rdflib.term.URIRef('http://example.org/kg/hasLocation'), rdflib.term.URIRef('http://example.org/kg/Fairfax_County,_Virginia'))
(rdflib.term.URIRef('http://dbpedia.org/resource/Bird_Kingdom'), rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), rdflib.term.URIRef('http://example.org/kg/Aviaries'))
(rdflib.term.URIRef('htt

In [9]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

DBPEDIA_SPARQL = "https://dbpedia.org/sparql"
sparql = SPARQLWrapper(DBPEDIA_SPARQL)
sparql.setReturnFormat(JSON)

# Example: Get mappings for all POIs we already discovered
def get_wikidata_mapping(poi_uri):
    query = f"""
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    SELECT DISTINCT ?wikidata WHERE {{
      <{poi_uri}> owl:sameAs ?wikidata .
      FILTER(STRSTARTS(STR(?wikidata), "http://www.wikidata.org/entity/"))
    }}
    """
    sparql.setQuery(query)
    results = sparql.query().convert()
    return [r["wikidata"]["value"] for r in results["results"]["bindings"]]

# Example test
poi = "http://dbpedia.org/resource/Palau_Reial_Major"
print(get_wikidata_mapping(poi))


['http://www.wikidata.org/entity/Q1116546']


In [10]:
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
wd_sparql = SPARQLWrapper(WIKIDATA_SPARQL)
wd_sparql.setReturnFormat(JSON)

def get_wikidata_types(wd_entity_uri):
    query = f"""
    SELECT DISTINCT ?type ?typeLabel WHERE {{
      <{wd_entity_uri}> wdt:P31 ?type .
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    wd_sparql.setQuery(query)
    results = wd_sparql.query().convert()
    return [(r["type"]["value"], r["typeLabel"]["value"]) for r in results["results"]["bindings"]]

# Example test
print(get_wikidata_types("http://www.wikidata.org/entity/Q1116546"))


[('http://www.wikidata.org/entity/Q16560', 'palace')]


In [11]:
def get_superclasses(wd_type_uri):
    query = f"""
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wikibase: <http://wikiba.se/ontology#>
    PREFIX bd: <http://www.bigdata.com/rdf#>

    SELECT DISTINCT ?super ?superLabel WHERE {{
      <{wd_type_uri}> (wdt:P279 | wdt:P279/wdt:P279 | wdt:P279/wdt:P279/wdt:P279) ?super .
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    wd_sparql.setQuery(query)
    wd_sparql.setReturnFormat(JSON)
    results = wd_sparql.query().convert()
    return [(r["super"]["value"], r["superLabel"]["value"]) for r in results["results"]["bindings"]]


# Example: traverse 3 levels up from museum (Q33506)
print(get_superclasses("http://www.wikidata.org/entity/Q16560"))


[('http://www.wikidata.org/entity/Q386724', 'work'), ('http://www.wikidata.org/entity/Q989946', 'shelter'), ('http://www.wikidata.org/entity/Q811979', 'architectural structure'), ('http://www.wikidata.org/entity/Q618123', 'geographical feature'), ('http://www.wikidata.org/entity/Q11755880', 'residential building'), ('http://www.wikidata.org/entity/Q134301417', 'E25 Human-Made Feature'), ('http://www.wikidata.org/entity/Q811430', 'fixed construction'), ('http://www.wikidata.org/entity/Q1403389', 'residential property'), ('http://www.wikidata.org/entity/Q11908691', 'artificial physical structure'), ('http://www.wikidata.org/entity/Q8205328', 'artificial physical object'), ('http://www.wikidata.org/entity/Q15710813', 'physical structure'), ('http://www.wikidata.org/entity/Q16686448', 'artificial object'), ('http://www.wikidata.org/entity/Q3947', 'house'), ('http://www.wikidata.org/entity/Q41176', 'building'), ('http://www.wikidata.org/entity/Q108147624', 'refuge')]


In [12]:
def to_dbpedia_uri(location_string):
    # Replace spaces with underscores and prepend DBpedia prefix
    clean = location_string.strip().replace(" ", "_")
    return f"http://dbpedia.org/resource/{clean}"

to_dbpedia_uri("Barcelona")
# → http://dbpedia.org/resource/Barcelona


'http://dbpedia.org/resource/Barcelona'

In [13]:
DBPEDIA_SPARQL = "https://dbpedia.org/sparql"
sparql = SPARQLWrapper(DBPEDIA_SPARQL)
sparql.setReturnFormat(JSON)

def resolve_redirect(uri):
    query = f"""
    PREFIX dbo: <http://dbpedia.org/ontology/>
    SELECT ?target WHERE {{
      <{uri}> dbo:wikiPageRedirects ?target .
    }}
    """
    sparql.setQuery(query)
    results = sparql.query().convert()
    bindings = results["results"]["bindings"]
    if bindings:
        return bindings[0]["target"]["value"]  # redirected URI
    return uri  # no redirect found

# Example:
print(resolve_redirect("http://dbpedia.org/resource/Chicago,_Illinois"))
# → http://dbpedia.org/resource/Chicago


http://dbpedia.org/resource/Chicago


In [14]:
def get_sameas_links(dbpedia_uri):
    query = f"""
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    SELECT DISTINCT ?same WHERE {{
      <{dbpedia_uri}> owl:sameAs ?same .
    }}
    """
    sparql.setQuery(query)
    results = sparql.query().convert()
    return [b["same"]["value"] for b in results["results"]["bindings"]]

# Example
links = get_sameas_links("http://dbpedia.org/resource/Barcelona")
for l in links:
    print(l)


http://lt.dbpedia.org/resource/Barselona
http://rdf.freebase.com/ns/m.01f62
http://linkedgeodata.org/triplify/node152364165
http://viaf.org/viaf/124213732
http://api.nytimes.com/svc/semantic/v2/concept/name/nytd_geo/Barcelona%20(Spain)
http://linked-web-apis.fit.cvut.cz/resource/barcelona_city
http://sw.cyc.com/concept/Mx4rvVkLC5wpEbGdrcN5Y29ycA
http://sws.geonames.org/3128760/
http://yago-knowledge.org/resource/Barcelona
http://d-nb.info/gnd/4004503-1
http://commons.dbpedia.org/resource/Barcelona
http://d-nb.info/gnd/8720-8
http://musicbrainz.org/area/12c3b82e-fcab-4219-9bd5-792089d8280e
http://viaf.org/viaf/241408827
http://www.wikidata.org/entity/Q103145688
http://www.wikidata.org/entity/Q1492
http://sws.geonames.org/6356055/
http://af.dbpedia.org/resource/Barcelona
http://als.dbpedia.org/resource/Barcelona
http://am.dbpedia.org/resource/ባርሴሎና
http://an.dbpedia.org/resource/Barcelona
http://ar.dbpedia.org/resource/برشلونة
http://arz.dbpedia.org/resource/بارسلونا
http://ast.dbpedia.o

In [15]:
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
wd_sparql = SPARQLWrapper(WIKIDATA_SPARQL)
wd_sparql.setReturnFormat(JSON)

def is_city_wikidata(wd_uri):
    query = f"""
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wd: <http://www.wikidata.org/entity/>
    ASK {{
      <{wd_uri}> (wdt:P31/wdt:P279*) wd:Q515 .
    }}
    """
    wd_sparql.setQuery(query)
    return wd_sparql.query().convert()["boolean"]

# Example
print(is_city_wikidata("http://www.wikidata.org/entity/Q1492"))  # Barcelona
# → True


True


In [16]:
def is_city_geonames(geonames_uri, cities1000_path="cities1000.txt"):
    geoname_id = geonames_uri.split("/")[-2]
    with open(cities1000_path, encoding="utf-8") as f:
        for line in f:
            if line.startswith(geoname_id + "\t"):
                return True
    return False


In [17]:
def link_poi_to_city(location_string):
    db_uri = to_dbpedia_uri(location_string)
    resolved = resolve_redirect(db_uri)
    links = get_sameas_links(resolved)

    wd_links = [l for l in links if "wikidata.org/entity" in l]
    geo_links = [l for l in links if "geonames.org" in l]

    is_city = False
    if wd_links:
        is_city = is_city_wikidata(wd_links[0])
    elif geo_links:
        is_city = is_city_geonames(geo_links[0])

    return {
        "location_string": location_string,
        "dbpedia_uri": resolved,
        "wikidata_uri": wd_links[0] if wd_links else None,
        "geonames_uri": geo_links[0] if geo_links else None,
        "is_city": is_city
    }

print(link_poi_to_city("Chicago,_Illinois"))


{'location_string': 'Chicago,_Illinois', 'dbpedia_uri': 'http://dbpedia.org/resource/Chicago', 'wikidata_uri': 'http://www.wikidata.org/entity/Q1297', 'geonames_uri': 'http://sws.geonames.org/4887398/', 'is_city': True}


In [19]:
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, OWL, XSD

def create_poi_city_triples(poi_uri, type_string, location_info):
    g = Graph()

    EX = Namespace("http://example.org/ontology/")
    DBR = Namespace("http://dbpedia.org/resource/")
    WD = Namespace("http://www.wikidata.org/entity/")
    GEO = Namespace("http://sws.geonames.org/")
    
    poi = URIRef(poi_uri)
    city = URIRef(location_info["dbpedia_uri"])

    # POI triples
    g.add((poi, RDF.type, EX[type_string]))
    g.add((poi, EX.locatedIn, city))
    g.add((poi, EX.typeString, Literal(type_string)))
    g.add((poi, EX.locationString, Literal(location_info["location_string"])))

    # City triples
    g.add((city, RDF.type, EX.City))
    if location_info.get("wikidata_uri"):
        g.add((city, OWL.sameAs, URIRef(location_info["wikidata_uri"])))
    if location_info.get("geonames_uri"):
        g.add((city, OWL.sameAs, URIRef(location_info["geonames_uri"])))
    g.add((city, EX.isVerifiedCity, Literal(location_info["is_city"], datatype=XSD.boolean)))

    return g

# Example usage:
location_info = {
    "location_string": "Barcelona",
    "dbpedia_uri": "http://dbpedia.org/resource/Barcelona",
    "wikidata_uri": "http://www.wikidata.org/entity/Q1492",
    "geonames_uri": "http://sws.geonames.org/3128760/",
    "is_city": True
}

g = create_poi_city_triples("http://dbpedia.org/resource/Palau_Reial_Major", "Palaces", location_info)
print(g.serialize(format="turtle"))


@prefix ns1: <http://example.org/ontology/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<http://dbpedia.org/resource/Palau_Reial_Major> a ns1:Palaces ;
    ns1:locatedIn <http://dbpedia.org/resource/Barcelona> ;
    ns1:locationString "Barcelona" ;
    ns1:typeString "Palaces" .

<http://dbpedia.org/resource/Barcelona> a ns1:City ;
    ns1:isVerifiedCity true ;
    owl:sameAs <http://sws.geonames.org/3128760/>,
        <http://www.wikidata.org/entity/Q1492> .


