In [1]:
pip install --upgrade owlready2 wikibaseintegrator python-dotenv tqdm

Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, python-dotenv
[2K  Attempting uninstall: tqdm
[2K    Found existing installation: tqdm 4.66.6
[2K    Uninstalling tqdm-4.66.6:
[2K      Successfully uninstalled tqdm-4.66.6
[2K  Attempting uninstall: python-dotenv
[2K    Found existing installation: python-dotenv 1.1.0
[2K    Uninstalling python-dotenv-1.1.0:
[2K      Successfully uninstalled python-dotenv-1.1.0
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [python-dotenv]
[1A[2KSuccessfully installed python-dotenv-1.1.1 tqdm-4.67.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
# phase0_scaffold.py ----------------------------------------------------------
from dotenv import load_dotenv
import owlready2 as owl
from wikibaseintegrator import wbi_login, WikibaseIntegrator
import os, pathlib
from wikibaseintegrator.wbi_config import config

# ── Globals ────────────────────────────────────────────────────────────────
MEDIAWIKI_API_URL   = "https://everything4everyone.wikibase.cloud/w/api.php"
WIKIBASE_SPARQL_URL = "https://everything4everyone.wikibase.cloud/query/sparql"
ONTOLOGY_FILE       = "slr_reviewed.owl"

# ── read environment.env ───────────────────────────────────────────────────
# load_dotenv() does not raise when the file or a variable is missing, so the
# credentials are checked explicitly instead of failing later with an opaque login error.
load_dotenv("environment.env")
user = os.getenv("WIKIBASE_USER")
pw   = os.getenv("WIKIBASE_PASSWORD")
if not user or not pw:
    raise RuntimeError(
        "WIKIBASE_USER / WIKIBASE_PASSWORD are not set — check environment.env"
    )

# ── Fix WBI’s global config  (must match the login URL)  ───────────────────
config['MEDIAWIKI_API_URL']   = MEDIAWIKI_API_URL
config['SPARQL_ENDPOINT_URL'] = WIKIBASE_SPARQL_URL
config['USER_AGENT']          = "SLR-OntologyImporter/1.0 (you@example.org)"

# ── 1. Login test ──────────────────────────────────────────────────────────
login = wbi_login.Login(
    user=user,
    password=pw
    # mediawiki_api_url argument is optional now; config already has it
)
wbi = WikibaseIntegrator(login=login)
# Fetching the edit token proves the session works; the token itself is deliberately
# not printed so that no part of it ends up in the saved notebook output.
login.get_edit_token()
print("✓ Connected — edit token acquired")

# ── 2. Ontology parse test ────────────────────────────────────────────────
onto_path = pathlib.Path(ONTOLOGY_FILE).expanduser().resolve()   # ★ make absolute
if not onto_path.is_file():
    raise FileNotFoundError(f"{onto_path} not found")

onto = owl.get_ontology(onto_path.as_uri()).load()
# Count triples by iterating; no need to materialise all of them in a list first.
triples = sum(1 for _ in onto.world.as_rdflib_graph())
print(f"✓ Ontology parsed — {triples:,} triples loaded")



✓ Connected — edit token: c9ddf179 …


wptmp:entity#data importing does not look like a valid URI, trying to serialize this will break.
wptmp:entity#data importing does not look like a valid URI, trying to serialize this will break.
wptmp:entity#export data does not look like a valid URI, trying to serialize this will break.
wptmp:entity#export data does not look like a valid URI, trying to serialize this will break.
wptmp:entity#export data does not look like a valid URI, trying to serialize this will break.


✓ Ontology parsed — 14,561 triples loaded


In [2]:
from collections import Counter

# Materialise each kind of ontology element once; the later phases reuse these lists.
classes         = [*onto.classes()]
obj_properties  = [*onto.object_properties()]
data_properties = [*onto.data_properties()]
anno_properties = [*onto.annotation_properties()]
# Original author's note: <rdf:Description> nodes are ignored here, as planned.
individuals     = [*onto.individuals()]

# Element counts, kept in display order -------------------------------------
stats = Counter()
stats["Classes"]           = len(classes)
stats["Object properties"] = len(obj_properties)
stats["Data properties"]   = len(data_properties)
stats["Annotation props"]  = len(anno_properties)
stats["Individuals"]       = len(individuals)

print("\n── summary ─────────────────────")
for kind_name, how_many in stats.items():
    print(f"{kind_name:<18} {how_many:>6}")


── summary ─────────────────────
Classes                 7
Object properties      13
Data properties         0
Annotation props        7
Individuals           956


In [3]:
# Optional helper: look at the first few labels of a list of ontology elements
def peek(lst, n=3):
    """Return display names for the first ``n`` entries of ``lst``.

    Each entry contributes ``label.first()`` when that value is truthy and its
    ``name`` otherwise.
    """
    shown = []
    for entry in lst[:n]:
        first_label = entry.label.first()
        shown.append(first_label if first_label else entry.name)
    return shown

# Show a small sample per element kind, in the same order as the summary table.
print("\nSample labels:")
for heading, elements in (
    ("  Classes           :", classes),
    ("  Obj-properties    :", obj_properties),
    ("  Data-properties   :", data_properties),
    ("  Annotation props  :", anno_properties),
    ("  Individuals       :", individuals),
):
    print(heading, peek(elements))
print("────────────────────────────────────────\n")


Sample labels:
  Classes           : ['Contribution', 'process', 'data item']
  Obj-properties    : ['has data format specification', 'has data item', 'has data model']
  Data-properties   : []
  Annotation props  : ['description', 'alias', 'source']
  Individuals       : ['Struck et al. - 1984 - EVALUATION OF OPERATIONAL LOACS TO VERIFY STRUCTUR', 'flight test', 'information search']
────────────────────────────────────────



In [22]:
# Run once with DRY_RUN = True to preview the import, then flip it to False for real writes.
# ── Run-mode switches ───────────────────────────────────────────────────────
DRY_RUN = False                  # True → the upsert helpers only report, nothing is written
SLEEP_BETWEEN_WRITES = 0.5       # seconds to pause after each write

# ── Fixed Wikibase identifiers and files ────────────────────────────────────
ONTOLOGY_CLASS_QID = "Q1"        # item that means “Ontology class”
CLASS_MEMBERSHIP_PID = "P1"      # property that links an item to its class item
PROPERTY_MAP_FILE = "property_map.json"

# ── How each annotation property (keyed by lower-cased label) is imported ──
# Handled without a dedicated Wikibase property:
_BUILTIN_ANNOTATIONS = {
    "description": "builtin-desc",     # becomes the item description
    "alias": "builtin-alias",          # becomes an item alias
    "plural": "builtin-alias",         # also stored as an alias
    "instance of": "ignore",           # we already add P1→class
}
# Turned into their own property: (datatype, property description)
_PROPERTY_ANNOTATIONS = {
    "source": ("string", "Source for this entity"),
    "wikidata_uri": ("external-id", "Matching Wikidata entity"),
    "orkg_id": ("external-id", "Open Research Knowledge Graph ID"),
}
ANNOTATION_HANDLERS = {**_BUILTIN_ANNOTATIONS, **_PROPERTY_ANNOTATIONS}

# fallback for any other annotation prop we decide to turn into a property
DEFAULT_ANNOTATION_DATATYPE = ("string", "Imported annotation")

In [15]:
from functools import cache
from wikibaseintegrator import wbi_helpers
import time
import os, pathlib, json
from wikibaseintegrator.wbi_exceptions import MissingEntityException

# Only successful lookups are remembered. A "not found" must never be cached, because the
# next step of an upsert usually creates exactly that entity, and a cached miss would then
# make a later lookup of the same label create a duplicate.
_SEARCH_HITS = {}

# The entity search is not an exact-label lookup, so several candidates are requested
# and filtered locally.
_SEARCH_CANDIDATES = 20


def _normalize_label(text) -> str:
    """Return ``text`` NFC-normalised with whitespace trimmed and collapsed, for comparison."""
    import unicodedata

    return " ".join(unicodedata.normalize("NFC", text or "").split())


def search_entity(label: str, etype: str = "item"):
    """Return the QID/PID of the entity whose English label equals ``label``, or None.

    Parameters
    ----------
    label : label to look for; compared after Unicode/whitespace normalisation.
    etype : entity type passed to the search as ``search_type`` ("item" or "property").

    Returns
    -------
    The id of the first search hit whose label matches exactly, otherwise ``None``.
    Hits that matched only by prefix or alias are ignored, so callers never update
    (and relabel) an unrelated entity.
    """
    key = (label, etype)
    if key in _SEARCH_HITS:
        return _SEARCH_HITS[key]

    hits = wbi_helpers.search_entities(
        label, language="en", search_type=etype,
        max_results=_SEARCH_CANDIDATES, dict_result=True
    )

    wanted = _normalize_label(label)
    for hit in hits:
        if _normalize_label(hit.get("label")) == wanted:
            _SEARCH_HITS[key] = hit["id"]
            return hit["id"]
    return None


# Kept for compatibility with the former functools.cache wrapper.
search_entity.cache_clear = _SEARCH_HITS.clear



def upsert_property(label, description, datatype):
    """Create or update the Wikibase property called ``label`` and return its PID.

    In dry-run mode nothing is contacted and the placeholder ``DRY_<label>`` is returned.
    Otherwise the property found by ``search_entity`` is loaded (a fresh one of the given
    ``datatype`` is started when none is found or the found one no longer exists), its
    English label and description are set, and it is written back.
    """
    if DRY_RUN:
        return f"DRY_{label}"

    known_pid = search_entity(label, "property")
    try:
        if known_pid:
            prop = wbi.property.get(known_pid)
        else:
            prop = wbi.property.new(datatype=datatype)
    except MissingEntityException:
        # the lookup pointed at a property that has been deleted; start a new one
        prop = wbi.property.new(datatype=datatype)

    prop.labels.set("en", label)
    prop.descriptions.set("en", description if description else "")

    written_pid = prop.write(login=login).id
    time.sleep(SLEEP_BETWEEN_WRITES)      # throttle consecutive writes
    return written_pid


def upsert_item(label: str, description: str | None, aliases: list[str] | None,
                claims: list) -> str:
    """Create/update a Wikibase item and return its QID (DRY-RUN safe)."""
    if DRY_RUN:
        qid = f"DRY_{label}"
        print(f"[DRY] Item: {label} → {qid}")
        return qid

    qid = search_entity(label, "item")
    item = wbi.item.get(qid) if qid else wbi.item.new()
    item.labels.set("en", label)
    if description:
        item.descriptions.set("en", description)
    if aliases:
        item.aliases.set("en", aliases)

    for c in claims:
        item.claims.add(c)

    qid = item.write(login=login).id
    time.sleep(SLEEP_BETWEEN_WRITES)
    return qid

In [16]:
# ────────────────────────────────────────────────────────────────────────────
# Phase 2 : create / update properties  (DRY_RUN safe)
# ────────────────────────────────────────────────────────────────────────────
from wikibaseintegrator.datatypes import URL, ExternalID, String


property_map = dict()                    # label → PID (or DRY_label)

def ensure_property(label, datatype, descr="Imported from OWL") -> str:
    """Make sure a property named ``label`` exists and record its PID in ``property_map``.

    In dry-run mode a ``DRY_<label>`` placeholder is reported and recorded; otherwise the
    work is delegated to ``upsert_property``. The recorded PID is also returned.
    """
    if not DRY_RUN:
        pid = upsert_property(label, descr, datatype)
    else:
        pid = f"DRY_{label}"
        print(f"[DRY] Property({datatype}): {label}  → {pid}")

    property_map[label] = pid
    return pid

print("\n→ Phase 2: Wikibase properties")

# 2-1 / 2-2  Object properties become "wikibase-item" properties, data properties become
#            "string" properties (none in the current ontology, but the loop stays).
for owl_props, wb_datatype in (
    (obj_properties, "wikibase-item"),
    (data_properties, "string"),
):
    for owl_prop in owl_props:
        ensure_property(owl_prop.label.first() or owl_prop.name, wb_datatype)

# 2-3  Annotation properties that need their own PID: only handlers given as a
#      (datatype, description) tuple, e.g. ("string", "Source for this entity").
for anno in anno_properties:
    anno_label = (anno.label.first() or anno.name).strip().lower()
    spec = ANNOTATION_HANDLERS.get(anno_label)
    if not isinstance(spec, tuple):
        continue
    anno_datatype, anno_descr = spec
    ensure_property(anno_label, anno_datatype, anno_descr)

print(f"✓ Properties prepared ({len(property_map)} total)")

# 2-4  Persist the map, except in dry-run mode where it only holds placeholders
if not DRY_RUN:
    with open(PROPERTY_MAP_FILE, "w") as fh:
        json.dump(property_map, fh, indent=2)
    print(f"✓ Saved property map → {PROPERTY_MAP_FILE}")
else:
    print("[DRY] property_map.json NOT written")



→ Phase 2: Wikibase properties
✓ Properties prepared (16 total)
✓ Saved property map → property_map.json


In [17]:
# Reload the persisted label → PID map (lets the later phases run without redoing Phase 2).
# In dry-run mode Phase 2 writes no file, so loading here would either fail or replace the
# in-memory DRY_* placeholders with PIDs left over from an earlier real run.
if DRY_RUN:
    print("[DRY] property_map.json NOT loaded; keeping the in-memory map")
else:
    with open(PROPERTY_MAP_FILE, encoding="utf-8") as fh:
        property_map = json.load(fh)

In [18]:
# ────────────────────────────────────────────────────────────────────────────
# Phase 3 : create / update class items
# ────────────────────────────────────────────────────────────────────────────
from wikibaseintegrator.datatypes import Item as WBItem
iri_to_qid = dict()                     # IRI → QID; individuals are added in Phase 4

print("\n→ Phase 3: class items")
for owl_class in classes:
    # every class item is linked to the “Ontology class” item
    membership = WBItem(prop_nr=CLASS_MEMBERSHIP_PID, value=ONTOLOGY_CLASS_QID)
    iri_to_qid[owl_class.iri] = upsert_item(
        label=owl_class.label.first() or owl_class.name,
        description=owl_class.comment.first(),
        aliases=None,
        claims=[membership],
    )

print(f"✓ Classes imported ({len(iri_to_qid)} items)")



→ Phase 3: class items
✓ Classes imported (7 items)


In [19]:
# ────────────────────────────────────────────────────────────────────────────
# Phase 4 : individuals – basic data & annotations
# ────────────────────────────────────────────────────────────────────────────
from collections import defaultdict
from wikibaseintegrator.datatypes import String, URL, ExternalID

def extract_annotations(entity):
    """Return (description, aliases, extra_claims) from annotation props.

    Every annotation property in ``anno_properties`` is looked up on ``entity`` by name
    and routed according to ``ANNOTATION_HANDLERS`` (keyed by lower-cased label):

    * ``"builtin-desc"``  → first non-blank value becomes the description
    * ``"builtin-alias"`` → values are collected as aliases (duplicates dropped)
    * ``(datatype, …)``   → one claim per value on the PID stored in ``property_map``
    * anything else       → ignored

    Values are converted to stripped strings and blank ones are dropped. If a property
    has no PID in ``property_map`` a warning is issued and its values are skipped, so a
    claim is never built with ``prop_nr=None``.
    """
    import warnings

    desc    = None
    aliases = []
    claims  = []

    # datatype name → claim constructor; anything unlisted is stored as a plain string
    claim_types = {"url": URL, "external-id": ExternalID}

    for ap in anno_properties:
        lbl = (ap.label.first() or ap.name).strip().lower()

        raw = getattr(entity, ap.name, None)
        if raw is None:
            continue
        if not isinstance(raw, (list, tuple, set)):
            raw = [raw]                      # tolerate a single, non-list value
        values = [text for text in (str(v).strip() for v in raw) if text]
        if not values:
            continue

        handler = ANNOTATION_HANDLERS.get(lbl)
        if handler == "builtin-desc":
            desc = values[0]
        elif handler == "builtin-alias":
            for alias in values:
                if alias not in aliases:
                    aliases.append(alias)
        elif isinstance(handler, tuple):          # we made a property for it
            pid = property_map.get(lbl)
            if not pid:
                warnings.warn(
                    f"no PID in property_map for annotation {lbl!r}; values skipped"
                )
                continue
            dtype, _ = handler
            make_claim = claim_types.get(dtype, String)
            for v in values:
                claims.append(make_claim(value=v, prop_nr=pid))
    return desc, aliases, claims

print("\n→ Phase 4: individual items")
for ind in individuals:
    label = ind.label.first() or ind.name

    # Link to the first asserted type that was imported as a class item in Phase 3.
    # Entries of is_a that expose no .iri (presumably anonymous class constructs) or that
    # were not imported are skipped instead of raising or silently dropping the class link.
    class_qid = next(
        (
            iri_to_qid[owl_type.iri]
            for owl_type in ind.is_a
            if getattr(owl_type, "iri", None) in iri_to_qid
        ),
        None,
    )

    desc, aliases, extra_claims = extract_annotations(ind)

    claims = list(extra_claims)          # copy: do not mutate the helper's result
    if class_qid:
        claims.append(WBItem(prop_nr=CLASS_MEMBERSHIP_PID, value=class_qid))

    qid = upsert_item(
        label=label,
        description=desc,
        aliases=aliases,
        claims=claims
    )
    iri_to_qid[ind.iri] = qid

print(f"✓ Individuals imported ({len(individuals)} items)")



→ Phase 4: individual items


2025-07-07 07:04:30.671567: maxlag. sleeping for 6 seconds
2025-07-07 07:05:14.218000: maxlag. sleeping for 6 seconds
Connection error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')). Sleeping for 60 seconds.
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/urllib3/connectionpool.py", line 793, in urlopen
    response = self._make_request(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/urllib3/connectionpool.py", line 537, in _make_request
    response = conn.getresponse()
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/urllib3/connection.py", line 466, in getresponse
    httplib_response = super().getresponse()
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/http/client.py", line 1368, in getresponse
    response.begin()
  File "/Library/Frameworks/Python.framew

✓ Individuals imported (956 items)


In [23]:
# ────────────────────────────────────────────────────────────────────────────
# Phase 5 : object-property links  (hasDataItem, mentions, …)
# ────────────────────────────────────────────────────────────────────────────
print("\n→ Phase 5: object-property links")
links_added      = 0
links_skipped    = 0

for ind in individuals:
    subj_qid = iri_to_qid.get(ind.iri)
    if not subj_qid:
        continue                               # shouldn’t happen

    # Collect every link of this subject first; they are attached in a single call below.
    new_claims = []

    for prop in obj_properties:
        lbl = prop.label.first() or prop.name
        pid = property_map.get(lbl)
        if not pid:
            continue                           # sanity: property missing

        targets = getattr(ind, prop.name, None)
        if targets is None:
            continue
        if not isinstance(targets, (list, tuple, set)):
            targets = [targets]                # tolerate a single, non-list value

        for tgt in targets:
            tgt_qid = iri_to_qid.get(getattr(tgt, "iri", None))
            if not tgt_qid:
                links_skipped += 1             # external -> skip
                continue

            if DRY_RUN:
                print(f"[DRY] {subj_qid} --{lbl}/{pid}--> {tgt_qid}")
            else:
                new_claims.append(WBItem(value=tgt_qid, prop_nr=pid))

    # Fetch and write only subjects that actually received links. The claims go in as ONE
    # list: Claims.add() replaces the existing claims of a property by default, so adding
    # them one at a time would keep only the last target of each multi-valued property.
    if new_claims:
        entity = wbi.item.get(subj_qid)
        entity.claims.add(new_claims)
        entity.write(login=login)
        links_added += len(new_claims)
        time.sleep(SLEEP_BETWEEN_WRITES)

print(f"✓ Links added   : {links_added:,}")
print(f"• External targets skipped: {links_skipped:,}\n")



→ Phase 5: object-property links
✓ Links added   : 8,334
• External targets skipped: 0

