Cap the number of URI objects containing a given substring (by default "resource") for each (subject, predicate) pair in a Turtle file.

CONFIG (edit below):
- INPUT_TTL: path to your TTL file
- LIMIT: max number of URI objects containing NEEDLE to keep per (s, p)
- NEEDLE: substring to match inside URI objects
- MAKE_BACKUP: if True, writes INPUT_TTL + ".bak" before replacing in place

Requires:
    pip install rdflib

In [1]:
from rdflib import Graph, URIRef
import shutil
import os
from pathlib import Path
from collections import defaultdict

In [2]:
# Graph shared between notebook cells so the Turtle file is parsed only once.
global_graph = None
# Running total of triples removed across every trim pass.
removed_count = 0

def trim_resource_links_inplace(
    ttl_path: Path, limit: int = 2, needle: str = "resource") -> None:
    """Cap matching URI objects per (subject, predicate) in the cached graph.

    For every (subject, predicate) pair, the URIRef objects whose string form
    contains ``needle`` are collected. If a pair has more than ``limit`` of
    them, only the first ``limit`` in lexical order are kept and the rest are
    removed from the graph.

    The graph is parsed from ``ttl_path`` on the first call and stored in the
    module-level ``global_graph``; later calls reuse that graph and ignore
    ``ttl_path``. Nothing is written to disk here. The module-level
    ``removed_count`` is incremented once per removed triple.

    Args:
        ttl_path: Turtle file to parse when no graph has been cached yet.
        limit: Maximum number of matching objects to keep per pair (>= 0).
        needle: Substring looked up in ``str(object)`` of URIRef objects.

    Raises:
        ValueError: If ``limit`` is negative.
    """
    global global_graph
    global removed_count

    # A negative limit would make the slice below keep "all but the last
    # few" matches, which is never what the caller means.
    if limit < 0:
        raise ValueError(f"limit must be >= 0, got {limit}")

    # Compare against None explicitly: an rdflib Graph with zero triples is
    # falsy (it defines __len__), so a truthiness test would re-parse it.
    if global_graph is not None:
        g = global_graph
        print("Graph Exists")
    else:
        g = Graph()
        g.parse(str(ttl_path), format="turtle")
        # Cache right after a successful parse so the graph and
        # removed_count stay in step even if a later step raises.
        global_graph = g

    print("Graph loaded")

    hits = defaultdict(list)  # (s, p) -> [o, ...]
    for s, p, o in g:
        if isinstance(o, URIRef) and needle in str(o):
            hits[(s, p)].append(o)
    print("Hits detected")

    # Removal happens only after the scan above has finished, so the graph
    # is never mutated while it is being iterated.
    for (s, p), objs in hits.items():
        if len(objs) <= limit:
            continue
        # deterministic: keep the first `limit` by lexical order, drop the rest
        for o in sorted(objs, key=str)[limit:]:
            g.remove((s, p, o))
            removed_count += 1
    print("excess items removed")

In [3]:
def save_to_disk(ttl_path, make_backup: bool = True):
    """Write the cached graph back over ``ttl_path``.

    The graph held in the module-level ``global_graph`` is serialized as
    Turtle to a sibling ``<name>.tmp`` file. Optionally the current file is
    copied to ``<name>.bak``. The temp file is then moved over ``ttl_path``
    with ``os.replace``. If any step fails, the temp file is deleted and the
    original file is left untouched.

    Args:
        ttl_path: Destination Turtle file (``Path`` or ``str``).
        make_backup: When True, copy the existing file to ``<name>.bak``
            before replacing it.

    Raises:
        RuntimeError: If no graph has been loaded yet.
    """
    # Only read here, so no `global` declarations are needed.
    g = global_graph
    if g is None:
        raise RuntimeError(
            "No graph loaded; call trim_resource_links_inplace() first."
        )

    # Accept plain strings as well as Path objects.
    ttl_path = Path(ttl_path)
    tmp_path = ttl_path.with_suffix(ttl_path.suffix + ".tmp")

    try:
        # Serialize to a temp file first so a failed write cannot corrupt
        # the original.
        g.serialize(destination=str(tmp_path), format="turtle")

        print("temp ttl serialized")

        # Optional backup
        if make_backup:
            bak_path = ttl_path.with_suffix(ttl_path.suffix + ".bak")
            shutil.copyfile(ttl_path, bak_path)
            print("backup created")

        # Swap the new file into place in a single rename
        os.replace(tmp_path, ttl_path)
    finally:
        # After a successful replace the temp file no longer exists; this
        # only cleans up when an earlier step raised.
        if tmp_path.exists():
            tmp_path.unlink()

    print(
        f"Done. Removed {removed_count} triple(s). "
        f"Updated file written in place to: {ttl_path.name}"
        + (f" (backup at {ttl_path.name}.bak)" if make_backup else "")
    )

In [4]:
# -------------------- CONFIG --------------------
INPUT_TTL = "big_one-test.ttl"   # path to the Turtle file; parsed by the first trim pass and overwritten on save
LIMIT = 2             # keep at most this many matching URIs per (subject, predicate) pair
NEEDLE = "resource"    # substring to match in URIRef object strings
MAKE_BACKUP = True          # copy INPUT_TTL to INPUT_TTL + ".bak" before replacing it
# ------------------------------------------------

In [5]:
# Pass 1: cap URI objects containing NEEDLE at LIMIT per (subject, predicate).
trim_resource_links_inplace(Path(INPUT_TTL), limit=LIMIT, needle=NEEDLE)

Graph loaded
Hits detected
excess items removed


In [6]:
NEEDLE_2 = "Special:FilePath"    # second-pass substring to match in URIRef object strings

In [7]:
# Pass 2: same cap, applied to URI objects containing NEEDLE_2.
trim_resource_links_inplace(Path(INPUT_TTL), limit=LIMIT, needle=NEEDLE_2)

Graph Exists
Graph loaded
Hits detected
excess items removed


In [8]:
# NOTE(review): "rdf:langString" is a prefixed name. The trim function only
# inspects URIRef objects and matches against their string form, which is
# presumably the expanded IRI, so this pass likely matches nothing. Confirm
# whether language-tagged literals were the intended target.
NEEDLE_3 = "rdf:langString"    # third-pass substring to match in URIRef object strings

In [9]:
# Pass 3: same cap, applied to URI objects containing NEEDLE_3.
trim_resource_links_inplace(Path(INPUT_TTL), limit=LIMIT, needle=NEEDLE_3)

Graph Exists
Graph loaded
Hits detected
excess items removed


In [10]:
# Display the running total of triples removed by the trim passes so far.
removed_count

2188208

In [11]:
# Write the trimmed graph back over INPUT_TTL, honouring MAKE_BACKUP.
save_to_disk(Path(INPUT_TTL), make_backup=MAKE_BACKUP)

temp ttl serialized
backup created
Done. Removed 2188208 triple(s). Updated file written in place to: big_one-test.ttl (backup at big_one-test.ttl.bak)
