Generalized selector + ontology/usage coverage merger for RDF/XML → TTL.

Pass 1: Selection
  - Include ALL files whose filename tokens include 'class' or 'property' (the only hard-coded rule).
  - For every other auto-detected category (by branching), include at least N_PER_CATEGORY files.

Pass 2: Coverage
  - Ensure every defined predicate (rdf:Property, owl:ObjectProperty, owl:DatatypeProperty, owl:AnnotationProperty)
    appears in at least one triple (as a predicate).
  - Ensure every defined class (rdfs:Class, owl:Class) appears as an rdf:type object at least once.
  - Iteratively add files from the remaining pool that contribute missing coverage (capped at MAX_SECOND_PASS_FILES added files).

Requires:
    pip install rdflib

In [None]:
import os
import glob
import random
from pathlib import Path
from collections import defaultdict
from rdflib import Graph, URIRef
from rdflib.namespace import RDF, RDFS, OWL

# ================== CONFIG ======================
# Module-level settings read by the selection, coverage and merge functions.
INPUT_DIR          = "datasets/big_one"   # folder that is globbed for input files
FILE_GLOB_PATTERN  = "*.xml"  # glob pattern joined onto INPUT_DIR

# Selection (Pass 1)
N_PER_CATEGORY     = 1           # files taken per detected category (fewer if the category is smaller)
RANDOMIZE_PICK     = False       # True => seeded shuffle inside each category; False => first N by filename
RANDOM_SEED        = 42  # seed for the per-category shuffle when RANDOMIZE_PICK is True
VERBOSE            = True  # print progress and summary lines

# Coverage (Pass 2)
ENSURE_RELATION_COVERAGE      = True   # track defined predicates and add files until each one is used
ENSURE_CLASS_INSTANCE_COVERAGE = True  # track defined classes and add files until each one is an rdf:type object
MAX_SECOND_PASS_FILES          = 1000  # cap on files ADDED in pass 2; a falsy value (0/None) disables the cap
STOP_WHEN_FULLY_COVERED        = True  # stop scanning candidates once nothing is missing
MIN_EXAMPLES_PER_RELATION = 1   # required number of example triples, applied to both predicates and classes

# Output
# Built from the last path component of INPUT_DIR and N_PER_CATEGORY; no directory
# part, so the file is written relative to the current working directory.
OUTPUT_TTL = f"{Path(INPUT_DIR).name}_SamCat{N_PER_CATEGORY}_Covered.ttl"
# ===============================================

In [45]:
# Files whose hyphen-separated filename tokens include any of these values are
# always selected in pass 1, regardless of category.
SPECIAL_INCLUDE_TOKENS = {"class", "property"}  # the only hard-coded selection rule


# ---------- Helpers: filename tokenization & selection ----------

def split_tokens(path: Path) -> list[str]:
    """Return the lowercased filename stem of *path*, split on hyphens.

    The final suffix (e.g. ``.xml``) is not part of the stem; consecutive
    hyphens produce empty-string tokens.
    """
    lowered_stem = path.stem.lower()
    tokens = lowered_stem.split("-")
    return tokens

def collect_files(input_dir: str, pattern: str) -> list[Path]:
    """Return the regular files matching *pattern* inside *input_dir*, sorted.

    Args:
        input_dir: Directory the glob pattern is joined onto.
        pattern: Glob pattern (e.g. ``"*.xml"``).

    Returns:
        Matching paths that are regular files (directories whose names match
        are dropped), in sorted order. ``glob.glob`` yields results in
        arbitrary, filesystem-dependent order; sorting makes every later
        order-sensitive step (such as a first-match scan over the list)
        reproducible between runs and machines.
    """
    matches = (Path(p) for p in glob.glob(str(Path(input_dir) / pattern)))
    return sorted(p for p in matches if p.is_file())

def build_branching_scores(tokenized_files: dict[Path, list[str]]) -> dict[str, int]:
    """
    Count, for each token, how many distinct tokens directly follow it.

    Every adjacent (token, next token) pair across all token lists is pooled
    globally; the score of a token is the size of its set of successors.
    Tokens that only ever appear last in a list get no entry.
    """
    successors: dict[str, set[str]] = defaultdict(set)
    for tokens in tokenized_files.values():
        # Pair each token with the one immediately after it.
        for current, following in zip(tokens, tokens[1:]):
            successors[current].add(following)
    return {token: len(followers) for token, followers in successors.items()}

def choose_category_tokens(branching: dict[str, int]) -> set[str]:
    """
    Pick the tokens whose branching score reaches an adaptive threshold.

    The threshold is the larger of a fixed floor (5) and the score found at
    index ``int(0.75 * (n - 1))`` of the ascending score list (an approximate
    third quartile). An empty mapping yields an empty set.
    """
    if not branching:
        return set()

    FLOOR = 5
    ordered = sorted(branching.values())
    position = int(0.75 * (len(ordered) - 1))
    if position < 0:
        position = 0
    quartile = ordered[position]
    threshold = quartile if quartile > FLOOR else FLOOR

    chosen = set()
    for token, score in branching.items():
        if score >= threshold:
            chosen.add(token)
    return chosen

def detect_category_for_file(tokens: list[str], category_tokens: set[str]) -> str:
    """Return the first token that is a category token, or ``"root"`` if none is."""
    matches = (token for token in tokens if token in category_tokens)
    return next(matches, "root")

def select_files_pass1(files: list[Path]) -> list[Path]:
    """Pass 1: pick every 'special' file plus a sample from each detected category.

    Args:
        files: Candidate input paths.

    Returns:
        All files whose filename tokens include a value from
        ``SPECIAL_INCLUDE_TOKENS`` (sorted by filename), followed by up to
        ``N_PER_CATEGORY`` files from each remaining category. Categories are
        detected from the branching scores of the filename tokens; a file
        matching no category token lands in the ``"root"`` bucket.
        An empty input returns an empty list.
    """
    if not files:
        return []

    tokenized = {f: split_tokens(f) for f in files}
    branching = build_branching_scores(tokenized)
    category_tokens = choose_category_tokens(branching)

    # Include ALL files containing 'class' or 'property' in their tokens
    specials = sorted(
        (f for f, toks in tokenized.items() if any(t in SPECIAL_INCLUDE_TOKENS for t in toks)),
        key=lambda p: p.name,
    )
    # Set for O(1) membership; testing against the list made bucketing quadratic.
    special_set = set(specials)

    # Bucket the rest by detected category
    buckets: dict[str, list[Path]] = defaultdict(list)
    for f, toks in tokenized.items():
        if f in special_set:
            continue
        buckets[detect_category_for_file(toks, category_tokens)].append(f)

    # A negative count would slice from the end of the pool; treat it as 0.
    per_category = max(N_PER_CATEGORY, 0)

    # Select up to N_PER_CATEGORY from each non-special category
    picks: list[Path] = []
    for flist in buckets.values():
        # Always start from a filename-sorted pool so that the seeded shuffle
        # below yields the same picks regardless of the order of `files`.
        pool = sorted(flist, key=lambda p: p.name)
        if RANDOMIZE_PICK:
            # Fresh RNG per category: a category's picks do not depend on
            # which other categories exist or in what order they are visited.
            random.Random(RANDOM_SEED).shuffle(pool)
        picks.extend(pool[:per_category])

    # `specials` and `picks` are disjoint and drawn from dict keys, so the
    # concatenation is already duplicate-free.
    unique = specials + picks

    if VERBOSE:
        print("Pass 1 selection complete.")
        # Simple readout; special files are reported under their detected category.
        by_cat = defaultdict(int)
        for f in unique:
            by_cat[detect_category_for_file(tokenized[f], category_tokens)] += 1
        print("Selected per category (incl. class/property):")
        for c in sorted(by_cat):
            print(f"  - {c}: {by_cat[c]}")
        print(f"Total selected: {len(unique)}")

    return unique


# ---------- Helpers: graph analysis for coverage ----------

def defined_predicates(g: Graph) -> set[URIRef]:
    """Collect the IRI subjects typed as rdf:Property or as an OWL object, datatype or annotation property."""
    property_kinds = (
        RDF.Property,
        OWL.ObjectProperty,
        OWL.DatatypeProperty,
        OWL.AnnotationProperty,
    )
    # Non-URIRef subjects are skipped.
    return {
        subject
        for kind in property_kinds
        for subject in g.subjects(RDF.type, kind)
        if isinstance(subject, URIRef)
    }

def used_predicates(g: Graph) -> set[URIRef]:
    """Collect every URIRef that occurs in predicate position of a triple (rdf:type included)."""
    return {predicate for _, predicate, _ in g if isinstance(predicate, URIRef)}

def defined_classes(g: Graph) -> set[URIRef]:
    """Collect the IRI subjects typed as rdfs:Class or owl:Class."""
    class_kinds = (RDFS.Class, OWL.Class)
    # Non-URIRef subjects are skipped.
    return {
        subject
        for kind in class_kinds
        for subject in g.subjects(RDF.type, kind)
        if isinstance(subject, URIRef)
    }

def typed_classes(g: Graph) -> set[URIRef]:
    """Collect every URIRef that appears as the object of an rdf:type triple."""
    type_triples = g.triples((None, RDF.type, None))
    return {obj for _, _, obj in type_triples if isinstance(obj, URIRef)}


# ---------- Pass 2: coverage augmentation ----------

def augment_for_coverage(selected: list[Path], all_files: list[Path]) -> list[Path]:
    """
    Pass 2: extend the selection until defined predicates/classes have examples.

    Ensures (as far as the candidate pool allows):
      - Each defined predicate appears in at least MIN_EXAMPLES_PER_RELATION triples.
      - Each defined class is the object of at least MIN_EXAMPLES_PER_RELATION
        rdf:type triples.

    The sets of defined predicates and classes are computed once, from the
    graph built out of ``selected``; definitions that only occur in files
    added during this pass are not tracked.

    Candidates (files of ``all_files`` not in ``selected``) are scanned in
    list order, and a candidate is added as soon as it contributes a new
    triple for a still-missing predicate or class. Scanning stops when
    MAX_SECOND_PASS_FILES files have been added (a falsy value disables the
    cap) or, if STOP_WHEN_FULLY_COVERED is set, when nothing is missing.

    Args:
        selected: Files chosen in pass 1. Not modified.
        all_files: Full pool of input files.

    Returns:
        A new list: ``selected`` followed by the files added in this pass.
    """
    if not (ENSURE_RELATION_COVERAGE or ENSURE_CLASS_INSTANCE_COVERAGE):
        return list(selected)

    # Work on a copy so the caller's list is never mutated.
    result = list(selected)

    g = Graph()
    if VERBOSE:
        print("\nPass 2: Building initial graph for coverage checks...")
    for f in result:
        try:
            g.parse(str(f), format="xml")
        except Exception as e:
            # Best-effort: an unparsable file only reduces initial coverage,
            # but it is reported instead of being dropped silently.
            if VERBOSE:
                print(f"⚠️  Skipping {f.name} in coverage check: {e}")

    # Coverage counts: example triples seen per predicate / per rdf:type object.
    pred_count = defaultdict(int)
    cls_count = defaultdict(int)

    def tally(triples):
        """Add the given triples to the predicate and class example counts."""
        for _, p, o in triples:
            if isinstance(p, URIRef):
                pred_count[p] += 1
            if p == RDF.type and isinstance(o, URIRef):
                cls_count[o] += 1

    tally(g)

    defs_pred = defined_predicates(g) if ENSURE_RELATION_COVERAGE else set()
    defs_cls  = defined_classes(g)    if ENSURE_CLASS_INSTANCE_COVERAGE else set()

    def missing_preds():
        return {p for p in defs_pred if pred_count[p] < MIN_EXAMPLES_PER_RELATION}

    def missing_classes():
        return {c for c in defs_cls if cls_count[c] < MIN_EXAMPLES_PER_RELATION}

    miss_pred = missing_preds()
    miss_cls  = missing_classes()

    if VERBOSE:
        print(f"Initial missing predicates: {len(miss_pred)} | classes: {len(miss_cls)}")

    selected_set = set(result)
    candidates = [f for f in all_files if f not in selected_set]

    added = 0
    for f in candidates:
        if MAX_SECOND_PASS_FILES and added >= MAX_SECOND_PASS_FILES:
            break
        if STOP_WHEN_FULLY_COVERED and not miss_pred and not miss_cls:
            break

        temp = Graph()
        try:
            temp.parse(str(f), format="xml")
        except Exception as e:
            if VERBOSE:
                print(f"⚠️  Skipping {f.name} in coverage check: {e}")
            continue

        # Only triples not already in the merged graph are new examples.
        # Counting duplicates would overstate coverage when
        # MIN_EXAMPLES_PER_RELATION is greater than 1.
        fresh = [t for t in temp if t not in g]

        # Check predicates first; classes are only checked if that found nothing.
        adds_pred = bool(
            ENSURE_RELATION_COVERAGE
            and miss_pred
            and any(p in miss_pred for _, p, _ in fresh)
        )
        adds_cls = bool(
            not adds_pred
            and ENSURE_CLASS_INSTANCE_COVERAGE
            and miss_cls
            and any(p == RDF.type and o in miss_cls for _, p, o in fresh)
        )

        if not (adds_pred or adds_cls):
            continue

        g += temp
        result.append(f)
        added += 1

        # Update counts with the triples this file actually contributed.
        tally(fresh)

        miss_pred = missing_preds()
        miss_cls  = missing_classes()

        if VERBOSE and (added % 50 == 0 or (not miss_pred and not miss_cls)):
            print(f"Added {added} files | missing preds: {len(miss_pred)} | missing classes: {len(miss_cls)}")

    if VERBOSE:
        print(f"Final added files: {added}")
        print(f"Remaining missing preds: {len(miss_pred)} | classes: {len(miss_cls)}")

    return result


# ---------- Merge to TTL ----------

def merge_to_ttl(files: list[Path], out_path: str):
    """Parse every file as RDF/XML into one graph and serialize it as Turtle.

    Files that fail to parse are reported and skipped. The parent directory of
    ``out_path`` is created if needed (the current directory is used when
    ``out_path`` has no directory part).
    """
    g = Graph()
    total = len(files)
    if VERBOSE:
        print(f"\nMerging {total} files → {out_path}")

    for i, f in enumerate(files, start=1):
        # Progress line for the first file, every 200th file, and the last file.
        show_progress = VERBOSE and (i in (1, total) or i % 200 == 0)
        if show_progress:
            print(f"[{i}/{total}] {f.name}")
        try:
            g.parse(str(f), format="xml")
        except Exception as e:
            print(f"⚠️  Skipping {f.name}: {e}")

    if VERBOSE:
        print(f"Triples in merged graph: {len(g)}")
        print("Writing Turtle...")

    target_dir = os.path.dirname(out_path)
    if not target_dir:
        target_dir = "."
    os.makedirs(target_dir, exist_ok=True)
    g.serialize(destination=out_path, format="turtle")

    if VERBOSE:
        print("✅ Done.")


# ---------- Main ----------

def main():
    """Run the pipeline: collect files, select (pass 1), extend for coverage (pass 2), merge."""
    pool = collect_files(INPUT_DIR, FILE_GLOB_PATTERN)
    if pool:
        # Pass 1 picks the base selection; pass 2 adds files for coverage.
        chosen = select_files_pass1(pool)
        chosen = augment_for_coverage(chosen, pool)
        # Final merge of everything chosen into a single Turtle file.
        merge_to_ttl(chosen, OUTPUT_TTL)
    else:
        print(f"No files found in {INPUT_DIR} matching {FILE_GLOB_PATTERN}")

In [46]:
# Run the full pipeline using the module-level configuration above.
main()

Pass 1 selection complete.
Selected per category (incl. class/property):
  - marvel: 576
  - org: 556
Total selected: 1132

Pass 2: Building initial graph for coverage checks...
Initial missing predicates: 126 | classes: 167
Added 50 files | missing preds: 99 | missing classes: 125
Added 100 files | missing preds: 82 | missing classes: 81
Added 150 files | missing preds: 39 | missing classes: 40
Added 200 files | missing preds: 4 | missing classes: 5
Final added files: 207
Remaining missing preds: 1 | classes: 1

Merging 1339 files → big_one_SamCat2_Covered.ttl
[1/1339] dbkwik-webdatacommons-org-marvel-wikia-com-class-1st-real-name.xml
[200/1339] dbkwik-webdatacommons-org-marvel-wikia-com-property-box1image.xml
[400/1339] dbkwik-webdatacommons-org-marvel-wikia-com-resource-category-uncanny-x-men-first-class-vol-1-7-images.xml
[600/1339] marvel-wikia-com-wiki-special-filepath-5cnx-men-first-class-vol-1-4-textless-jpg.xml
[800/1339] marvel-wikia-com-wiki-special-filepath-janos-quested-ea