Split an RDF/XML file into one file per <rdf:Description>, preserving the exact
header (everything before the first <rdf:Description>) in each output.

- Input:  one RDF/XML file (.xml), set via INPUT_FILE
- Output: .xml files in OUT_DIR, each with:
    [original header text]
    <rdf:Description ...> ... </rdf:Description>
    </rdf:RDF>

No assumptions about "13 lines" — the header is detected dynamically.

In [None]:
import os
import re

# --- CONFIG ---
INPUT_FILE = "datasets/small_one.xml"          # path to your big RDF/XML
OUT_DIR    = "datasets/small_one"   # destination folder for per-subject XMLs
BASENAME   = "subject"            # fallback base name if rdf:about not present
# -------------

# Start of an opening <rdf:Description ...> tag. The trailing \b stops a longer
# name such as <rdf:Descriptions from matching.
# NOTE: matching is case-insensitive even though XML element names are
# case-sensitive.
DESC_START_RE = re.compile(r"<rdf:Description\b", re.IGNORECASE)
# Closing </rdf:Description> tag, tolerating whitespace before the ">".
DESC_END_RE   = re.compile(r"</rdf:Description\s*>", re.IGNORECASE)
# NOTE: only double-quoted values are recognized; rdf:about='...' is not matched.
ABOUT_ATTR_RE = re.compile(r'rdf:about\s*=\s*"(.*?)"')  # capture subject URI

In [6]:
def slugify(value: str, maxlen: int = 160) -> str:
    """Turn a subject URI into a filesystem-safe, lowercase file stem.

    Surrounding whitespace and a leading ``scheme://`` are dropped, and every
    run of characters outside ``a-z0-9`` collapses into a single ``-``.

    Args:
        value: The subject URI (or any other text) to convert.
        maxlen: Maximum length of the returned slug.

    Returns:
        The slug, at most ``maxlen`` characters long and never ending in
        ``-``. If nothing usable remains, ``BASENAME`` is used instead.
    """
    # Trim before removing the scheme: the pattern is anchored at the start,
    # so leading whitespace would otherwise leave "http" inside the slug.
    v = value.strip().lower()
    v = re.sub(r"^[a-z]+://", "", v, flags=re.I)
    v = re.sub(r"[^a-z0-9]+", "-", v).strip("-")
    # Truncation may cut right after a separator; do not leave it dangling.
    return (v or BASENAME)[:maxlen].rstrip("-")

def ensure_dir(path: str):
    """Create directory *path* and any missing parents; an existing directory is fine."""
    os.makedirs(name=path, exist_ok=True)

# Opening tag of the block's root element, up to its closing ">". Quoted
# attribute values are consumed as units so a ">" inside a value does not end
# the tag early.
_OPENING_TAG_RE = re.compile(r"""\s*<(?:[^>"']|"[^"]*"|'[^']*')*>""")


def write_one_output(header_text: str, block_lines: list[str], idx: int):
    """Write one description block as a standalone RDF/XML file in OUT_DIR.

    The file contains ``header_text``, the block itself and a closing
    ``</rdf:RDF>`` line. Its name is the slug of the block's ``rdf:about``
    value, or ``{BASENAME}-{idx:06d}`` when the block has none.

    Args:
        header_text: Text to emit before the block, written verbatim.
        block_lines: Pieces of one description block; their concatenation is
            the block text.
        idx: Number of this block, used only for the fallback file name.

    Returns:
        The path of the file that was written.
    """
    block_text = "".join(block_lines)

    # Look for rdf:about only inside the block's own opening tag. Searching the
    # whole block would also match an rdf:about on a nested element and name
    # the file after the wrong subject. If no complete opening tag is found at
    # the start of the text, fall back to searching everything.
    opening = _OPENING_TAG_RE.match(block_text)
    scope = opening.group(0) if opening else block_text
    m = ABOUT_ATTR_RE.search(scope)

    name = slugify(m.group(1)) if m else f"{BASENAME}-{idx:06d}"
    # NOTE(review): slugs are lowercased, punctuation-collapsed and truncated,
    # so two different subjects can produce the same name; mode "w" then
    # replaces the earlier file without warning.
    out_path = os.path.join(OUT_DIR, f"{name}.xml")

    with open(out_path, "w", encoding="utf-8", newline="") as fout:
        fout.write(header_text)
        fout.writelines(block_lines)
        # Ensure newline before closing tag if the block didn't end with one
        if not (block_lines and block_lines[-1].endswith("\n")):
            fout.write("\n")
        fout.write("</rdf:RDF>\n")
    return out_path

In [7]:
def _split_header(lines):
    """Consume *lines* up to the first ``<rdf:Description`` and split there.

    Args:
        lines: Iterator of text lines; it is advanced only as far as the line
            that contains the first description start.

    Returns:
        ``(header_text, remainder)``: everything before the first description
        start, and the rest of the line on which that start was found.

    Raises:
        RuntimeError: If the input ends without any description start.
    """
    header_parts: list[str] = []
    for line in lines:
        found = DESC_START_RE.search(line)
        if found:
            # The description may begin mid-line; the part before it still
            # belongs to the header.
            header_parts.append(line[:found.start()])
            return "".join(header_parts), line[found.start():]
        header_parts.append(line)
    raise RuntimeError("No <rdf:Description> found in the file.")


def _iter_description_blocks(chunks):
    """Yield each description block found in a stream of text chunks.

    A block runs from a ``DESC_START_RE`` match through the end of the next
    ``DESC_END_RE`` match and is yielded as a list of string pieces. Text
    between blocks is discarded. A block still open when the input ends is
    yielded as-is.

    NOTE: nested <rdf:Description> elements and self-closing
    <rdf:Description ... /> tags are not recognized; only the
    </rdf:Description> end tag closes a block.
    """
    parts: list[str] = []
    in_block = False
    for chunk in chunks:
        pos = 0
        while pos < len(chunk):
            if not in_block:
                start = DESC_START_RE.search(chunk, pos)
                if start is None:
                    break  # only inter-block text is left in this chunk
                in_block = True
                pos = start.start()
            # Look for the end tag in the same chunk too: a description may
            # open and close on a single line.
            end = DESC_END_RE.search(chunk, pos)
            if end is None:
                parts.append(chunk[pos:])
                break
            parts.append(chunk[pos:end.end()])
            yield parts
            parts = []
            in_block = False
            pos = end.end()
    if in_block and parts:
        yield parts


def main():
    """Split INPUT_FILE into one RDF/XML file per description block in OUT_DIR.

    Everything before the first ``<rdf:Description`` is treated as the header
    and repeated at the top of every output file.

    Raises:
        RuntimeError: If the input contains no description, or if the header
            already contains ``</rdf:RDF>``.
    """
    ensure_dir(OUT_DIR)

    outputs = 0
    with open(INPUT_FILE, "r", encoding="utf-8", newline="") as fin:
        # 1) Everything before the FIRST <rdf:Description> is the header.
        header_text, remainder = _split_header(fin)

        # Sanity check on the whole header, including the case where the first
        # description sits on the very first line.
        if "</rdf:RDF>" in header_text:
            raise RuntimeError("Detected </rdf:RDF> in header; input may be malformed.")

        def remaining_text():
            # Rest of the line holding the first description, then the
            # remaining lines of the file.
            yield remainder
            yield from fin

        # 2) Write every block, including one left unterminated at EOF.
        for block in _iter_description_blocks(remaining_text()):
            write_one_output(header_text, block, outputs + 1)
            outputs += 1

    print(f"Done. Wrote {outputs} files to: {OUT_DIR}")

In [8]:
# Run the split using the module-level configuration (INPUT_FILE, OUT_DIR, BASENAME).
main()

Done. Wrote 137784 files to: dataset/small_one
