In [10]:
# Cell A — robust RDF/XML repair with per-IRI callback + scanner
import re, json, sys
from pathlib import Path
from typing import Dict, Any

# Find IRIs that start with http(s):// and continue until a common delimiter.
IRI_RE = re.compile(r'(?i)\bhttps?://[^\s<>"\']+')

def _fix_one_iri(iri: str) -> str:
    """
    Fix common issues inside a single http(s) IRI:
    - http:/// -> http://   (and https)
    - host:PATH  -> host/PATH   (but keep host:PORT when : is followed by a digit)
    - backslashes -> forward slashes
    - spaces -> %20
    """
    orig = iri

    # 1) Collapse http:///... or https:///... to http://... / https://...
    iri = re.sub(r'(?i)\bhttp:/{3,}', 'http://', iri)
    iri = re.sub(r'(?i)\bhttps:/{3,}', 'https://', iri)

    # 2) Replace backslashes with forward slashes inside IRIs
    #    (backslash is illegal in URIs; many crawlers dump them)
    iri = iri.replace('\\', '/')

    # 3) If there's a colon immediately after the host that is NOT a port, turn it into a slash
    #    Split scheme://host[:...]
    m = re.match(r'(?i)^(https?://)([^/:?#]+)(:)(.*)$', iri)
    if m:
        scheme, host, colon, rest = m.groups()
        # If the character after colon starts with a DIGIT -> it's a port -> keep it.
        if rest and not rest[0].isdigit():
            iri = scheme + host + '/' + rest  # turn host:PATH into host/PATH

    # 4) Replace raw spaces inside IRI with %20
    #    (we only work on the matched IRI, so we won't touch XML around it)
    iri = iri.replace(' ', '%20')

    return iri

def repair_rdfxml(in_path: Path, out_path: Path, capture_samples: int = 10) -> Dict[str, Any]:
    suspicious_lines = []
    total_lines = 0
    fixes = {
        "fixed_http_triple_slash": 0,
        "fixed_https_triple_slash": 0,
        "fixed_host_bad_colon": 0,
        "fixed_backslashes": 0,
        "fixed_spaces": 0
    }

    # Quick counters by comparing before/after at IRI level
    def count_diffs(before: str, after: str):
        if before != after:
            if re.search(r'(?i)\bhttp:/{3,}', before): fixes["fixed_http_triple_slash"] += 1
            if re.search(r'(?i)\bhttps:/{3,}', before): fixes["fixed_https_triple_slash"] += 1
            if re.search(r'(?i)^https?://[^/:?#]+:(?!\d)', before): fixes["fixed_host_bad_colon"] += 1
            if '\\' in before: fixes["fixed_backslashes"] += 1
            if ' ' in before:  fixes["fixed_spaces"] += 1

    with in_path.open("r", encoding="utf-8", errors="replace") as fin, \
         out_path.open("w", encoding="utf-8", errors="replace") as fout:

        for line in fin:
            total_lines += 1

            # Collect suspicious lines (before fix) for preview
            if len(suspicious_lines) < capture_samples and (
                "http:///" in line or "https:///" in line or ":\\" in line or "://" in line and ":s" in line  # crude heuristics
            ):
                suspicious_lines.append(line.rstrip()[:300])

            # Replace each IRI via callback
            def _cb(m):
                before = m.group(0)
                after  = _fix_one_iri(before)
                count_diffs(before, after)
                return after

            newline = IRI_RE.sub(_cb, line)
            fout.write(newline)

    return {
        "total_lines": total_lines,
        "fix_counts": fixes,
        "suspicious_samples": suspicious_lines,
        "output": str(out_path)
    }

def scan_bad_host_colon(path: Path, limit: int = 20):
    """
    Scan file for patterns like http://host:PATH (non-port).
    """
    pat = re.compile(r'(?i)\bhttps?://[^/:?#\s<>"\']+:(?!\d)')
    hits = []
    with path.open("r", encoding="utf-8", errors="replace") as f:
        for i, line in enumerate(f, 1):
            if pat.search(line):
                if len(hits) < limit:
                    hits.append((i, line.strip()[:300]))
    print(f"[SCAN] Found {len(hits)} 'host:PATH' candidates (showing up to {limit}):")
    for ln, snippet in hits:
        print(f"  line {ln}: {snippet}")

In [11]:
# Cell B
fixed_path = INPUT_FILE.with_suffix(INPUT_FILE.suffix + ".fixed.rdf")
stats = repair_rdfxml(INPUT_FILE, fixed_path)
print("[INFO] Repair summary:")
print(json.dumps(stats, indent=2))

print("\n[SCAN] After-fix scan on the repaired file:")
scan_bad_host_colon(fixed_path)

upload_file = fixed_path  # ensure we upload the new repaired file

[INFO] Repair summary:
{
  "total_lines": 9009660,
  "fix_counts": {
    "fixed_http_triple_slash": 6,
    "fixed_https_triple_slash": 0,
    "fixed_host_bad_colon": 1,
    "fixed_backslashes": 0,
    "fixed_spaces": 0
  },
  "suspicious_samples": [
    "    <ns8:subject rdf:resource=\"http://dbkwik.webdatacommons.org/marvel.wikia.com/resource/Category:Invulnerability\"/>",
    "    <ns8:subject rdf:resource=\"http://dbkwik.webdatacommons.org/marvel.wikia.com/resource/Category:Acoustikinesis\"/>",
    "    <ns8:subject rdf:resource=\"http://dbkwik.webdatacommons.org/marvel.wikia.com/resource/Category:Vibranium_Mutates\"/>",
    "    <ns8:subject rdf:resource=\"http://dbkwik.webdatacommons.org/marvel.wikia.com/resource/Category:Superhuman_Strength\"/>",
    "    <ns8:subject rdf:resource=\"http://dbkwik.webdatacommons.org/marvel.wikia.com/resource/Category:Midtown_High_School_Student\"/>",
    "    <ns8:subject rdf:resource=\"http://dbkwik.webdatacommons.org/marvel.wikia.com/resource/Ca

In [None]:
# Cell 2
FUSEKI_BASE = "localhost:3030"   # your Fuseki base
DATASET     = "temp_chris_thesis2"                  # your dataset
INPUT_FILE  = Path("datasets/big_one.xml")                      # your RDF/XML file

# Optional: named graph target (set to None for default graph)
NAMED_GRAPH = None

# Choose upload mode: "POST" (append/merge) or "PUT" (replace)
METHOD = "POST"

# Whether to repair http:/// and https:/// first
DO_REPAIR = True

# Whether to run a SPARQL COUNT(*) after upload
DO_VERIFY = True

In [13]:
# Cell B
fixed_path = INPUT_FILE.with_suffix(INPUT_FILE.suffix + ".fixed.rdf")
stats = repair_rdfxml(INPUT_FILE, fixed_path)
print("[INFO] Repair summary:")
print(json.dumps(stats, indent=2))

print("\n[SCAN] After-fix scan on the repaired file:")
scan_bad_host_colon(fixed_path)

upload_file = fixed_path  # ensure we upload the new repaired file

[INFO] Repair summary:
{
  "total_lines": 9009660,
  "fix_counts": {
    "fixed_http_triple_slash": 6,
    "fixed_https_triple_slash": 0,
    "fixed_host_bad_colon": 1,
    "fixed_backslashes": 0,
    "fixed_spaces": 0
  },
  "suspicious_samples": [
    "    <ns8:subject rdf:resource=\"http://dbkwik.webdatacommons.org/marvel.wikia.com/resource/Category:Invulnerability\"/>",
    "    <ns8:subject rdf:resource=\"http://dbkwik.webdatacommons.org/marvel.wikia.com/resource/Category:Acoustikinesis\"/>",
    "    <ns8:subject rdf:resource=\"http://dbkwik.webdatacommons.org/marvel.wikia.com/resource/Category:Vibranium_Mutates\"/>",
    "    <ns8:subject rdf:resource=\"http://dbkwik.webdatacommons.org/marvel.wikia.com/resource/Category:Superhuman_Strength\"/>",
    "    <ns8:subject rdf:resource=\"http://dbkwik.webdatacommons.org/marvel.wikia.com/resource/Category:Midtown_High_School_Student\"/>",
    "    <ns8:subject rdf:resource=\"http://dbkwik.webdatacommons.org/marvel.wikia.com/resource/Ca

In [14]:
# Cell C
gsp = graph_store_url(FUSEKI_BASE, DATASET, NAMED_GRAPH)
print(f"[INFO] Uploading via {METHOD} to {gsp} ...")

resp = upload_rdfxml(
    gsp,
    upload_file,
    method=METHOD
)

ok = resp.status_code in (200, 201, 204)
print(f"[RESULT] HTTP {resp.status_code}")
if not ok:
    print(resp.text[:2000])
    raise RuntimeError("Upload failed.")
else:
    print("✅ Upload successful.")

[INFO] Uploading via POST to http://arsenal.cs.wright.edu:3030/temp_chris_thesis2/data ...
[RESULT] HTTP 200
✅ Upload successful.


In [15]:
# Cell D
if DO_VERIFY:
    sp = sparql_url(FUSEKI_BASE, DATASET)
    count = verify_count(sp, NAMED_GRAPH, USERNAME, PASSWORD)
    if count is not None:
        target = NAMED_GRAPH or "default graph"
        print(f"[VERIFY] Triple count in {target}: {count}")
    else:
        print("[VERIFY] Skipped or failed.")
else:
    print("[VERIFY] Disabled.")

[VERIFY] Triple count in default graph: 6733366
