In [None]:
import pandas as pd

In [None]:
# Workbook holding the SCF pivot; the path is relative to the notebook's working directory.
pivot_file = "scf_2025_1.1.xlsx"

# Load the single worksheet that the rest of the notebook works from.
df = pd.read_excel(io=pivot_file, sheet_name="SCF 2025.1.1")

# two types of nodes: requirement (name, id, description, urn) and framework (urn, name)
# two types of relationships: partOf linking a requirement to its parent framework, and intersect linking some requirements to an SCF requirement

# we create a node for framework SCF
# we go over column C and create a node for each item, and put the node name as the cell content, node framework as SCF and content of column B as description
# we create a link from each of the previous nodes to the SCF framework node
# then for each column from AB to JR
# we create a framework node for each column, framework name is column name and urn the slugified name
# per column, for each cell, we split the multiline content and create a requirement node for each one, the name is the id that we're getting, the framework is the column name
# we create a link for each requirement node to the framework
# finally, whenever a framework cell is not empty and we have created nodes for the ids resulting from the multiline split, we also create a link from each of those nodes to the SCF requirement node whose reference is on the same row in column C

In [15]:
# ─────────────── Cell 1: CSV generation ───────────────

import pandas as pd
from slugify import slugify
from openpyxl.utils import column_index_from_string

# Read the pivot sheet and locate the columns of interest (0-based positions
# derived from the Excel column letters).
df = pd.read_excel(pivot_file, sheet_name="SCF 2025.1.1")
# Column B holds the description text. It is intentionally not exported: the
# Requirements node table loaded from requirements.csv only declares
# urn / id / name / framework, so an extra CSV column would not match it.
desc_idx = column_index_from_string("B") - 1
main_idx = column_index_from_string("C") - 1
start, end = (column_index_from_string("AB") - 1, column_index_from_string("JR") - 1)
framework_cols = df.columns[start : end + 1]

frameworks_rows = []
requirements_rows = []
partof_rows = []
intersect_rows = []

# SCF framework node
frameworks_rows.append({"urn": "scf", "name": "SCF"})

# One framework node per mapped column. The name/urn of a column does not
# depend on the row, so compute them once here instead of once per row.
framework_specs = []
for col in framework_cols:
    # str() guards against non-string column labels (e.g. a numeric header).
    fw_name = str(col).strip().replace("\n", "-")
    fw_urn = slugify(fw_name)
    frameworks_rows.append({"urn": fw_urn, "name": fw_name})
    framework_specs.append((col, fw_name, fw_urn))

for _, row in df.iterrows():
    # main SCF requirement (column C); rows without a reference are skipped
    main_val = row.iloc[main_idx]
    if pd.isna(main_val):
        continue
    main_raw = str(main_val).strip().replace("\n", "-")
    if not main_raw:
        # whitespace-only cell: no usable identifier, same rule as for sub-ids below
        continue
    main_urn = f"scf-{slugify(main_raw)}"
    requirements_rows.append(
        {"urn": main_urn, "id": main_raw, "name": main_raw, "framework": "SCF"}
    )
    partof_rows.append({"from": main_urn, "to": "scf"})

    # for each non-SCF framework column
    for col, fw_name, fw_urn in framework_specs:
        cell = row[col]
        if pd.isna(cell):
            continue

        # a cell may list several requirement ids, one per line
        for part in str(cell).split("\n"):
            sub = part.strip()
            if not sub:
                continue
            sub_urn = f"{fw_urn}-{slugify(sub)}"
            requirements_rows.append(
                {"urn": sub_urn, "id": sub, "name": sub, "framework": fw_name}
            )
            partof_rows.append({"from": sub_urn, "to": fw_urn})
            # intersect: sub → main SCF requirement
            intersect_rows.append({"from": sub_urn, "to": main_urn})

# Dump de-duplicated CSVs.
# Explicit `columns` keeps the header order fixed and lets an empty row list
# produce a header-only file instead of failing in drop_duplicates.
csv_outputs = [
    ("frameworks.csv", frameworks_rows, ["urn", "name"], ["urn"]),
    ("requirements.csv", requirements_rows, ["urn", "id", "name", "framework"], ["urn"]),
    ("partof.csv", partof_rows, ["from", "to"], ["from", "to"]),
    ("intersect.csv", intersect_rows, ["from", "to"], ["from", "to"]),
]
for csv_name, rows, columns, dedup_key in csv_outputs:
    pd.DataFrame(rows, columns=columns).drop_duplicates(subset=dedup_key).to_csv(
        csv_name, index=False, sep=";"
    )

In [17]:
# ─────────────── Cell 2: Kùzu schema + batch load ───────────────

import kuzu

# Open (or create) the on-disk Kùzu database and a connection to it.
db = kuzu.Database("./scf_db")
conn = kuzu.Connection(db)

# Start from a clean slate. Relationship tables are listed before the node
# tables they connect (presumably so no node table is dropped while still
# referenced — verify against the Kùzu DROP TABLE rules).
stale_tables = ("Intersect", "PartOf", "Requirements", "Frameworks")
for table_name in stale_tables:
    conn.execute(f"DROP TABLE IF EXISTS {table_name};")

# Schema, executed in order: node tables (plural names) first, then the
# relationship tables (singular names) that reference them.
schema_statements = [
    """
CREATE NODE TABLE Frameworks(
  urn STRING,
  name STRING,
  PRIMARY KEY (urn)
);
""",
    """
CREATE NODE TABLE Requirements(
  urn STRING,
  id STRING,
  name STRING,
  framework STRING,
  PRIMARY KEY (urn)
);
""",
    """
CREATE REL TABLE PartOf(
  FROM Requirements TO Frameworks
);
""",
    """
CREATE REL TABLE Intersect(
  FROM Requirements TO Requirements
);
""",
]
for ddl in schema_statements:
    conn.execute(ddl)

# Bulk-load each table from its ';'-delimited CSV with parallel parsing
# disabled; node tables are loaded before the relationships between them.
csv_by_table = {
    "Frameworks": "frameworks.csv",
    "Requirements": "requirements.csv",
    "PartOf": "partof.csv",
    "Intersect": "intersect.csv",
}
for table_name, csv_path in csv_by_table.items():
    conn.execute(
        f"COPY {table_name} FROM \"{csv_path}\" (HEADER=true, PARALLEL=false, DELIM=';');"
    )