In [1]:
from pathlib import Path
import sys

def find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above ``start`` that contains ``.git``.

    ``start`` is resolved to an absolute path before searching. A relative
    path such as ``Path(".")`` has no ``parents`` to walk, so without this
    step the search would never look above the starting directory.

    Args:
        start: Path from which to begin the upward search.

    Returns:
        The absolute path of the first candidate, checked from ``start``
        upward, that has a ``.git`` entry.

    Raises:
        FileNotFoundError: If neither ``start`` nor any ancestor has ``.git``.
    """
    start = start.resolve()
    # ``exists()`` is true for a ``.git`` directory and for a ``.git`` file alike.
    for candidate in (start, *start.parents):
        if (candidate / ".git").exists():
            return candidate
    raise FileNotFoundError("Could not find repo root (no .git found).")

# Resolve the working directory and the repository root that contains it.
CWD = Path.cwd()
REPO = find_repo_root(CWD)

# Report the resolved locations and the interpreter running this kernel.
for label, value in (("CWD:", CWD), ("REPO:", REPO), ("Python:", sys.executable)):
    print(label, value)


CWD: /home/jaz3n/Repository/UMBC-DATA606-Capstone/notebooks
REPO: /home/jaz3n/Repository/UMBC-DATA606-Capstone
Python: /home/jaz3n/miniconda3/envs/rmf-assistant/bin/python


In [2]:
import subprocess

# Run the synthetic-policy generator as a child process using the kernel's interpreter.
# NOTE(review): the recorded output shows files written under data/policies_synth_md,
# while the cells below read data/policies_synth_md_v2 — confirm the v2 directory is
# maintained separately and is not expected to be produced by this step.
gen_script = REPO / "app/services/generate_synth_policies.py"
assert gen_script.exists(), f"Missing generator script: {gen_script}"

result = subprocess.run(
    [sys.executable, str(gen_script)],
    cwd=str(REPO),  # the script is launched with the repo root as its working directory
    capture_output=True,
    text=True,
)

print(result.stdout)
if result.returncode != 0:
    # Send the child's stderr to this process's stderr (not stdout) and carry the
    # exit code in the exception so the traceback alone identifies the failure.
    print(result.stderr, file=sys.stderr)
    raise RuntimeError(
        f"Synthetic corpus generation failed (exit code {result.returncode})"
    )


Wrote: /home/jaz3n/Repository/UMBC-DATA606-Capstone/data/policies_synth_md/01_mini_ssp.md
Wrote: /home/jaz3n/Repository/UMBC-DATA606-Capstone/data/policies_synth_md/02_access_control_policy.md
Wrote: /home/jaz3n/Repository/UMBC-DATA606-Capstone/data/policies_synth_md/03_incident_response_plan.md
Wrote: /home/jaz3n/Repository/UMBC-DATA606-Capstone/data/policies_synth_md/04_logging_monitoring_standard.md



In [3]:
from pathlib import Path

# Collect the v2 policy documents (files matching "0*.md") in sorted order.
v2 = REPO / "data/policies_synth_md_v2"
docs = list(v2.glob("0*.md"))
docs.sort()

doc_names = [path.name for path in docs]
print("v2 dir:", v2)
print("docs:", doc_names)
assert 4 <= len(docs), "Expected at least 4 v2 docs"


v2 dir: /home/jaz3n/Repository/UMBC-DATA606-Capstone/data/policies_synth_md_v2
docs: ['01_mini_ssp.md', '02_access_control_policy.md', '03_incident_response_plan.md', '04_logging_monitoring_standard.md']


In [4]:
import csv

# Locations of the truth-table artifacts; the directory is created if absent.
truth_dir = REPO / "data/truth_table"
truth_dir.mkdir(parents=True, exist_ok=True)

schema_path = truth_dir / "controls_truth_schema.csv"
truth_path = truth_dir / "controls_truth.csv"

# Column order shared by the schema file and the truth table.
FIELDS = [
    "control_id",
    "expected_coverage",
    "evidence_doc",
    "evidence_location",
    "gap_notes",
]

OVERWRITE = False  # set True if you intentionally want to regenerate

if truth_path.exists() and not OVERWRITE:
    # An existing truth table is left untouched unless OVERWRITE is set.
    print("Truth table already exists; not overwriting:", truth_path)
else:
    # starter truth table tied to v2 docs
    starter_rows = [
        ["AC-2","partial","02_access_control_policy.md","4.0 Policy Statements; 5.0 Procedures","Missing explicit recertification frequency and periodic review procedure."],
        ["AC-3","covered","02_access_control_policy.md","4.0 Policy Statements","Role-based access described; may need more enforcement detail later."],
        ["AC-6","covered","02_access_control_policy.md","4.0 Policy Statements","Least privilege explicitly stated; privileged separation included."],
        ["AU-2","covered","04_logging_monitoring_standard.md","4.0 Policy Statements","Defines log event types (auth, failures, privileged actions, config changes)."],
        ["AU-6","partial","04_logging_monitoring_standard.md","5.0 Procedures","Does not specify consistent review cadence for all event types; lacks metrics/KPIs."],
        ["AU-11","covered","04_logging_monitoring_standard.md","4.0 Policy Statements","Retention specified (90 days online, 1 year archived)."],
        ["IR-4","partial","03_incident_response_plan.md","4.0 Policy Statements; 5.0 Procedures","No escalation time objectives by severity; no exercise schedule."],
        ["IR-6","partial","03_incident_response_plan.md","4.0 Policy Statements","Stakeholder notification described but lacks explicit reporting timelines and criteria."],
        ["PL-2","partial","01_mini_ssp.md","4.0 Policy Statements; 7.0 Review Cadence","No defined continuous monitoring cadence; boundary change approvals not formalized."],
    ]

    # The schema file gets the header only; the truth table gets header + starter rows.
    for out_path, body_rows in ((schema_path, []), (truth_path, starter_rows)):
        with out_path.open("w", newline="", encoding="utf-8") as handle:
            writer = csv.writer(handle)
            writer.writerow(FIELDS)
            writer.writerows(body_rows)

    print("Wrote:", schema_path)
    print("Wrote:", truth_path)


Truth table already exists; not overwriting: /home/jaz3n/Repository/UMBC-DATA606-Capstone/data/truth_table/controls_truth.csv


In [5]:
import pandas as pd

# Load the truth table and check that every evidence document it names exists in the v2 directory.
truth = pd.read_csv(truth_path)
v2 = REPO / "data/policies_synth_md_v2"

# Distinct, non-null document names in sorted order.
referenced_docs = sorted(set(truth["evidence_doc"].dropna()))
missing = [name for name in referenced_docs if not (v2 / name).exists()]

print("Missing evidence docs referenced by truth table:", missing)
assert not missing, f"Truth table references missing v2 docs: {missing}"

# Last expression of the cell: display the first ten rows.
truth.head(10)


Missing evidence docs referenced by truth table: []


Unnamed: 0,control_id,expected_coverage,evidence_doc,evidence_location,gap_notes
0,AC-2,partial,02_access_control_policy.md,4.0 Policy Statements; 5.0 Procedures,Missing explicit recertification frequency and...
1,AC-3,covered,02_access_control_policy.md,4.0 Policy Statements,Least privilege and role-based access describe...
2,AC-6,covered,02_access_control_policy.md,4.0 Policy Statements,Least privilege explicitly stated; separation ...
3,AU-2,covered,04_logging_monitoring_standard.md,4.0 Policy Statements,"Defines log event types (auth, failures, privi..."
4,AU-6,partial,04_logging_monitoring_standard.md,5.0 Procedures,Does not specify consistent review cadence for...
5,AU-11,covered,04_logging_monitoring_standard.md,4.0 Policy Statements,"Retention specified (90 days online, 1 year ar..."
6,IR-4,partial,03_incident_response_plan.md,4.0 Policy Statements; 5.0 Procedures,No escalation time objectives by severity; no ...
7,IR-6,partial,03_incident_response_plan.md,4.0 Policy Statements,Stakeholder notification described but lacks e...
8,PL-2,partial,01_mini_ssp.md,4.0 Policy Statements; 7.0 Review Cadence,No defined continuous monitoring cadence; boun...
9,CM-2,missing,01_mini_ssp.md,,Baseline configuration management not describe...
