In [4]:
from pathlib import Path
import sys

def find_repo_root(start: Path) -> Path:
    """Return the nearest directory at or above ``start`` that contains ``.git``.

    Args:
        start: Directory to begin the upward search from. It is resolved to
            an absolute path first, so a relative path such as ``Path(".")``
            is searched through its real ancestor chain.

    Returns:
        The first directory (``start`` itself, then each parent going up)
        that has a ``.git`` entry, as an absolute path.

    Raises:
        FileNotFoundError: If no directory in the chain contains ``.git``.
    """
    # Path(".").parents is empty, so an unresolved relative start would only
    # ever test the start directory itself; resolve to get the full chain.
    origin = Path(start).resolve()
    for candidate in (origin, *origin.parents):
        if (candidate / ".git").exists():
            return candidate
    raise FileNotFoundError("Repo root not found")

# Capture the working directory once and locate the repo root from it.
CWD = Path.cwd()
REPO = find_repo_root(CWD)

# Report where the notebook is running and which interpreter it uses.
print("CWD:", CWD)
print("REPO:", REPO)
print("Python:", sys.executable)


CWD: /home/jaz3n/Repository/UMBC-DATA606-Capstone/notebooks
REPO: /home/jaz3n/Repository/UMBC-DATA606-Capstone
Python: /home/jaz3n/miniconda3/envs/rmf-assistant/bin/python


In [5]:
# Guard: the detected root must contain a data/ entry, otherwise detection went wrong.
assert REPO.joinpath("data").exists(), f"REPO detection failed: {REPO}"


In [6]:
import pandas as pd

# Load the parsed controls table and spot-check a few control IDs.
controls_path = REPO / "data/oscal_parsed/controls_80053.parquet"
print("Controls parquet exists:", controls_path.exists())
print("Path:", controls_path)

df = pd.read_parquet(controls_path)
print("controls_80053 shape:", df.shape)
print("Columns:", df.columns.tolist())

print("\nSample rows (id/family/title):")
print(df[["control_id","family","title"]].head(10).to_string(index=False))

for cid in ["AC-2","AU-2","IR-4"]:
    row = df[df["control_id"] == cid]
    print(f"\n== {cid} ==")
    print("Found:", not row.empty)
    if row.empty:
        continue
    print("Title:", row.iloc[0]["title"])
    statement = row.iloc[0]["statement"]
    # A missing statement may be None, NaN or pd.NA depending on the column
    # dtype. `value or ""` only handles None (NaN is truthy, pd.NA raises on
    # truth-testing), so fall back to "" for anything that is not a string.
    s = statement if isinstance(statement, str) else ""
    preview = s[:300]
    # Only mark the preview as truncated when text was actually cut off.
    if len(s) > 300:
        preview += " ..."
    print("Statement (first 300 chars):", preview)


Controls parquet exists: True
Path: /home/jaz3n/Repository/UMBC-DATA606-Capstone/data/oscal_parsed/controls_80053.parquet
controls_80053 shape: (324, 10)
Columns: ['control_id', 'title', 'family', 'statement', 'guidance', 'enhancements', 'parameters', 'source', 'source_file', 'oscal_path']

Sample rows (id/family/title):
control_id family                        title
      AC-1     AC        Policy and Procedures
      AC-2     AC           Account Management
      AC-3     AC           Access Enforcement
      AC-4     AC Information Flow Enforcement
      AC-5     AC         Separation of Duties
      AC-6     AC              Least Privilege
      AC-7     AC  Unsuccessful Logon Attempts
      AC-8     AC      System Use Notification
      AC-9     AC  Previous Logon Notification
     AC-10     AC   Concurrent Session Control

== AC-2 ==
Found: True
Title: Account Management
Statement (first 300 chars): [statement]
  [item] Define and document the types of accounts allowed and specif

In [7]:
# Collect the v2 synthetic policy markdown files (names starting with "0"), sorted.
docs_dir = REPO / "data/policies_synth_md_v2"
docs = list(docs_dir.glob("0*.md"))
docs.sort()

print("Synthetic v2 dir exists:", docs_dir.exists())
print("Path:", docs_dir)
print("Docs:", [doc.name for doc in docs])

assert len(docs) >= 4, "Expected at least 4 synthetic policy docs in v2"

print("\nQuick summaries:")
for doc in docs:
    # Decode leniently: bytes that are not valid UTF-8 are dropped.
    text = doc.read_text(encoding="utf-8", errors="ignore")
    lines = text.splitlines()
    # An empty file has no lines; report an empty first line in that case.
    heading = lines[0] if lines else ""
    print(f"- {doc.name}: chars={len(text)} | first_line={heading}")


Synthetic v2 dir exists: True
Path: /home/jaz3n/Repository/UMBC-DATA606-Capstone/data/policies_synth_md_v2
Docs: ['01_mini_ssp.md', '02_access_control_policy.md', '03_incident_response_plan.md', '04_logging_monitoring_standard.md']

Quick summaries:
- 01_mini_ssp.md: chars=1656 | first_line=# Mini-SSP / System Overview — Rivermark Operations Portal (ROP) (Fictional)
- 02_access_control_policy.md: chars=1523 | first_line=# Access Control Policy — Rivermark Operations Portal (ROP) (Fictional)
- 03_incident_response_plan.md: chars=1457 | first_line=# Incident Response Plan — Rivermark Operations Portal (ROP) (Fictional)
- 04_logging_monitoring_standard.md: chars=1285 | first_line=# Logging & Monitoring Standard — Rivermark Operations Portal (ROP) (Fictional)


In [8]:
# Check that the Docling output file is present and show its first lines.
docling_md = REPO / "data/policies_synth_md/_docling_test.md"
print("Docling MD exists:", docling_md.exists())
print("Path:", docling_md)

assert docling_md.exists(), "Docling output missing"

print("\nDocling preview:")
# Decode leniently (undecodable bytes are dropped) and print only the first 15 lines.
print(
    *docling_md.read_text(encoding="utf-8", errors="ignore").splitlines()[:15],
    sep="\n",
)


Docling MD exists: True
Path: /home/jaz3n/Repository/UMBC-DATA606-Capstone/data/policies_synth_md/_docling_test.md

Docling preview:
# Access Control Policy - Rivermark Operations Portal (ROP) (Fictional)

> **SYNTHETIC DEMO ARTIFACT - ACADEMIC USE ONLY**

**Document ID:** 02\_ACCESS\_CONTROL\_POLICY

- **Version:** 0.1
- **Effective Date:** 2026-02-11
- **Owner:** Identity and Access Management (IAM) Lead (Fictional Role)
- **Applies To:** Rivermark Operations Portal (ROP) (Fictional)

---

## 1.0 Purpose



In [9]:
# Locate the truth-table files, report their presence and size, then load the CSV.
truth_dir = REPO / "data/truth_table"
truth_csv = truth_dir / "controls_truth.csv"
truth_schema = truth_dir / "controls_truth_schema.csv"

print("Truth dir:", truth_dir)
for label, path in (
    ("controls_truth.csv exists:", truth_csv),
    ("controls_truth_schema.csv exists:", truth_schema),
):
    present = path.exists()
    # Size is only available for files that exist; report None otherwise.
    print(label, present, "| bytes:", path.stat().st_size if present else None)

truth = pd.read_csv(truth_csv)
print("Truth table shape:", truth.shape)
print(truth.head(12).to_string(index=False))


Truth dir: /home/jaz3n/Repository/UMBC-DATA606-Capstone/data/truth_table
controls_truth.csv exists: True | bytes: 1716
controls_truth_schema.csv exists: True | bytes: 71
Truth table shape: (12, 5)
control_id expected_coverage                      evidence_doc                         evidence_location                                                                                gap_notes
      AC-2           partial       02_access_control_policy.md     4.0 Policy Statements; 5.0 Procedures     Missing explicit recertification frequency and documented periodic review procedure.
      AC-3           covered       02_access_control_policy.md                     4.0 Policy Statements Least privilege and role-based access described; may need more enforcement detail later.
      AC-6           covered       02_access_control_policy.md                     4.0 Policy Statements          Least privilege explicitly stated; separation for privileged accounts included.
      AU-2           covere

In [10]:
# Recap of the artifacts loaded by the earlier cells.
print("\n=== Week 1 Smoke Test Summary ===")
print("OSCAL controls:", len(df), "rows")
print("Synthetic docs (v2):", len(docs))
if docling_md.exists():
    print("Docling md:", "OK")
else:
    print("Docling md:", "MISSING")
print("Truth rows:", len(truth))



=== Week 1 Smoke Test Summary ===
OSCAL controls: 324 rows
Synthetic docs (v2): 4
Docling md: OK
Truth rows: 12
