## Sanity Check 1


In [6]:
import sys
from pathlib import Path

# Project root: the parent of the current working directory, made absolute.
PROJECT_ROOT = Path("..").resolve()

# Put the root at the front of the import search path, but only when it is
# not already listed, so re-running this cell adds no duplicate entries.
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

# Imported after the sys.path adjustment above; presumably the `graph`
# package lives under PROJECT_ROOT (confirm against the repo layout).
from graph.workflow import build_workflow

# Workflow object that the cells below drive through `app.invoke(...)`.
app = build_workflow()

In [None]:
# Initial state with no "intent" key supplied; the recorded run below shows
# the workflow reporting 'intent:TROUBLESHOOTING' for this question.
state = dict(
    question="spark executor oom error",
    documents=[],
    retry_generation_count=0,
    kb_retry_count=0,
    kb_enriched=True,
    steps=[],
)

# Run the workflow once and show the trace of steps it recorded.
result = app.invoke(state)
print(result["steps"])


['intent:TROUBLESHOOTING', 'gt_retrieve', 'gt_relevance', 'gt_generate', 'gt_validate', 'gt_store_tribal']


## Sanity Check 2

In [2]:
# Same question as the first check, but with the intent pinned to
# DATA_ENGINEERING and the generation/grading fields pre-filled.
state = dict(
    question="spark executor oom error",
    intent="DATA_ENGINEERING",
    documents=[],
    generation="",
    is_grounded="no",
    is_relevant="yes",
    retry_generation_count=0,
    kb_retry_count=0,
    kb_enriched=True,
    steps=[],
)

# Run the workflow once and show the trace of steps it recorded.
result = app.invoke(state)
print(result["steps"])


['intent:DATA_ENGINEERING', 'de_context_router', 'de_retrieve', 'de_relevance', 'de_generate', 'de_hallucination']


## Sanity Check 3 after adding GT_Validation

In [2]:
# Troubleshooting request that starts with one placeholder document already
# present in the state.
state = dict(
    question="No space left on device",
    intent="TROUBLESHOOTING",
    documents=["dummy doc"],  # simulate KB hit
    steps=[],
)

# Run the workflow once and show the trace of steps it recorded.
result = app.invoke(state)
print(result["steps"])


['intent:TROUBLESHOOTING', 'gt_retrieve', 'gt_relevance', 'gt_generate:fallback=False']


In [3]:
# Troubleshooting request with an empty document list and a vague,
# pre-filled generation.
state = dict(
    question="No space left on device",
    intent="TROUBLESHOOTING",
    generation="This might be a disk issue, probably check something",
    documents=[],
    steps=[],
)

# Run the workflow once and show the trace of steps it recorded.
result = app.invoke(state)
print(result["steps"])

['intent:TROUBLESHOOTING', 'gt_retrieve', 'gt_relevance', 'gt_generate:fallback=True']


## Sanity Check 4 after SOP Implementation

In [2]:
# SOP question with the intent pinned to SOP_QUERY and no documents supplied.
state = dict(
    question="How do I onboard a new data source?",
    intent="SOP_QUERY",
    documents=[],
    steps=[],
)

# Run the workflow once and show the trace of steps it recorded.
result = app.invoke(state)
print(result["steps"])

['intent:SOP_QUERY', 'sop_retrieve:no_vectorstore', 'sop_relevance:no_docs', 'sop_generate:no_docs']


### Seeding a Dummy SOP from a PDF

In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

from resources.vectorstore import get_vectorstore


# Default PDF used for seeding.
# NOTE(review): machine-specific absolute path; on another machine call
# seed_sop_from_pdf(pdf_path=...) or reassign PDF_PATH before seeding.
PDF_PATH = r"C:\Users\ddev\Downloads\Updated Names SOP.pdf"


def seed_sop_from_pdf(pdf_path=None):
    """Load a PDF, split it into chunks and add them to the SOP vector store.

    Args:
        pdf_path: Location of the PDF to load. When omitted (``None``), the
            module-level ``PDF_PATH`` is used, read at call time.

    Returns:
        None. A one-line summary is printed instead.

    Side effects:
        Adds one document per chunk to the store returned by
        ``get_vectorstore("sop_knowledge")``. Nothing is added, and a warning
        is printed, when the PDF yields no chunks.

    NOTE(review): chunks are added without explicit ids, so re-running this
    presumably appends duplicates; confirm against the vector store backend.
    """
    source_path = PDF_PATH if pdf_path is None else pdf_path

    pages = PyPDFLoader(source_path).load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=150,
    )
    chunks = splitter.split_documents(pages)

    # Re-wrap every chunk so it carries only the SOP metadata below; of the
    # loader's own metadata, just the "page" value is kept.
    sop_docs = [
        Document(
            page_content=chunk.page_content,
            metadata={
                "type": "SOP",
                "source": "pdf",
                # NOTE(review): fixed label, not derived from the loaded
                # file's name; confirm this is intended.
                "filename": "onboarding_sop.pdf",
                "topic": "data_source_onboarding",
                "owner": "data-platform",
                "version": "1.0",
                "page": chunk.metadata.get("page"),
            },
        )
        for chunk in chunks
    ]

    # Guard the empty case: do not send an empty batch to the store and do
    # not report success when nothing was extracted.
    if not sop_docs:
        print(f"⚠️ No chunks extracted from {source_path}; nothing was added to the vector store")
        return

    vectorstore = get_vectorstore("sop_knowledge")
    vectorstore.add_documents(sop_docs)

    print(f"✅ Seeded {len(sop_docs)} SOP chunks from PDF")



seed_sop_from_pdf()


✅ Seeded 32 SOP chunks from PDF


In [11]:
# SOP question asked with the intent pinned to SOP_QUERY; no documents are
# supplied in the initial state.
state = dict(
    question="Your pipeline fails at 3:00 AM. It is classified as Severity 2 (non-critical pipeline failure). What is your REQUIRED response time?",
    intent="SOP_QUERY",
    steps=[],
)

result = app.invoke(state)

# Show the recorded step trace first, then the generated answer.
print(result["steps"])
print("\nANSWER:\n", result["generation"])


['intent:SOP_QUERY', 'sop_retrieve', 'sop_relevance:yes', 'sop_generate']

ANSWER:
 - Response Time: 1 hour
