
### Step 1: PDF Processing

In [None]:
!pip install pdfplumber

In [4]:
import pdfplumber
import json
import re
from pathlib import Path
from typing import Dict, List, Optional
from collections import defaultdict

BASE_DIR = Path.cwd()
RAW_DIR = BASE_DIR / "Data" / "raw"
PROCESSED_DIR = BASE_DIR / "Data" / "processed"
METADATA_DIR = BASE_DIR / "Data" / "metadata"

PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
METADATA_DIR.mkdir(parents=True, exist_ok=True)

print(f"Raw directory: {RAW_DIR}")
print(f"Processed directory: {PROCESSED_DIR}")
print(f"Metadata directory: {METADATA_DIR}")


Raw directory: /Users/ronny/Desktop/ML projects/DisasterOps/Data/raw
Processed directory: /Users/ronny/Desktop/ML projects/DisasterOps/Data/processed
Metadata directory: /Users/ronny/Desktop/ML projects/DisasterOps/Data/metadata


In [5]:
def get_citation_prefix(pdf_path: Path) -> str:
    """Generate citation prefix from file path"""
    category = pdf_path.parent.name
    filename = pdf_path.stem.lower()
    
    if category == "ics_forms":
        # Extract form number (e.g., "ics_form_201" -> "fema_ics201")
        match = re.search(r'ics_form_(\d+)', filename)
        if match:
            return f"fema_ics{match.group(1)}"
        elif "booklet" in filename:
            return "fema_ics_forms_booklet"
        elif "reference" in filename:
            return "fema_ics_reference"
    elif category == "cert":
        return "cert_basic_training_manual"
    elif category == "ready_gov":
        # Map filenames to citation prefixes
        if "are_you_ready" in filename:
            return "ready_gov_are_you_ready"
        elif "caregivers" in filename:
            return "ready_gov_caregivers"
        elif "earthquake" in filename:
            return "ready_gov_earthquake"
        elif "flood" in filename:
            return "ready_gov_flood"
        elif "hurricane" in filename:
            return "ready_gov_hurricane"
        elif "tornado" in filename:
            return "ready_gov_tornado"
        elif "wildfire" in filename:
            return "ready_gov_wildfire"
    elif category == "fema_other":
        if "nims_doctrine" in filename:
            return "fema_nims_doctrine"
        elif "nrf" in filename:
            return "fema_nrf"
        elif "nims" in filename:
            return f"fema_nims_{filename.replace('nims-', '').replace('_', '_')}"
    
    return f"{category}_{filename}"

def extract_sections(text: str) -> List[Dict]:
    """Extract sections based on headers and track page numbers"""
    lines = text.split('\n')
    sections = []
    current_section = {"title": "Introduction", "content": [], "page": 1}
    current_page = 1
    
    # Patterns for section headers
    header_patterns = [
        r'^(Chapter\s+\d+|Section\s+\d+|Unit\s+\d+)',
        r'^[A-Z][A-Z\s]{10,}$',  # ALL CAPS lines
        r'^\d+\.\s+[A-Z]',  # Numbered sections
        r'^[IVX]+\.\s+[A-Z]',  # Roman numerals
    ]
    
    for line in lines:
        # Extract page number from markers
        page_match = re.search(r'--- Page (\d+) ---', line)
        if page_match:
            current_page = int(page_match.group(1))
            continue
        
        line_stripped = line.strip()
        if not line_stripped:
            if current_section["content"]:
                current_section["content"].append("")
            continue
        
        # Check if line looks like a header
        is_header = any(re.match(pattern, line_stripped) for pattern in header_patterns)
        
        if is_header and len(line_stripped) < 100 and len(current_section["content"]) > 5:
            # Save current section
            sections.append({
                "title": current_section["title"],
                "content": "\n".join(current_section["content"]).strip(),
                "page": current_section["page"]
            })
            # Start new section
            current_section = {"title": line_stripped, "content": [], "page": current_page}
        else:
            current_section["content"].append(line_stripped)
            if current_section["page"] == 1 and current_page > 1:
                current_section["page"] = current_page
    
    # Add final section
    if current_section["content"]:
        sections.append({
            "title": current_section["title"],
            "content": "\n".join(current_section["content"]).strip(),
            "page": current_section["page"]
        })
    
    return sections if sections else [{"title": "Full Document", "content": text, "page": 1}]


In [6]:
def process_pdf(pdf_path: Path) -> Dict:
    """Process a single PDF and extract structured content"""
    citation_prefix = get_citation_prefix(pdf_path)
    print(f"Processing: {pdf_path.name} -> {citation_prefix}")
    
    full_text = ""
    pages_content = []
    
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                text = page.extract_text() or ""
                full_text += f"\n\n--- Page {page_num} ---\n\n{text}"
                pages_content.append({
                    "page": page_num,
                    "text": text
                })
    except Exception as e:
        print(f"Error processing {pdf_path.name}: {e}")
        return None
    
    # Extract sections
    sections = extract_sections(full_text)
    
    # Create citation mappings
    citations = []
    section_idx = 1
    
    for section in sections:
        start_page = section.get("page", 1)
        citation_id = f"{citation_prefix}_section{section_idx}_p{start_page}"
        citations.append({
            "citation_id": citation_id,
            "section_title": section["title"],
            "section_number": section_idx,
            "start_page": start_page,
            "content_preview": section["content"][:200] + "..." if len(section["content"]) > 200 else section["content"]
        })
        section_idx += 1
    
    # Clean full_text (remove page markers for cleaner storage)
    clean_text = re.sub(r'--- Page \d+ ---\n\n', '\n', full_text)
    
    return {
        "source_file": str(pdf_path.relative_to(BASE_DIR)),
        "citation_prefix": citation_prefix,
        "total_pages": len(pages_content),
        "full_text": clean_text,
        "pages": pages_content,
        "sections": sections,
        "citations": citations,
        "metadata": {
            "category": pdf_path.parent.name,
            "filename": pdf_path.name,
            "word_count": len(clean_text.split()),
            "char_count": len(clean_text)
        }
    }


In [7]:
# Process all PDFs
all_pdfs = list(RAW_DIR.rglob("*.pdf"))
print(f"Found {len(all_pdfs)} PDF files to process\n")

processed_docs = []
failed_docs = []

for pdf_path in all_pdfs:
    result = process_pdf(pdf_path)
    if result:
        processed_docs.append(result)
        
        # Save processed version
        processed_file = PROCESSED_DIR / f"{result['citation_prefix']}.json"
        with open(processed_file, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        
        # Save metadata
        metadata_file = METADATA_DIR / f"{result['citation_prefix']}_metadata.json"
        metadata = {
            "citation_prefix": result["citation_prefix"],
            "source_file": result["source_file"],
            "total_pages": result["total_pages"],
            "num_sections": len(result["sections"]),
            "num_citations": len(result["citations"]),
            "metadata": result["metadata"]
        }
        with open(metadata_file, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)
    else:
        failed_docs.append(str(pdf_path))

print(f"\n‚úÖ Processed: {len(processed_docs)}")
print(f"‚ùå Failed: {len(failed_docs)}")
if failed_docs:
    print("\nFailed files:")
    for f in failed_docs:
        print(f"  - {f}")


Processing: ics_form_209_incident_status_summary_v3.pdf -> fema_ics209
Processing: ics_form_205a_communications_list_v3.pdf -> fema_ics205
Processing: ics_form_211_incident_check-in_list_v3.1.pdf -> fema_ics211
Processing: ics_form_207_incident_organization_chart_v3.pdf -> fema_ics207
Processing: ics_form_213_general_message_v3.pdf -> fema_ics213
Processing: ics_form_210_resource_status_change_v3.pdf -> fema_ics210
Processing: ics_form_215_operational_planning_worksheet_v3.pdf -> fema_ics215
Processing: ics_form_204_assignment_list_v3.1.pdf -> fema_ics204
Processing: ics_form_201_incident_briefing_v3.pdf -> fema_ics201
Processing: nims-guideline-resource-management-preparedness.pdf -> fema_nims_guideline-resource-management-preparedness
Processing: nims_doctrine.pdf -> fema_nims_doctrine
Processing: nims-incident-complexity-guide.pdf -> fema_nims_incident-complexity-guide
Processing: nrf_finalapproved_2011028.pdf -> fema_nrf
Processing: cert_basic_training_manual.pdf -> cert_basic_trai

In [9]:
# Create master citation index
citation_index = {}
for doc in processed_docs:
    for citation in doc["citations"]:
        citation_index[citation["citation_id"]] = {
            "citation_prefix": doc["citation_prefix"],
            "source_file": doc["source_file"],
            "section_title": citation["section_title"],
            "section_number": citation["section_number"],
            "start_page": citation["start_page"],
            "content_preview": citation["content_preview"]
        }

# Save citation index
index_file = METADATA_DIR / "citation_index.json"
with open(index_file, 'w', encoding='utf-8') as f:
    json.dump(citation_index, f, indent=2, ensure_ascii=False)

print(f"‚úÖ Citation index created with {len(citation_index)} entries")
print(f"üìÅ Saved to: {index_file}")

# Create summary statistics
summary = {
    "total_documents": len(processed_docs),
    "total_pages": sum(doc["total_pages"] for doc in processed_docs),
    "total_citations": len(citation_index),
    "categories": defaultdict(int),
    "word_count": sum(doc["metadata"]["word_count"] for doc in processed_docs)
}

for doc in processed_docs:
    summary["categories"][doc["metadata"]["category"]] += 1

summary_file = METADATA_DIR / "processing_summary.json"
with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)

print(f"\nüìä Summary:")
print(f"  Documents: {summary['total_documents']}")
print(f"  Pages: {summary['total_pages']}")
print(f"  Citations: {summary['total_citations']}")
print(f"  Words: {summary['word_count']:,}")
print(f"  Categories: {dict(summary['categories'])}")


‚úÖ Citation index created with 754 entries
üìÅ Saved to: /Users/ronny/Desktop/ML projects/DisasterOps/Data/metadata/citation_index.json

üìä Summary:
  Documents: 49
  Pages: 2714
  Citations: 754
  Words: 541,509
  Categories: {'ready_gov': 7, 'ics_forms': 21, 'fema_other': 4, 'cert': 1, '04 IG': 4, '06 Handouts': 6, '03 SM': 2, 'Handouts': 1, 'POI': 1, 'Instructor Guide': 1, 'Student Manual': 1}


### RAG PIPELINE ( Chunking + embeddings)

In [11]:
!pip install sentence-transformers rank-bm25 torch

Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25, sentence-transformers
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2/2[0m [sentence-transformers]
[1A[2KSuccessfully installed rank-bm25-0.2.2 sentence-transformers-5.2.0


In [12]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import pickle
from typing import List, Dict, Tuple

# Paths
CHUNKS_DIR = BASE_DIR / "Data" / "chunks"
EMBEDDINGS_DIR = BASE_DIR / "Data" / "embeddings"
CHUNKS_DIR.mkdir(parents=True, exist_ok=True)
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)

# Load embedding model (using sentence-transformers as default - no API key needed)
print("Loading embedding model...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # Fast, good quality, 384 dims
print("‚úì Model loaded")


Loading embedding model...
‚úì Model loaded


In [13]:
def chunk_text(text: str, citation_id: str, chunk_size: int = 500, chunk_overlap: int = 50) -> List[Dict]:
    """Create chunks with overlap and preserve citation mapping"""
    words = text.split()
    chunks = []
    
    for i in range(0, len(words), chunk_size - chunk_overlap):
        chunk_words = words[i:i + chunk_size]
        chunk_text = " ".join(chunk_words)
        
        if chunk_text.strip():
            chunks.append({
                "text": chunk_text,
                "citation_id": citation_id,
                "chunk_index": len(chunks),
                "word_count": len(chunk_words)
            })
    
    return chunks

def process_documents_for_rag(processed_docs: List[Dict]) -> Tuple[List[Dict], Dict]:
    """Create chunks from all processed documents"""
    all_chunks = []
    chunk_to_citation = {}
    
    for doc in processed_docs:
        citation_prefix = doc["citation_prefix"]
        
        # Chunk by sections (hierarchical approach)
        for section in doc["sections"]:
            section_citation = f"{citation_prefix}_section{section.get('section_number', 1)}_p{section.get('page', 1)}"
            section_text = section["content"]
            
            # Further chunk if section is too long
            if len(section_text.split()) > 500:
                section_chunks = chunk_text(section_text, section_citation)
                all_chunks.extend(section_chunks)
            else:
                all_chunks.append({
                    "text": section_text,
                    "citation_id": section_citation,
                    "chunk_index": 0,
                    "word_count": len(section_text.split())
                })
    
    # Build citation mapping
    for i, chunk in enumerate(all_chunks):
        chunk_to_citation[i] = chunk["citation_id"]
    
    return all_chunks, chunk_to_citation


In [14]:
# Load processed documents and create chunks
print("Loading processed documents...")
processed_files = list(PROCESSED_DIR.glob("*.json"))
processed_docs = []
for f in processed_files:
    with open(f, 'r', encoding='utf-8') as file:
        processed_docs.append(json.load(file))

print(f"Loaded {len(processed_docs)} documents")

print("\nCreating chunks...")
all_chunks, chunk_to_citation = process_documents_for_rag(processed_docs)

print(f"‚úì Created {len(all_chunks)} chunks")
print(f"‚úì Average chunk size: {np.mean([c['word_count'] for c in all_chunks]):.0f} words")

# Save chunks
chunks_file = CHUNKS_DIR / "all_chunks.json"
with open(chunks_file, 'w', encoding='utf-8') as f:
    json.dump(all_chunks, f, indent=2, ensure_ascii=False)

citation_map_file = CHUNKS_DIR / "chunk_to_citation.json"
with open(citation_map_file, 'w', encoding='utf-8') as f:
    json.dump(chunk_to_citation, f, indent=2)

print(f"‚úì Saved chunks to {chunks_file}")


Loading processed documents...
Loaded 47 documents

Creating chunks...
‚úì Created 1694 chunks
‚úì Average chunk size: 343 words
‚úì Saved chunks to /Users/ronny/Desktop/ML projects/DisasterOps/Data/chunks/all_chunks.json


In [15]:
# Generate embeddings
print("Generating embeddings...")
chunk_texts = [chunk["text"] for chunk in all_chunks]
embeddings = embedding_model.encode(chunk_texts, show_progress_bar=True, batch_size=32)

print(f"‚úì Generated {len(embeddings)} embeddings")
print(f"‚úì Embedding dimension: {embeddings.shape[1]}")

# Save embeddings
embeddings_file = EMBEDDINGS_DIR / "embeddings.npy"
np.save(embeddings_file, embeddings)
print(f"‚úì Saved embeddings to {embeddings_file}")


Generating embeddings...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

‚úì Generated 1694 embeddings
‚úì Embedding dimension: 384
‚úì Saved embeddings to /Users/ronny/Desktop/ML projects/DisasterOps/Data/embeddings/embeddings.npy


In [None]:
# Build BM25 index (sparse retrieval)
print("Building BM25 index...")
tokenized_chunks = [chunk["text"].lower().split() for chunk in all_chunks]
bm25 = BM25Okapi(tokenized_chunks)

print("‚úì BM25 index built")

# Save BM25 model
bm25_file = EMBEDDINGS_DIR / "bm25_model.pkl"
with open(bm25_file, 'wb') as f:
    pickle.dump(bm25, f)
print(f"‚úì Saved BM25 model to {bm25_file}")


In [None]:
def hybrid_search(query: str, embeddings: np.ndarray, bm25: BM25Okapi, chunk_texts: List[str], 
                  top_k: int = 10, dense_weight: float = 0.5) -> List[Tuple[int, float]]:
    """Hybrid search combining dense (embeddings) and sparse (BM25) retrieval"""
    # Dense search
    query_embedding = embedding_model.encode([query])[0]
    dense_scores = np.dot(embeddings, query_embedding)
    dense_scores = (dense_scores - dense_scores.min()) / (dense_scores.max() - dense_scores.min() + 1e-8)
    
    # Sparse search (BM25)
    tokenized_query = query.lower().split()
    sparse_scores = bm25.get_scores(tokenized_query)
    sparse_scores = (sparse_scores - sparse_scores.min()) / (sparse_scores.max() - sparse_scores.min() + 1e-8)
    
    # Combine scores
    hybrid_scores = dense_weight * dense_scores + (1 - dense_weight) * sparse_scores
    
    # Get top k
    top_indices = np.argsort(hybrid_scores)[::-1][:top_k]
    results = [(int(idx), float(hybrid_scores[idx])) for idx in top_indices]
    
    return results

# Test search
test_query = "How to establish triage area during emergency"
print(f"Testing search with query: '{test_query}'")
results = hybrid_search(test_query, embeddings, bm25, chunk_texts, top_k=5)

print("\nTop 5 results:")
for i, (idx, score) in enumerate(results, 1):
    chunk = all_chunks[idx]
    print(f"\n{i}. Score: {score:.4f}")
    print(f"   Citation: {chunk['citation_id']}")
    print(f"   Text: {chunk['text'][:150]}...")


In [18]:
# Save RAG pipeline metadata
rag_metadata = {
    "total_chunks": len(all_chunks),
    "embedding_dim": embeddings.shape[1],
    "embedding_model": "all-MiniLM-L6-v2",
    "chunk_size": 500,
    "chunk_overlap": 50,
    "files": {
        "chunks": str(chunks_file.relative_to(BASE_DIR)),
        "embeddings": str(embeddings_file.relative_to(BASE_DIR)),
        "bm25": str(bm25_file.relative_to(BASE_DIR)),
        "citation_map": str(citation_map_file.relative_to(BASE_DIR))
    }
}

rag_metadata_file = EMBEDDINGS_DIR / "rag_metadata.json"
with open(rag_metadata_file, 'w', encoding='utf-8') as f:
    json.dump(rag_metadata, f, indent=2)

print(f"\n‚úÖ RAG Pipeline Complete!")
print(f"üìä Summary:")
print(f"   Chunks: {rag_metadata['total_chunks']}")
print(f"   Embeddings: {embeddings.shape}")
print(f"   BM25 index: Built")
print(f"   Metadata saved to: {rag_metadata_file}")



‚úÖ RAG Pipeline Complete!
üìä Summary:
   Chunks: 1694
   Embeddings: (1694, 384)
   BM25 index: Built
   Metadata saved to: /Users/ronny/Desktop/ML projects/DisasterOps/Data/embeddings/rag_metadata.json



### RAG Pipeline

In [19]:
# Test 1: Chunk Quality Statistics
print("=" * 60)
print("TEST 1: Chunk Quality Statistics")
print("=" * 60)

chunk_sizes = [c['word_count'] for c in all_chunks]
print(f"\nChunk Size Statistics:")
print(f"  Total chunks: {len(all_chunks)}")
print(f"  Mean words: {np.mean(chunk_sizes):.1f}")
print(f"  Median words: {np.median(chunk_sizes):.1f}")
print(f"  Min words: {np.min(chunk_sizes)}")
print(f"  Max words: {np.max(chunk_sizes)}")
print(f"  Std dev: {np.std(chunk_sizes):.1f}")

# Check citation integrity
unique_citations = set(c['citation_id'] for c in all_chunks)
print(f"\nCitation Integrity:")
print(f"  Unique citations: {len(unique_citations)}")
print(f"  Chunks with citations: {sum(1 for c in all_chunks if c.get('citation_id'))}/{len(all_chunks)}")

# Sample chunks
print(f"\nSample Chunks (first 3):")
for i, chunk in enumerate(all_chunks[:3]):
    print(f"\n{i+1}. Citation: {chunk['citation_id']}")
    print(f"   Words: {chunk['word_count']}")
    print(f"   Preview: {chunk['text'][:100]}...")


TEST 1: Chunk Quality Statistics

Chunk Size Statistics:
  Total chunks: 1694
  Mean words: 343.0
  Median words: 500.0
  Min words: 1
  Max words: 500
  Std dev: 192.1

Citation Integrity:
  Unique citations: 465
  Chunks with citations: 1694/1694

Sample Chunks (first 3):

1. Citation: fema_ics218_section1_p2
   Words: 500
   Preview: SUPPORT VEHICLE/EQUIPMENT INVENTORY (ICS 218) 1. Incident Name: 2. Incident Number: 3. Date/Time Pre...

2. Citation: fema_ics218_section1_p2
   Words: 159
   Preview: etc. Agency or Owner Enter the name of the agency or owner of the vehicle or equipment. Operator Nam...

3. Citation: 04 IG_is0100c_poi_section1_p1
   Words: 142
   Preview: March 2025 POI - IS-0100.c: An Introduction to the Incident Command System, ICS 100
POI - IS-0100.c:...



TEST 2: Embedding Quality

Embedding Properties:
  Shape: (1694, 384)
  Mean: -0.0003
  Std: 0.0510
  Min: -0.2407
  Max: 0.2387

Semantic Similarity Test:
  Chunk 1: SUPPORT VEHICLE/EQUIPMENT INVENTORY (ICS 218) 1. I...
  Chunk 2: etc. Agency or Owner Enter the name of the agency ...
  Cosine similarity: 0.5661

Duplicate Detection:
  Unique embedding norms (rounded): 1/1694
  Potential duplicates: 1693



TEST 3: Retrieval Quality

Testing queries with expected content:

Query: 'triage area setup'
Expected: Should find CERT/medical response content
Top 3 results:
    1. Score: 0.7367 | cert_basic_training_manual_section1_p26
     Text: have received. VPA protects CERT volunteers during a disaster, and volunteers ma...
    2. Score: 0.6506 | cert_basic_training_manual_section1_p204
     Text: wall/left wall. Keep in mind that every interior space has six sides ‚Äî including...
    3. Score: 0.6359 | Instructor Guide_igis0700b_section1_p15
     Text: ‚Ä¢ Distances between personnel and resources Lesson 3: NIMS Management Characteri...

Query: 'ICS form 201'
Expected: Should find ICS-201 form content
Top 3 results:
    1. Score: 1.0000 | 03 SM_is0200c sm_section1_p132
     Text: Response, ICS 200 Visual 38: Activity 4.2: Using ICS Form 201 Activity Purpose: ...
    2. Score: 0.9100 | 04 IG_is0200c ig_section1_p165
     Text: Incident Command System for Initial Response, ICS 200 Visual 38:

In [24]:
# FIX: Rebuild citation index from chunks
print("=" * 60)
print("FIX: Rebuilding Citation Index from Chunks")
print("=" * 60)

# Build citation index from CHUNKS (not original documents)
chunk_citation_index = {}
for chunk in all_chunks:
    cit_id = chunk['citation_id']
    if cit_id not in chunk_citation_index:
        chunk_citation_index[cit_id] = {
            "citation_id": cit_id,
            "chunk_count": 0,
            "sample_text": chunk['text'][:150] + "..."
        }
    chunk_citation_index[cit_id]["chunk_count"] += 1

# Save updated citation index
updated_index_file = CHUNKS_DIR / "chunk_citation_index.json"
with open(updated_index_file, 'w', encoding='utf-8') as f:
    json.dump(chunk_citation_index, f, indent=2, ensure_ascii=False)

print(f"‚úì Rebuilt citation index: {len(chunk_citation_index)} unique citations")
print(f"‚úì Total chunks covered: {len(all_chunks)}")
print(f"‚úì Saved to: {updated_index_file}")

# Check for content duplicates (actual issue)
unique_texts = len(set(c['text'] for c in all_chunks))
print(f"\nüìä Content Analysis:")
print(f"  Unique text chunks: {unique_texts}/{len(all_chunks)}")
print(f"  Duplicate texts: {len(all_chunks) - unique_texts}")

# Note about embedding norms
print(f"\nüìù Note: 'Potential duplicates: 1693' from Test 2 is a FALSE POSITIVE")
print(f"  Sentence-transformers normalizes embeddings to unit length (norm ‚âà 1.0)")
print(f"  This is expected behavior, not actual duplicates.")


FIX: Rebuilding Citation Index from Chunks
‚úì Rebuilt citation index: 465 unique citations
‚úì Total chunks covered: 1694
‚úì Saved to: /Users/ronny/Desktop/ML projects/DisasterOps/Data/chunks/chunk_citation_index.json

üìä Content Analysis:
  Unique text chunks: 1547/1694
  Duplicate texts: 147

üìù Note: 'Potential duplicates: 1693' from Test 2 is a FALSE POSITIVE
  Sentence-transformers normalizes embeddings to unit length (norm ‚âà 1.0)
  This is expected behavior, not actual duplicates.


### Result Check

In [21]:
from agents import run_pipeline, print_result_summary

result = run_pipeline("Flood in downtown. Water rising. Buildings affected.")
print_result_summary(result)



PIPELINE RESULT SUMMARY

üìã Structured Incident:
{
  "hazards": [
    "flood"
  ],
  "injuries": [],
  "infrastructure_status": [
    "buildings affected"
  ],
  "weather": "water rising",
  "available_responders": [],
  "constraints": []
}

üìö Evidence: 15 chunks retrieved
   Unique citations: 15

üìù Operational Plan:
   Objectives: 4
   Tasks: 5
   Time Horizon: 0-2 hours

üì£ Communications:
   Public Advisory: Attention residents: Due to rising water levels, flooding is expected in our area. Please stay indoors and avoid flooded roads. If you live in a low-l...

‚úÖ Verification:
   Citation Coverage: all_claims_cited
   Confidence Score: 0.90
   Multi-Source Status: validated
   Flagged Issues: 1
   Known Claims: 4
   Unknown Claims: 1


In [22]:
# Or run tests
from agents import run_test_suite
run_test_suite()

COMPREHENSIVE PIPELINE TEST: 5-AGENT SYSTEM
Testing 8 scenarios...

[1/8] Processing: Urban Flooding
--------------------------------------------------------------------------------
  ‚úì Processed in 34.15s
  Hazards: flood
  Evidence: 15 chunks, 15 citations
  Confidence: 0.90
  Citation Coverage: all_claims_cited

[2/8] Processing: Wildfire Smoke
--------------------------------------------------------------------------------
  ‚úì Processed in 42.47s
  Hazards: wildfire, heavy smoke, poor air quality
  Evidence: 15 chunks, 15 citations
  Confidence: 0.90
  Citation Coverage: all_claims_cited

[3/8] Processing: Earthquake with Aftershocks
--------------------------------------------------------------------------------
  ‚úì Processed in 34.52s
  Hazards: earthquake, aftershocks
  Evidence: 15 chunks, 15 citations
  Confidence: 0.90
  Citation Coverage: all_claims_cited

[4/8] Processing: Hurricane Landfall
-----------------------------------------------------------------------------

[{'name': 'Urban Flooding',
  'success': True,
  'time': 34.153101682662964,
  'hazards': ['flood'],
  'evidence': 15,
  'citations': 15,
  'confidence': 0.9,
  'coverage': 'all_claims_cited'},
 {'name': 'Wildfire Smoke',
  'success': True,
  'time': 42.4677939414978,
  'hazards': ['wildfire', 'heavy smoke', 'poor air quality'],
  'evidence': 15,
  'citations': 15,
  'confidence': 0.9,
  'coverage': 'all_claims_cited'},
 {'name': 'Earthquake with Aftershocks',
  'success': True,
  'time': 34.52307915687561,
  'hazards': ['earthquake', 'aftershocks'],
  'evidence': 15,
  'citations': 15,
  'confidence': 0.9,
  'coverage': 'all_claims_cited'},
 {'name': 'Hurricane Landfall',
  'success': True,
  'time': 39.907214879989624,
  'hazards': ['hurricane', 'high winds', 'storm surge'],
  'evidence': 15,
  'citations': 15,
  'confidence': 0.95,
  'coverage': 'all_claims_cited'},
 {'name': 'Chemical Spill',
  'success': True,
  'time': 34.09149718284607,
  'hazards': ['chemical spill'],
  'eviden

In [26]:
from agents import run_pipeline
from output_generation import generate_from_agent_output, export_to_json
from pathlib import Path

# Run pipeline
result = run_pipeline("Flood in downtown area. Multiple buildings affected.")

# Generate outputs (this will use the fixed code)
all_outputs = generate_from_agent_output(
    result,
    incident_name="Urban Flooding",
    incident_number="2024-001"
)

# Export
export_to_json(all_outputs, Path("outputs/my_incident_fixed.json"))

‚úì Exported JSON to outputs/my_incident_fixed.json


In [2]:
!pip install langchain-openai langchain-core langgraph openai

Collecting langchain-openai
  Using cached langchain_openai-1.1.6-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-core
  Using cached langchain_core-1.2.5-py3-none-any.whl.metadata (3.7 kB)
Collecting langgraph
  Using cached langgraph-1.0.5-py3-none-any.whl.metadata (7.4 kB)
Collecting openai
  Using cached openai-2.14.0-py3-none-any.whl.metadata (29 kB)
Collecting tiktoken<1.0.0,>=0.7.0 (from langchain-openai)
  Using cached tiktoken-0.12.0-cp314-cp314-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting jsonpatch<2.0.0,>=1.33.0 (from langchain-core)
  Using cached jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting langsmith<1.0.0,>=0.3.45 (from langchain-core)
  Using cached langsmith-0.5.2-py3-none-any.whl.metadata (15 kB)
Collecting pydantic<3.0.0,>=2.7.4 (from langchain-core)
  Using cached pydantic-2.12.5-py3-none-any.whl.metadata (90 kB)
Collecting tenacity!=8.4.0,<10.0.0,>=8.1.0 (from langchain-core)
  Using cached tenacity-9.1.2-py3-none-any.whl.metadata (

In [4]:
!python test_output_generation.py

  from pydantic.v1.fields import FieldInfo as FieldInfoV1

OUTPUT GENERATION MODULE - COMPREHENSIVE TEST SUITE
TEST: Single Scenario Output Generation

Scenario: Urban Flooding
Description: Flood in downtown area. Water level rising. Multiple buildings affected. No injuries reported yet. Rain expected to continue.

Step 1: Running agent pipeline...

‚ùå TEST FAILED

Error: OPENAI_API_KEY environment variable not set
Traceback (most recent call last):
  File "/Users/ronny/Desktop/ML projects/DisasterOps/test_output_generation.py", line 250, in main
    all_outputs, agent_result = test_single_scenario()
                                ~~~~~~~~~~~~~~~~~~~~^^
  File "/Users/ronny/Desktop/ML projects/DisasterOps/test_output_generation.py", line 41, in test_single_scenario
    agent_result = run_pipeline(scenario["description"])
  File "/Users/ronny/Desktop/ML projects/DisasterOps/agents.py", line 598, in run_pipeline
    result = app.invoke(initial_state)
  File "/Users/ronny/Desktop/ML pro