[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Hawksight-AI/semantica/blob/main/cookbook/use_cases/cybersecurity/02_Threat_Intelligence_Hybrid_RAG.ipynb)

# Threat Intelligence Hybrid RAG - Vector + Graph Retrieval

## Overview

This notebook demonstrates **threat intelligence hybrid RAG** using Semantica with focus on **hybrid search**, **vector + graph retrieval**, and **context-aware queries**. The pipeline combines vector search with temporal knowledge graphs for advanced threat intelligence querying.

### Key Features

- **Hybrid RAG**: Combines vector similarity search with knowledge graph traversal
- **Vector + Graph Retrieval**: Uses both vector embeddings and graph relationships
- **Context-Aware Queries**: Provides context-aware retrieval for threat intelligence
- **Temporal Knowledge Graphs**: Builds temporal KGs for threat timeline analysis
- **Multi-hop Reasoning**: Follows relationships across the graph for deeper context

### Pipeline Architecture

1. **Phase 0**: Setup & Configuration
2. **Phase 1**: Threat Intelligence Data Ingestion (security RSS feeds, with a sample-data fallback)
3. **Phase 2**: Text Normalization & Entity-Aware Chunking
4. **Phase 3**: Temporal Knowledge Graph Construction, Entity Extraction (IOC, Campaign, Threat, Actor, TTP, Malware), Embedding Generation & Deduplication
5. **Phase 4**: Vector Store Population
6. **Phase 5**: Hybrid Search (Vector + Graph)
7. **Phase 6**: Context-Aware Query System
8. **Phase 7**: Visualization & Summary

---

## Installation


In [None]:
%pip install -qU semantica networkx matplotlib plotly pandas faiss-cpu groq sentence-transformers


---

## Phase 0: Setup & Configuration


In [None]:
import os
from semantica.core import Semantica, ConfigManager
from semantica.vector_store import VectorStore
from semantica.context import AgentContext

# Keep a GROQ_API_KEY that is already exported; otherwise install the
# placeholder value "your-key" so the variable is always defined.
os.environ.setdefault("GROQ_API_KEY", "your-key")

# Single configuration dictionary handed to ConfigManager below.
config_dict = {
    "project_name": "Threat_Intelligence_Hybrid_RAG",
    "embedding": {
        "provider": "sentence_transformers",
        "model": "all-MiniLM-L6-v2",
    },
    "extraction": {
        "provider": "groq",
        "model": "llama-3.1-8b-instant",
    },
    "vector_store": {
        "provider": "faiss",
        "dimension": 384,
    },
    "knowledge_graph": {
        "backend": "networkx",
        "temporal": True,
    },
}

# Load the configuration, then create the framework entry point from it.
config = ConfigManager().load_from_dict(config_dict)
core = Semantica(config=config)

# Stand-alone vector store, created with the same backend and dimension
# that the "vector_store" section of config_dict declares.
vector_store = VectorStore(backend="faiss", dimension=384)

print("Configured for threat intelligence hybrid RAG")


---

## Phase 1: Real Data Ingestion (Security RSS Feeds)

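Ingest threat intelligence data from security RSS feeds. If no feed returns any documents, the cell falls back to a small sample file written to `data/threat_intel.txt`.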


In [None]:
from semantica.ingest import FeedIngestor, FileIngestor
import os

# Directory that receives the locally written fallback sample (created even
# when the feeds succeed, so later cells can rely on it existing).
os.makedirs("data", exist_ok=True)

# Ingest from multiple security RSS feeds (real data sources)
# NOTE(review): us-cert.gov content now appears to live under cisa.gov --
# confirm this URL still resolves to an RSS feed before relying on it.
security_feeds = [
    "https://www.us-cert.gov/ncas/alerts.xml",  # US-CERT alerts
    # Add more security feeds as needed
]

# Accumulates the documents returned by every feed that could be ingested.
documents = []
for feed_url in security_feeds:
    # Best effort: a feed that fails is reported and skipped so the remaining
    # feeds, and the sample-data fallback below, still run.
    try:
        feed_ingestor = FeedIngestor()
        feed_documents = feed_ingestor.ingest(feed_url, method="rss")
        print(f"Ingested {len(feed_documents)} documents from {feed_url}")
        documents.extend(feed_documents)
    except Exception as e:
        print(f"Feed ingestion failed for {feed_url}: {e}")

# Fallback: Sample threat intelligence data
if not documents:
    threat_data = """
    IOC: IP address 192.168.1.50 associated with APT28 campaign.
    Threat actor APT28 uses TTP: Spear phishing and credential harvesting.
    Campaign Operation GhostShell targets financial institutions.
    Malware sample hash: abc123def456 linked to APT28 infrastructure.
    IOC: Domain example-malicious.com linked to APT29 operations.
    """
    sample_path = "data/threat_intel.txt"
    # Pin the encoding so the file's bytes do not depend on the platform's
    # locale default.
    with open(sample_path, "w", encoding="utf-8") as f:
        f.write(threat_data)
    documents = FileIngestor().ingest(sample_path)
    print(f"Ingested {len(documents)} documents from sample data")


---

## Phase 2: Text Normalization & Advanced Chunking

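Normalize the ingested threat intelligence text, then split it with entity-aware chunking so that threat entity boundaries are preserved. A relation-aware splitter is also configured as an alternative, but it is not applied in this notebook.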


In [None]:
from semantica.normalize import TextNormalizer
from semantica.split import TextSplitter, EntityAwareChunker, RelationAwareChunker
from semantica.deduplication import DuplicateDetector

def _text_of(item):
    """Return ``item.content`` when that attribute exists, else ``str(item)``."""
    return item.content if hasattr(item, "content") else str(item)


# Normalize IOC data and threat intelligence: one normalized string per
# ingested document, in the same order as `documents`.
normalizer = TextNormalizer()
normalized_documents = [
    normalizer.normalize(
        _text_of(doc),
        clean_html=True,
        normalize_entities=True,
        remove_extra_whitespace=True,
    )
    for doc in documents
]

print(f"Normalized {len(normalized_documents)} documents")

# Chunk size and overlap shared by both splitters below.
_chunk_window = {"chunk_size": 1000, "chunk_overlap": 200}

# Entity-aware splitter: preserves threat entity boundaries for GraphRAG.
entity_splitter = TextSplitter(
    method="entity_aware",
    ner_method="llm",
    **_chunk_window,
)

# Alternative: relation-aware splitter that preserves threat relationship
# triplets. It is configured here but not applied below.
relation_splitter = TextSplitter(
    method="relation_aware",
    **_chunk_window,
)

# Flatten the entity-aware chunks of every document into one list of plain
# strings (preserves IOC, Actor, Campaign boundaries).
chunked_docs = [
    _text_of(chunk)
    for doc_text in normalized_documents
    for chunk in entity_splitter.split(doc_text)
]

print(f"Created {len(chunked_docs)} entity-aware chunks")


---

## Phase 3: Knowledge Graph Construction & Deduplication

Build temporal knowledge graph and deduplicate threat entities.


In [None]:
def _entities_of_type(entity_list, keyword):
    """Return the entities whose ``type`` contains *keyword*, ignoring case.

    A missing or ``None`` type is treated as an empty string, so such
    entities are skipped instead of raising ``AttributeError``.
    """
    needle = keyword.lower()
    return [e for e in entity_list if needle in str(e.get("type") or "").lower()]


# Build knowledge base with both vectors and graph
result = core.build_knowledge_base(
    sources=chunked_docs,
    custom_entity_types=["IOC", "Campaign", "Threat", "Actor", "TTP", "Malware"],
    embeddings=True,
    graph=True,
    temporal=True
)

# The parts of the build result that the rest of the notebook uses.
kg = result["knowledge_graph"]
entities = result["entities"]
embeddings_data = result["embeddings"]

# Deduplicate threat entities (IOCs, actors, campaigns).
# The substring match already covers the exact names "IOC" / "Actor", so no
# separate equality test is needed.
iocs = _entities_of_type(entities, "ioc")
actors = _entities_of_type(entities, "actor")

detector = DuplicateDetector()
ioc_duplicates = detector.detect_duplicates(iocs, threshold=0.9)
actor_duplicates = detector.detect_duplicates(actors, threshold=0.9)

deduplicated_iocs = detector.resolve_duplicates(iocs, ioc_duplicates)
deduplicated_actors = detector.resolve_duplicates(actors, actor_duplicates)

print(f"Built hybrid system: {len(kg.get('entities', []))} entities, {len(embeddings_data['vectors'])} vectors")
print(f"Deduplicated: {len(iocs)} -> {len(deduplicated_iocs)} unique IOCs")
print(f"Deduplicated: {len(actors)} -> {len(deduplicated_actors)} unique actors")


---

## Phase 4: Vector Store Population

Store embeddings in vector store for hybrid retrieval.


In [None]:
# Hand the vectors produced in Phase 3, together with their metadata, to the
# vector store so they can be searched during hybrid retrieval.
vector_store.store_vectors(vectors=embeddings_data["vectors"], metadata=embeddings_data["metadata"])

# Report how many vectors were passed to the store.
stored_count = len(embeddings_data["vectors"])
print(f"Stored {stored_count} embeddings in vector store")
print("Focus: Hybrid RAG, vector + graph retrieval, context-aware queries")


In [None]:
from semantica.context import AgentContext
from semantica.embeddings import EmbeddingGenerator

# Initialize enhanced GraphRAG context over the populated vector store and
# the knowledge graph built in Phase 3.
context = AgentContext(vector_store=vector_store, knowledge_graph=kg)

# Example hybrid query
query = "What threats are associated with APT28?"
print(f"Query: {query}\n")

# Generate query embedding (same provider and model as the configuration).
embedding_gen = EmbeddingGenerator(provider="sentence_transformers", model="all-MiniLM-L6-v2")
query_embedding = embedding_gen.generate_embeddings([query])[0]

# Vector search
vector_results = vector_store.search_vectors(query_embedding, k=5)

# Enhanced GraphRAG retrieval with graph expansion
graph_context = context.retrieve(
    query,
    max_results=10,
    use_graph=True,  # Enable graph traversal
    expand_graph=True,  # Expand graph relationships
    include_entities=True,  # Include related entities
    include_relationships=True  # Include relationships
)

print("Hybrid RAG results:")
print(f"  - Vector search: {len(vector_results)} matches")
print(f"  - GraphRAG retrieval: {len(graph_context)} context items with graph expansion")
print("\nTop GraphRAG results:")
# The loop variable is deliberately NOT called `result`: a for-loop target
# stays bound after the loop, and `result` is the notebook-global that holds
# the Phase 3 build_knowledge_base output.
for rank, hit in enumerate(graph_context[:3], 1):
    print(f"{rank}. Score: {hit.get('score', 0):.3f}")
    print(f"   Content: {hit.get('content', '')[:150]}...")
    if hit.get('related_entities'):
        print(f"   Related entities: {len(hit['related_entities'])}")
print("\nThis cookbook emphasizes enhanced GraphRAG with entity-aware chunking and deduplication")


---

## Phase 7: Visualization & Summary

Visualize the threat intelligence knowledge graph and print a summary of the pipeline run.


In [None]:
from semantica.visualization import KGVisualizer

# Render the knowledge graph; the output path is an HTML file in the current
# working directory.
visualizer = KGVisualizer()
visualizer.visualize(kg, output_path="threat_intelligence_kg.html")

print("Threat intelligence hybrid RAG analysis complete")

# Recap of what each earlier phase produced. Lines without interpolated
# values are plain string literals.
print("\n=== Pipeline Summary ===")
print(f"✓ Ingested {len(documents)} documents from security RSS feeds")
print(f"✓ Normalized {len(normalized_documents)} documents")
print(f"✓ Created {len(chunked_docs)} entity-aware chunks")
print("✓ Deduplicated IOCs and actors")
print(f"✓ Built temporal KG with {len(kg.get('entities', []))} entities")
print(f"✓ Stored {len(embeddings_data['vectors'])} embeddings")
print("✓ Enhanced GraphRAG with graph expansion enabled")
print("✓ Emphasizes: Hybrid RAG, entity-aware chunking, vector + graph retrieval, context-aware queries")
