In [2]:
import os
import sys
from pathlib import Path

# Root of the project checkout. The PRJT_VAP_ROOT environment variable lets the
# notebook run on another machine; when it is unset the original location is used.
project_root = Path(os.environ.get("PRJT_VAP_ROOT", "/home/khaldoun/prjt_vap"))

# Put the root at the front of sys.path so the `src` package can be imported.
# Existing occurrences are dropped first, so re-running this cell does not
# accumulate duplicate entries.
root_entry = str(project_root)
sys.path[:] = [entry for entry in sys.path if entry != root_entry]
sys.path.insert(0, root_entry)

print(f"Project root: {project_root}")

Project root: /home/khaldoun/prjt_vap


## Etape 1: Verification Neo4j

On verifie que Neo4j est disponible.

In [3]:
import os

from neo4j import GraphDatabase

# Connection settings can be overridden through environment variables; the
# defaults are the values this notebook originally used.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "password")

try:
    # Using the driver as a context manager closes it even when
    # verify_connectivity() raises, which the explicit close() did not do.
    with GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password)) as driver:
        driver.verify_connectivity()
        print("Neo4j est accessible")
except Exception as e:
    # Best-effort diagnostic: report the failure and show how to start a server.
    print(f"Erreur Neo4j: {e}")
    print("\nPour demarrer Neo4j:")
    print("docker run -d -p 7474:7474 -p 7687:7687 -e NEO4J_AUTH=neo4j/password neo4j:latest")

Neo4j est accessible


## Etape 2: Construction du Knowledge Graph

On construit le graphe a partir des 1 050 questions et de leurs relations (labels, standards, clauses).

In [4]:
import os

from src.rag_graph.graph_builder import ISOGraphBuilder

# Connection settings come from the environment when available, so credentials
# do not have to be edited in the notebook; defaults match the original values.
builder = ISOGraphBuilder(
    uri=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "password"),
)

print("Graph builder initialise")

INFO:src.rag_graph.graph_builder:OK - Connected to Neo4j


Graph builder initialise


In [7]:
# Count every node to find out whether the graph has already been built.
with builder.driver.session() as session:
    record = session.run("MATCH (n) RETURN count(n) as count").single()
    count = record['count']
    print(f"Noeuds dans le graphe: {count}")

# Only an empty database triggers a build; otherwise the existing graph is reused.
if count != 0:
    print("Graphe deja construit")
else:
    print("\nConstruction du graphe (cela prend 2-3 minutes)...")
    builder.build_graph(method=1)
    print("Graphe construit")

Noeuds dans le graphe: 1835
Graphe deja construit


## Etape 3: Statistiques du graphe

In [14]:
# Summary counters as reported by the builder (note: this is a private helper).
stats = builder._get_statistics()

print("STATISTIQUES DU KNOWLEDGE GRAPH")
print("=" * 50)

# (display title, key in the stats mapping), printed in this order.
for title, key in (
    ("Questions", "questions"),
    ("Labels", "labels"),
    ("Standards", "standards"),
    ("Clauses", "clauses"),
    ("Relations", "relationships"),
):
    print(f"{title}: {stats[key]}")

STATISTIQUES DU KNOWLEDGE GRAPH
Questions: 1050
Labels: 580
Standards: 5
Clauses: 200
Relations: 6556


## Etape 4: Requetes Cypher directes

On teste quelques requetes Cypher pour explorer le graphe.

In [5]:
# Example 1: questions carrying one of the backup-related labels
cypher_query = """
MATCH (q:Question)-[:HAS_LABEL]->(l:Label)
WHERE l.name = 'backups' OR l.name = 'backup protection' OR l.name = 'backup integrity'
RETURN q.text as question, q.id as id, collect(l.name) as labels
LIMIT 5
"""

# Materialise the records while the session is still open.
with builder.driver.session() as session:
    questions = list(session.run(cypher_query))

print("Questions avec labels backup:\n")
for rank, record in enumerate(questions, start=1):
    print(f"[{rank}] {record['question']}")
    print(f"    ID: {record['id']}")
    joined_labels = ', '.join(record['labels'])
    print(f"    Labels: {joined_labels}\n")

Questions avec labels backup:

[1] Are all copies of PII (logs, backups, archives) removed at termination?
    ID: doc_1038
    Labels: backups

[2] Is PII securely deleted upon contract termination?
    ID: doc_1036
    Labels: backups

[3] Are all copies of PII deleted upon request, including backups?
    ID: doc_907
    Labels: backups

[4] Are backups tested for restoration integrity?
    ID: doc_284
    Labels: backups, backup protection, backup integrity

[5] Are backups performed according to policy?
    ID: doc_282
    Labels: backups



In [6]:
# Example 2: ISO 27001 questions carrying one of the policy-related labels
cypher_query = """
MATCH (q:Question)-[:BELONGS_TO_STANDARD]->(s:Standard {name: 'iso_27001'})
MATCH (q)-[:HAS_LABEL]->(l:Label)
WHERE l.name = 'policy' OR l.name = 'policy compliance' OR l.name = 'information security policy'
RETURN q.text as question, collect(l.name) as labels
LIMIT 5
"""

# Materialise the records while the session is still open.
with builder.driver.session() as session:
    questions = list(session.run(cypher_query))

print("Questions ISO 27001 sur policy:\n")
for rank, record in enumerate(questions, start=1):
    print(f"[{rank}] {record['question']}")
    joined_labels = ', '.join(record['labels'])
    print(f"    Labels: {joined_labels}\n")

Questions ISO 27001 sur policy:

[1] Are changes reviewed and tested before deployment?
    Labels: policy compliance

[2] Are all improvements tracked to closure in a documented process?
    Labels: policy compliance

[3] Are KPIs and KRIs used to measure security performance?
    Labels: policy compliance

[4] Are SIEM alerts triaged according to documented severity levels?
    Labels: policy compliance

[5] Are email filtering and anti-spam mechanisms implemented?
    Labels: policy compliance



In [11]:
# Example 3: the ten labels attached to the largest number of questions
cypher_query = """
MATCH (l:Label)<-[:HAS_LABEL]-(q:Question)
RETURN l.name as label, count(q) as usage
ORDER BY usage DESC
LIMIT 10
"""

# Materialise the records while the session is still open.
with builder.driver.session() as session:
    labels = list(session.run(cypher_query))

print("Top 10 labels les plus utilises:\n")
for position, row in enumerate(labels, start=1):
    name = row['label']
    usage = row['usage']
    # Fixed-width columns keep the ranking aligned.
    print(f"{position:2d}. {name:30s} ({usage} questions)")

Top 10 labels les plus utilises:

 1. documentation                  (346 questions)
 2. governance                     (277 questions)
 3. risk management                (177 questions)
 4. privacy compliance             (142 questions)
 5. pii processing                 (136 questions)
 6. cloud security                 (134 questions)
 7. monitoring                     (133 questions)
 8. access control                 (123 questions)
 9. encryption                     (111 questions)
10. policy compliance              (110 questions)


## Etape 5: Detection automatique de labels

In [15]:
from src.rag_graph.label_detector import LabelDetector

# Create the detector and report how many keywords its mapping holds.
detector = LabelDetector()

print("Label detector initialise")
keyword_count = len(detector.keyword_mapping)
print(f"Mots-cles mappes: {keyword_count}")

INFO:src.rag_graph.label_detector:OK - Label detector initialized


Label detector initialise
Mots-cles mappes: 29


In [16]:
# Test 1: single-topic query, no standard mentioned
query1 = "Questions sur les sauvegardes de donnees"
labels, standard = detector.detect_labels(query1), detector.detect_standard(query1)

print(f"Requete: '{query1}'")
print(f"Labels detectes: {labels}")
# A falsy standard is shown with a placeholder.
print(f"Standard: {standard or 'Non specifie'}")

INFO:src.rag_graph.label_detector:Detected labels: {'backup'}


Requete: 'Questions sur les sauvegardes de donnees'
Labels detectes: ['backup']
Standard: Non specifie


In [17]:
# Test 2: query mixing several topics and naming a standard
query2 = "Politique de securite et formation des employes ISO 27001"
labels, standard = detector.detect_labels(query2), detector.detect_standard(query2)

print(f"Requete: '{query2}'")
print(f"Labels detectes: {labels}")
print(f"Standard: {standard}")

INFO:src.rag_graph.label_detector:Detected labels: {'training', 'policy'}
INFO:src.rag_graph.label_detector:Detected standard: iso 27001


Requete: 'Politique de securite et formation des employes ISO 27001'
Labels detectes: ['training', 'policy']
Standard: iso_27001


## Etape 6: Generation de requetes Cypher

In [18]:
from src.rag_graph.cypher_generator import CypherQueryGenerator

# Instantiate the generator that turns detected labels into Cypher text.
cypher_gen = CypherQueryGenerator()
print('Cypher generator initialise')

INFO:src.rag_graph.cypher_generator:OK - Cypher query generator initialized


Cypher generator initialise


In [19]:
# Build a Cypher query from the labels detected in a free-text request
query = "Questions sur les backups"
labels = detector.detect_labels(query)

cypher_query = cypher_gen.generate_label_query(labels=labels, limit=5)

# Show the request, the detected labels, then the generated Cypher text.
print(f"Requete: {query}")
print(f"Labels: {labels}\n")
print("Requete Cypher generee:")
print(cypher_query)

INFO:src.rag_graph.label_detector:Detected labels: {'backup'}


Requete: Questions sur les backups
Labels: ['backup']

Requete Cypher generee:
MATCH (q:Question)-[:HAS_LABEL]->(l:Label)
        WHERE (l.name = 'backup')
        WITH q, collect(DISTINCT l.name) as labels
        MATCH (q)-[:BELONGS_TO_STANDARD]->(s:Standard)
        
        MATCH (q)-[:BELONGS_TO_CLAUSE]->(c:Clause)
        
        RETURN q.id as id, 
               q.text as text, 
               s.name as standard,
               c.title as clause,
               labels
        ORDER BY size(labels) DESC
        LIMIT 5


## Etape 7: Graph Retriever

On utilise le retriever pour chercher dans le graphe.

In [20]:
import os

from src.rag_graph.graph_retriever import GraphRetriever

# Connection settings come from the environment when available, so credentials
# do not have to be edited in the notebook; defaults match the original values.
retriever = GraphRetriever(
    uri=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "password"),
)

print("Graph retriever initialise")

INFO:src.rag_graph.graph_retriever:OK - Connected to Neo4j
INFO:src.rag_graph.label_detector:OK - Label detector initialized
INFO:src.rag_graph.cypher_generator:OK - Cypher query generator initialized


Graph retriever initialise


In [21]:
# Test 1: simple search
query = "Backup and disaster recovery"
results = retriever.retrieve(query, top_k=5)

print(f"Requete: '{query}'")
print(f"Resultats: {len(results)}\n")

# An empty result set would otherwise print nothing below; say so explicitly.
# NOTE(review): a likely cause is that none of the detected labels exists
# under that exact name in the graph - confirm against the retriever's query.
if not results:
    print("Aucun resultat: les labels detectes ne correspondent peut-etre a aucun label du graphe\n")

for i, doc in enumerate(results, 1):
    print(f"[{i}] {doc['content']}")
    print(f"    Standard: {doc['metadata']['iso_standard']}")
    # Only the first 50 items/characters of the labels value are shown.
    print(f"    Labels: {doc['metadata']['labels'][:50]}...\n")

INFO:src.rag_graph.graph_retriever:Retrieving questions for: 'Backup and disaster recovery'
INFO:src.rag_graph.label_detector:Detected labels: {'restore', 'backup', 'disaster recovery'}
INFO:src.rag_graph.label_detector:Extracted context: {'labels': ['restore', 'backup', 'disaster recovery'], 'standard': None, 'clause': None, 'query': 'Backup and disaster recovery'}
INFO:src.rag_graph.graph_retriever:Context: labels=['restore', 'backup', 'disaster recovery'], standard=None, clause=None
INFO:src.rag_graph.graph_retriever:Retrieved 0 questions


Requete: 'Backup and disaster recovery'
Resultats: 0



In [22]:
# Test 2: search whose text also names a standard
query = "Security policy and employee training ISO 27001"
results = retriever.retrieve(query, top_k=5)

print(f"Requete: '{query}'")
print(f"Resultats: {len(results)}\n")

for rank, hit in enumerate(results, start=1):
    # Content is cut to its first 100 characters for display.
    excerpt = hit['content'][:100]
    print(f"[{rank}] {excerpt}...")
    print(f"    Clause: {hit['metadata']['title']}\n")

INFO:src.rag_graph.graph_retriever:Retrieving questions for: 'Security policy and employee training ISO 27001'
INFO:src.rag_graph.label_detector:Detected labels: {'employee', 'security policy', 'training', 'policy'}
INFO:src.rag_graph.label_detector:Detected standard: iso 27001
INFO:src.rag_graph.label_detector:Extracted context: {'labels': ['employee', 'security policy', 'training', 'policy'], 'standard': 'iso_27001', 'clause': None, 'query': 'Security policy and employee training ISO 27001'}
INFO:src.rag_graph.graph_retriever:Context: labels=['employee', 'security policy', 'training', 'policy'], standard=iso_27001, clause=None
INFO:src.rag_graph.graph_retriever:Retrieved 5 questions


Requete: 'Security policy and employee training ISO 27001'
Resultats: 5

[1] Are improvement opportunities documented?...
    Clause: Clause 10 – Improvement

[2] Are competence requirements defined for each ISMS role?...
    Clause: Clause 7 – Support

[3] Are resources required for the ISMS available?...
    Clause: Clause 7 – Support

[4] Are changes reviewed and tested before deployment?...
    Clause: A.12 – Operations Security

[5] Are ISMS objectives measurable and monitored?...
    Clause: Clause 5 – Leadership



## Etape 8: Generation avec LLM

In [23]:
from src.llm.llm_interface import OllamaLLM

# Local Ollama-backed LLM used by the generation cells.
llm = OllamaLLM(
    model="mistral",
    temperature=0.3,
)
print("LLM Ollama initialise avec Mistral")

INFO:src.llm.llm_interface:Initialized Ollama LLM with model: mistral


LLM Ollama initialise avec Mistral


In [24]:
# Generate a questionnaire
query = "Backup procedures and disaster recovery"

# Retrieve the supporting questions through the graph
relevant_docs = retriever.retrieve(query, top_k=8)

print(f"Documents recuperes: {len(relevant_docs)}")

# With no retrieved document the LLM receives no graph context, so flag the
# output as ungrounded instead of presenting it silently. Generation still runs.
if not relevant_docs:
    print("Attention: aucun document recupere, le questionnaire sera genere sans contexte du graphe")

# Generate the answer from the query and the retrieved documents
response = llm.generate_with_context(
    query=f"Create an audit questionnaire about: {query}",
    context_docs=relevant_docs
)

print("\n" + "="*80)
print("QUESTIONNAIRE GENERE")
print("="*80)
print(f"\nRequete: {query}\n")
print(response)
print("\n" + "="*80)

INFO:src.rag_graph.graph_retriever:Retrieving questions for: 'Backup procedures and disaster recovery'
INFO:src.rag_graph.label_detector:Detected labels: {'restore', 'backup', 'disaster recovery', 'policy'}
INFO:src.rag_graph.label_detector:Extracted context: {'labels': ['restore', 'backup', 'disaster recovery', 'policy'], 'standard': None, 'clause': None, 'query': 'Backup procedures and disaster recovery'}
INFO:src.rag_graph.graph_retriever:Context: labels=['restore', 'backup', 'disaster recovery', 'policy'], standard=None, clause=None
INFO:src.rag_graph.graph_retriever:Retrieved 8 questions


Documents recuperes: 8


INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"



QUESTIONNAIRE GENERE

Requete: Backup procedures and disaster recovery

 Title: ISO-compliant Audit Questionnaire on Backup Procedures and Disaster Recovery

Standard: iso_27001, iso_27002

[Question 1]
Clause: Clause 6.1.3 - Information Security Policy
Question: Is there a documented policy for backup procedures and disaster recovery?
Labels: backup procedures, disaster recovery, information security policy, documentation, governance, risk management, business continuity, protection, security controls, documentation

[Question 2]
Clause: Clause 6.1.4 - Information Security Awareness and Training
Question: Are staff trained on backup procedures and disaster recovery processes?
Labels: training, awareness, staff, backup procedures, disaster recovery, risk management, governance, policy compliance

[Question 3]
Standard: iso_27002
Clause: 11.4 Supporting Utilities
Question: Are backup power supplies (UPS, generator) protected during a disaster?
Labels: critical systems, unauthorized acc

In [25]:
# Test 2: another topic
query2 = "Security policy and access control ISO 27001"

relevant_docs = retriever.retrieve(query2, top_k=8)

# With no retrieved document the LLM receives no graph context, so flag the
# output as ungrounded instead of presenting it silently. Generation still runs.
if not relevant_docs:
    print("Attention: aucun document recupere, le questionnaire sera genere sans contexte du graphe")

response = llm.generate_with_context(
    query=f"Create a detailed audit questionnaire about: {query2}",
    context_docs=relevant_docs
)

print("\n" + "="*80)
print("TEST 2")
print("="*80)
print(f"\nRequete: {query2}\n")
print(response)
print("\n" + "="*80)

INFO:src.rag_graph.graph_retriever:Retrieving questions for: 'Security policy and access control ISO 27001'
INFO:src.rag_graph.label_detector:Detected labels: {'security policy', 'access control', 'policy'}
INFO:src.rag_graph.label_detector:Detected standard: iso 27001
INFO:src.rag_graph.label_detector:Extracted context: {'labels': ['security policy', 'access control', 'policy'], 'standard': 'iso_27001', 'clause': None, 'query': 'Security policy and access control ISO 27001'}
INFO:src.rag_graph.graph_retriever:Context: labels=['security policy', 'access control', 'policy'], standard=iso_27001, clause=None
INFO:src.rag_graph.graph_retriever:Retrieved 8 questions
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"



TEST 2

Requete: Security policy and access control ISO 27001

 Title: Detailed Audit Questionnaire - Security Policy and Access Control (ISO 27001)

Standard: ISO 27001

1. Clause 4 – Risk Assessment
   - Are security risks associated with the organization's information assets identified, assessed, and evaluated?
   - Are risk treatment measures defined for identified risks?

2. Clause 5 – Leadership
   - Are ISMS objectives related to security policy and access control measurable and monitored?
   - Is management commitment to the ISMS demonstrated through documented policies and procedures?

3. Clause 6 – Planning
   - Are ISMS objectives related to security policy and access control established, implemented, and maintained?
   - Are ISMS objectives regularly reviewed and updated as necessary?

4. Clause 7 – Support
   - Are competence requirements defined for personnel involved in the management of security policy and access control?
   - Are resources required for the effective i

## Etape 9: Questions liees (Multi-hop)

On explore les relations entre questions.

In [26]:
# Look up the questions the retriever reports as related to a given question id
question_id = "doc_001"

related = retriever.get_related_questions(question_id=question_id, top_k=5)

print(f"Questions liees a '{question_id}':\n")
for rank, hit in enumerate(related, start=1):
    # Content is cut to its first 80 characters for display.
    excerpt = hit['content'][:80]
    print(f"[{rank}] {excerpt}...")
    print(f"    Standard: {hit['metadata']['iso_standard']}\n")

Questions liees a 'doc_001':

[1] Have you identified external issues that affect your ISMS?...
    Standard: iso_27001

[2] Has the organization identified ISMS-related risks?...
    Standard: iso_27001



## Statistiques finales

In [27]:
# Graph statistics
with builder.driver.session() as session:
    # Count nodes per type, then all relationships, in a single round trip.
    # OPTIONAL MATCH keeps one row alive when a node type (or the relationships)
    # is absent: with a plain MATCH, each "WITH <previous counts>, count(...)"
    # step returns no row at all in that case, result.single() is then None
    # and the prints below fail. count() over the resulting null yields 0.
    result = session.run("""
        OPTIONAL MATCH (q:Question) WITH count(q) as questions
        OPTIONAL MATCH (l:Label) WITH questions, count(l) as labels
        OPTIONAL MATCH (s:Standard) WITH questions, labels, count(s) as standards
        OPTIONAL MATCH (c:Clause) WITH questions, labels, standards, count(c) as clauses
        OPTIONAL MATCH ()-[r]->() WITH questions, labels, standards, clauses, count(r) as rels
        RETURN questions, labels, standards, clauses, rels
    """)
    stats = result.single()

print("STATISTIQUES FINALES")
print("="*50)
print(f"Questions: {stats['questions']}")
print(f"Labels: {stats['labels']}")
print(f"Standards: {stats['standards']}")
print(f"Clauses: {stats['clauses']}")
print(f"Relations: {stats['rels']}")

# Close the connections held by the retriever and the builder
retriever.close()
builder.close()
print("\nConnexions fermees")

INFO:src.rag_graph.graph_builder:Closed Neo4j connection


STATISTIQUES FINALES
Questions: 1050
Labels: 580
Standards: 5
Clauses: 200
Relations: 6556

Connexions fermees


## Conclusion

La methode knowledge graph offre:
- Recherche relationnelle puissante
- Navigation multi-hop dans le graphe
- Requetes complexes avec plusieurs criteres
- Exploration des relations entre concepts

**Temps d'execution:** ~30-40 secondes apres construction du graphe

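**Avantages vs methode vectorielle:**
- Meilleure pour les requetes complexes
- Comprend les relations entre concepts
- Permet la navigation contextuelle

**Limite observee:** la recherche semble dependre d'une correspondance exacte entre les labels detectes et les labels du graphe (par exemple `backup` detecte, `backups` dans le graphe); dans ce notebook, la requete "Backup and disaster recovery" de l'etape 7 ne renvoie aucun resultat.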