In [1]:
import os
import sys
from pathlib import Path

# Root directory that contains the `src` package imported by the cells below.
# Defaults to the original development checkout; set the PRJT_VAP_ROOT
# environment variable to run the notebook elsewhere without editing this cell.
project_root = Path(os.environ.get("PRJT_VAP_ROOT", "/home/khaldoun/prjt_vap"))

# Put the root on the import path. The membership check keeps repeated runs of
# this cell from stacking duplicate entries in sys.path.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(f"Project root: {project_root}")
print("Python path configured")

Project root: /home/khaldoun/prjt_vap
Python path configured


## Étape 1: Chargement des données

On charge les 1 050 questions ISO depuis les fichiers CSV.

In [2]:
from src.utils.data_loader import ISODataLoader

# Load the method-1 dataset through the project's loader.
# `data` is used below like a pandas DataFrame (.iloc, column access) — TODO confirm type.
loader = ISODataLoader()
data = loader.load_method_data(method=1)

# Report the number of rows, the distinct standards, then the first question text.
print(f"Nombre de questions chargees: {len(data)}")
standards = data['iso_standard'].unique().tolist()
print(f"Standards: {standards}")
print(f"\nPremiere question:")
first_row = data.iloc[0]
print(first_row['text'])

INFO:src.utils.data_loader:Loaded 200 records from labeled_our_iso_27001.csv
INFO:src.utils.data_loader:Loaded 250 records from labeled_our_iso_27002.csv
INFO:src.utils.data_loader:Loaded 200 records from labeled_our_iso_27017.csv
INFO:src.utils.data_loader:Loaded 200 records from labeled_our_iso_27018.csv
INFO:src.utils.data_loader:Loaded 200 records from labeled_our_iso_27701.csv
INFO:src.utils.data_loader:Loaded total of 1050 records from method 1


Nombre de questions chargees: 1050
Standards: ['iso_27001', 'iso_27002', 'iso_27017', 'iso_27018', 'iso_27701']

Premiere question:
Have you identified internal issues that affect your ISMS?


## Étape 2: Génération des embeddings

On utilise le modèle all-MiniLM-L6-v2 pour générer des embeddings de 384 dimensions.

In [3]:
from src.rag_traditional.embeddings import EmbeddingGenerator

# Create the project's embedding generator for the chosen model name.
embedding_gen = EmbeddingGenerator(model_name="all-MiniLM-L6-v2")

# Echo the model name and the vector size reported by the generator.
model_label = embedding_gen.model_name
print(f"Modele: {model_label}")
embedding_dim = embedding_gen.get_embedding_dimension()
print(f"Dimension: {embedding_dim}")

INFO:src.rag_traditional.embeddings:Loading embedding model: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:httpx:HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json "HTTP/1.1 307 Temporary Redirect"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/sentence-transformers/all-MiniLM-L6-v2/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/modules.json "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json "HTTP/1.1 307 Temporary Redirect"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/sentence-transformers/all-MiniLM-L6-v2/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config_sentence_transformers.json "HTTP/1.1 200 OK"
INFO:httpx:HTTP Reque

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
INFO:httpx:HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/config.json "HTTP/1.1 307 Temporary Redirect"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/sentence-transformers/all-MiniLM-L6-v2/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config.json "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/tokenizer_config.json "HTTP/1.1 307 Temporary Redirect"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/sentence-transformers/all-MiniLM-L6-v2/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/tokenizer_config.json "HT

Modele: all-MiniLM-L6-v2
Dimension: 384


In [4]:
# Sanity check on a few sample questions
test_questions = [
    "Are backup procedures documented?",
    "Is there a disaster recovery plan?",
    "Have you identified internal issues?",
]

embeddings = embedding_gen.embed_batch(test_questions, show_progress=True)
print(f"\nShape des embeddings: {embeddings.shape}")

# Similarity of the first question against each of the other two
sim1 = embedding_gen.compute_similarity(
    embeddings[0],
    embeddings[1],
)
sim2 = embedding_gen.compute_similarity(
    embeddings[0],
    embeddings[2],
)

print(f"\nSimilarite 'backup' vs 'disaster recovery': {sim1:.4f}")
print(f"Similarite 'backup' vs 'internal issues': {sim2:.4f}")

INFO:src.rag_traditional.embeddings:Generating embeddings for 3 texts


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:src.rag_traditional.embeddings:Generated embeddings with shape: (3, 384)



Shape des embeddings: (3, 384)

Similarite 'backup' vs 'disaster recovery': 0.3078
Similarite 'backup' vs 'internal issues': 0.0757


## Étape 3: Création du vector store

On stocke les embeddings dans ChromaDB pour la recherche.

In [13]:
from src.rag_traditional.vector_store import VectorStore

# Derive the persistence directory from `project_root` (defined in the first
# cell) instead of repeating the absolute path, so the location is set in one place.
vector_store = VectorStore(
    persist_directory=str(project_root / "chroma_db"),
    collection_name="iso_questions_method_1"
)

# reset=False — presumably reuses an already persisted collection rather than
# rebuilding it; the next cell only indexes when the document count is 0.
vector_store.create_collection(reset=False)
stats = vector_store.get_collection_stats()

print(f"Collection: {stats['collection_name']}")
print(f"Documents: {stats['document_count']}")

INFO:src.rag_traditional.vector_store:Vector store initialized at /home/khaldoun/prjt_vap/chroma_db
INFO:src.rag_traditional.vector_store:Collection 'iso_questions_method_1' ready


Collection: iso_questions_method_1
Documents: 1050


In [14]:
# Index the documents only when the collection is still empty
if stats['document_count'] != 0:
    print("Collection deja indexee")
else:
    print("Indexation des documents...")
    documents = loader.get_documents_for_rag(method=1)

    # Embed the 'content' field of every document in a single batch
    texts = [entry['content'] for entry in documents]
    embeddings = embedding_gen.embed_batch(texts, show_progress=True)

    # Attach each embedding to the document at the same position
    for position, doc in enumerate(documents):
        doc['embedding'] = embeddings[position]

    # Hand the enriched documents to the vector store
    vector_store.add_documents(documents)
    print(f"\nIndexation terminee: {len(documents)} documents")

Collection deja indexee


## Étape 4: Recherche sémantique

On teste la recherche par similarité.

In [15]:
from src.rag_traditional.retriever import SemanticRetriever

# Build the retriever from the vector store and the embedding generator
# created in the earlier cells.
retriever_kwargs = {
    "vector_store": vector_store,
    "embedding_generator": embedding_gen,
}
retriever = SemanticRetriever(**retriever_kwargs)

print("Retriever initialise")

INFO:src.rag_traditional.retriever:Semantic retriever initialized


Retriever initialise


In [16]:
# Test 1: plain retrieval without any filter
query = "Questions about data backups and recovery"
results = retriever.retrieve(query, top_k=5)

print(f"Requete: '{query}'")
print(f"Resultats trouves: {len(results)}\n")

# One entry per hit: rank and score, the question text, then its ISO standard
for i, doc in enumerate(results, start=1):
    score = doc['score']
    print(f"[{i}] Score: {score:.4f}")
    content = doc['content']
    print(f"    {content}")
    standard = doc['metadata']['iso_standard']
    print(f"    Standard: {standard}")
    print()

INFO:src.rag_traditional.retriever:Retrieving documents for query: 'Questions about data backups and recovery'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:src.rag_traditional.retriever:Retrieved 5 documents


Requete: 'Questions about data backups and recovery'
Resultats trouves: 5

[1] Score: 0.7733
    Are backups of deleted data handled appropriately?
    Standard: iso_27017

[2] Score: 0.7067
    Are backups performed according to policy?
    Standard: iso_27002

[3] Score: 0.7054
    Are backups tested for restoration integrity?
    Standard: iso_27002

[4] Score: 0.6856
    Are backups performed according to schedule?
    Standard: iso_27001

[5] Score: 0.6814
    Are cloud recovery procedures documented?
    Standard: iso_27017



In [18]:
# Test 2: retrieval restricted to a single standard
query2 = "Security policy and risk management"
results2 = retriever.retrieve(
    query=query2,
    filter_standard="iso_27001",
    top_k=5
)

print(f"Requete: '{query2}' (filtre: ISO 27001)")
print(f"Resultats: {len(results2)}\n")

# Maximum number of characters of each question shown in the listing
PREVIEW_CHARS = 80

for i, doc in enumerate(results2, 1):
    content = doc['content']
    # Append the ellipsis only when the text is actually cut, so complete
    # questions are not displayed as if they had been truncated.
    if len(content) > PREVIEW_CHARS:
        preview = content[:PREVIEW_CHARS] + "..."
    else:
        preview = content
    print(f"[{i}] {preview}")
    # The value printed under the "Clause" label is the 'title' metadata field.
    print(f"    Clause: {doc['metadata']['title']}")
    print()

INFO:src.rag_traditional.retriever:Retrieving documents for query: 'Security policy and risk management'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:src.rag_traditional.retriever:Retrieved 5 documents


Requete: 'Security policy and risk management' (filtre: ISO 27001)
Resultats: 5

[1] Are information security policies approved by management?...
    Clause: A.5 – Information Security Policies

[2] Are employees aware of security policies?...
    Clause: Clause 7 – Support

[3] Are information security policies communicated to all staff?...
    Clause: A.5 – Information Security Policies

[4] Are information security policies documented and maintained?...
    Clause: A.5 – Information Security Policies

[5] Is there a documented information security policy?...
    Clause: Clause 5 – Leadership



## Étape 5: Génération avec LLM

On utilise Ollama avec Mistral pour générer des réponses.

In [7]:
from src.llm.llm_interface import LLMFactory

# Settings forwarded to the project's LLM factory: provider, model name and
# sampling temperature.
llm_settings = {
    "provider": "ollama",
    "model": "mistral",
    "temperature": 0.3,
}
llm = LLMFactory.create_llm(**llm_settings)

print("LLM Ollama initialise avec Mistral")

INFO:src.llm.llm_interface:Initialized Ollama LLM with model: mistral


LLM Ollama initialise avec Mistral


In [8]:
# Smoke test: send one short prompt and print whatever the model returns
test_prompt = "Dis bonjour en francais"
test_response = llm.generate(test_prompt)
print(f"Test LLM: {test_response}")

INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


Test LLM:  Bonjour signifie "hello" en français.


## Étape 6: Système RAG complet

On teste le système complet de bout en bout.

In [9]:
from src.rag_traditional.query_handler import ISORAGSystem

# End-to-end system configured with the same data method, embedding model and
# LLM as the step-by-step cells above.
# rebuild_index=False — presumably reuses the existing index; TODO confirm.
rag_config = dict(
    data_method=1,
    embedding_model="all-MiniLM-L6-v2",
    llm_provider="ollama",
    llm_model="mistral",
    rebuild_index=False,
)
rag_system = ISORAGSystem(**rag_config)

print("Systeme RAG complet initialise")

INFO:src.rag_traditional.query_handler:Initializing ISO RAG System...
INFO:src.rag_traditional.query_handler:Loading data...
INFO:src.rag_traditional.query_handler:Initializing embedding model...
INFO:src.rag_traditional.embeddings:Loading embedding model: all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:httpx:HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json "HTTP/1.1 307 Temporary Redirect"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/sentence-transformers/all-MiniLM-L6-v2/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/modules.json "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json "HTTP/1.1 307 Temporary Redirect"
INFO:httpx:HTTP Request: HEAD https

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
INFO:httpx:HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/config.json "HTTP/1.1 307 Temporary Redirect"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/sentence-transformers/all-MiniLM-L6-v2/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/config.json "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/tokenizer_config.json "HTTP/1.1 307 Temporary Redirect"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/sentence-transformers/all-MiniLM-L6-v2/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/tokenizer_config.json "HT

Systeme RAG complet initialise


In [19]:
# Questionnaire generation test
query = "Backups and disaster recovery procedures"

result = rag_system.query(user_query=query, top_k=5)

# Framed report: the query, the number of retrieved sources, then the answer
separator = "="*80
print(separator)
print("RESULTAT GENERATION RAG")
print(separator)
print(f"\nRequete: {query}\n")
num_sources = result['num_sources']
print(f"Documents recuperes: {num_sources}")
print(f"\nReponse generee:\n")
answer = result['answer']
print(answer)
print("\n" + separator)

INFO:src.rag_traditional.query_handler:Processing query: 'Backups and disaster recovery procedures'
INFO:src.rag_traditional.retriever:Retrieving documents for query: 'Backups and disaster recovery procedures'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:src.rag_traditional.retriever:Retrieved 5 documents
INFO:src.rag_traditional.query_handler:Retrieved 5 relevant documents
INFO:src.rag_traditional.query_handler:Generating response with LLM...
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


RESULTAT GENERATION RAG

Requete: Backups and disaster recovery procedures

Documents recuperes: 5

Reponse generee:

 Title: Backup and Disaster Recovery Procedures Questionnaire (Based on ISO 27002 and ISO 27017)

1. Are disaster recovery procedures automated where possible? (ISO 27002, Clause: Extended – Continuity & Resilience, Labels: automation, procedures, documentation, updates)

2. Are backups performed according to policy? (ISO 27002, Clause: 12.3 Backup, Labels: backups, policy compliance)

3. Are backups tested for restoration integrity? (ISO 27002, Clause: 12.3 Backup, Labels: backup protection, backup integrity, resilience, backups, policy compliance)

4. Are backups of deleted data handled appropriately? (ISO 27017, Clause: 27017 – Cloud Data Deletion, Labels: cloud security, governance, compliance, risk management)

5. Is cloud Business Continuity Plan (BCP) aligned with CSP disaster recovery capabilities? (ISO 27017, Clause: 27017 – Cloud Continuity & Availability, Lab

In [21]:
# Test 2: generation on a different topic
query2 = "Security policy and employee training"

result2 = rag_system.query(user_query=query2, top_k=5)

# Framed report: the query followed by the generated answer
separator = "="*80
print(separator)
print("TEST 2")
print(separator)
print(f"\nRequete: {query2}\n")
answer2 = result2['answer']
print(answer2)
print("\n" + separator)

INFO:src.rag_traditional.query_handler:Processing query: 'Security policy and employee training'
INFO:src.rag_traditional.retriever:Retrieving documents for query: 'Security policy and employee training'


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:src.rag_traditional.retriever:Retrieved 5 documents
INFO:src.rag_traditional.query_handler:Retrieved 5 relevant documents
INFO:src.rag_traditional.query_handler:Generating response with LLM...
INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"


TEST 2

Requete: Security policy and employee training

 Title: ISO 27001 Security Policy and Employee Training Questionnaire

1. Standard: ISO_27001 (Clause A.7 – Human Resource Security)
   - Are security policies provided to new employees during onboarding?
   - Labels: employee awareness, security policies, security training

2. Standard: ISO_27001 (Clause 7 – Support)
   - Are employees aware of the organization's security policies?
   - Labels: employee awareness, security policies, security training, security responsibilities

3. Standard: ISO_27001 (Clause 7 – Support)
   - Is annual security awareness training conducted for employees?
   - Labels: documentation, monitoring, risk management, policy compliance

4. Standard: ISO_27001 (Clause 7 – Support)
   - Do employees understand their security responsibilities as outlined in the organization's policies?
   - Labels: employee awareness, security policies, security training, security responsibilities

5. Standard: ISO_27002 (C

## Statistiques finales

In [22]:
# Use a dedicated name here: `stats` already holds the vector-store statistics
# read by the indexing cell (stats['document_count']). The RAG statistics nest
# that count under 'vector_store', so rebinding `stats` would break a re-run of
# that cell.
rag_stats = rag_system.get_statistics()

print("STATISTIQUES DU SYSTEME RAG VECTORIEL")
print("="*50)
print(f"Methode de donnees: {rag_system.data_method}")
print(f"Documents indexes: {rag_stats['vector_store']['document_count']}")
print(f"Modele d'embedding: {rag_stats['embedding_model']}")
print(f"Dimension: {rag_stats['embedding_dimension']}")
# NOTE(review): the provider and model lines are literals, not values read
# from rag_system; update them by hand if the LLM configuration changes.
print("Provider LLM: Ollama")
print("Modele LLM: Mistral")

INFO:src.utils.data_loader:Loaded 200 records from labeled_our_iso_27001.csv
INFO:src.utils.data_loader:Loaded 250 records from labeled_our_iso_27002.csv
INFO:src.utils.data_loader:Loaded 200 records from labeled_our_iso_27017.csv
INFO:src.utils.data_loader:Loaded 200 records from labeled_our_iso_27018.csv
INFO:src.utils.data_loader:Loaded 200 records from labeled_our_iso_27701.csv
INFO:src.utils.data_loader:Loaded total of 1050 records from method 1


STATISTIQUES DU SYSTEME RAG VECTORIEL
Methode de donnees: 1
Documents indexes: 1050
Modele d'embedding: all-MiniLM-L6-v2
Dimension: 384
Provider LLM: Ollama
Modele LLM: Mistral


## Conclusion

La méthode vectorielle fonctionne bien pour:
- Recherche sémantique rapide
- Requêtes simples
- Bonne précision sur les similarités

**Temps d'exécution:** ~20-30 secondes après indexation initiale