In [None]:
# !pip install langchain ollama langchain-ollama faiss-cpu pypdf python-docx markdown beautifulsoup4 pandas

In [1]:
import os
import json
import faiss
import numpy as np
# from langchain.embeddings import OllamaEmbeddings
from langchain_ollama import OllamaEmbeddings 
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
from bs4 import BeautifulSoup
from docx import Document as DocxDocument

In [2]:
# Embedding model shared by the whole notebook: it is passed to FAISS both when
# an index is created/extended and when a saved index is loaded.
# NOTE(review): presumably an index already saved under DB_PATH must be rebuilt
# if this model is changed, since its vectors came from this model — confirm.
embedding_model = OllamaEmbeddings(model="nomic-embed-text")  # Change to preferred model

In [48]:
# Sanity check: embed a short string and display the length of the resulting
# vector (768 in the recorded run).
sample_text = "This is a test embedding"
len(embedding_model.embed_query(sample_text))

768

In [3]:
# Directory (relative path) where the FAISS index is persisted: add_to_db saves
# to it and create_or_load_db loads from it.
DB_PATH = "vector_db"

In [4]:
def process_text(text, source, chunk_size=500, chunk_overlap=100):
    """Split raw text into overlapping chunks wrapped as LangChain Documents.

    Args:
        text: The full text to split.
        source: Identifier stored under the "source" metadata key of every
            chunk (the loaders in this notebook pass the file path).
        chunk_size: Forwarded to RecursiveCharacterTextSplitter; defaults to
            the previously hard-coded 500.
        chunk_overlap: Forwarded to RecursiveCharacterTextSplitter; defaults to
            the previously hard-coded 100.

    Returns:
        A list of Document objects, one per chunk, in splitter order.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_text(text)

    # Each chunk gets its own metadata dict so later per-document mutation
    # (e.g. adding a hash) cannot leak between chunks.
    return [Document(page_content=chunk, metadata={"source": source}) for chunk in chunks]

In [5]:
def load_pdf(file_path):
    """Read a PDF and return its text as chunked Documents.

    The page texts produced by PyPDFLoader are joined with newlines into one
    string, which is then chunked by process_text with the file path as source.
    """
    page_texts = []
    for page in PyPDFLoader(file_path).load():
        page_texts.append(page.page_content)

    return process_text("\n".join(page_texts), file_path)

In [6]:
def load_docx(file_path):
    """Read a DOCX file (paragraphs, then tables) and return chunked Documents.

    Non-empty paragraphs come first, one line each. Every table row follows as
    one line with its non-empty cells joined by " | ". Returns an empty list,
    after printing a warning, when no text could be extracted.
    """
    document = DocxDocument(file_path)

    # Body paragraphs, stripped, blank ones dropped.
    lines = [para.text.strip() for para in document.paragraphs if para.text.strip()]

    # Table rows rendered as "cell | cell | ..." strings.
    for table in document.tables:
        for row in table.rows:
            cells = [cell.text.strip() for cell in row.cells]
            filled = [value for value in cells if value]
            if filled:
                lines.append(" | ".join(filled))

    full_text = "\n".join(lines)

    if full_text.strip():
        return process_text(full_text, file_path)

    print("⚠️ No text extracted from the DOCX file. It might be empty or contain only images.")
    return []

In [7]:
def load_json(file_path):
    """Read a JSON file and return its pretty-printed text as chunked Documents.

    The file is parsed and re-serialised with two-space indentation so that the
    chunks contain a normalised layout, then chunked by process_text with the
    file path as source.

    Raises:
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # ensure_ascii=False keeps non-ASCII characters readable; the default would
    # turn e.g. "é" into "\u00e9", and that escape sequence is what would be
    # embedded and searched instead of the real text.
    text = json.dumps(data, indent=2, ensure_ascii=False)
    return process_text(text, file_path)

In [8]:
def load_md(file_path):
    """Read a Markdown file as UTF-8 text and return it as chunked Documents.

    The raw Markdown source (markup included) is handed to process_text with
    the file path as source.
    """
    with open(file_path, encoding="utf-8") as handle:
        markdown_source = handle.read()

    return process_text(markdown_source, file_path)

In [9]:
def load_html(file_path):
    """Read an HTML file and return its extracted text as chunked Documents.

    The file is parsed with BeautifulSoup's "html.parser" backend; the result of
    get_text() is chunked by process_text with the file path as source.
    """
    with open(file_path, encoding="utf-8") as handle:
        parsed = BeautifulSoup(handle, "html.parser")

    return process_text(parsed.get_text(), file_path)

In [17]:
def create_or_load_db():
    """Load the FAISS vector store saved under DB_PATH, if one exists.

    Despite the name, this function never creates a store: it returns None when
    nothing has been saved yet, and add_to_db builds the first index in that case.

    Returns:
        The loaded FAISS store, or None when DB_PATH does not contain a saved
        index (missing directory, empty directory, or incomplete save).
    """
    # save_local writes "index.faiss" and "index.pkl" (default index name).
    # Checking for both avoids a load failure on an empty or partial directory.
    index_file = os.path.join(DB_PATH, "index.faiss")
    docstore_file = os.path.join(DB_PATH, "index.pkl")
    if not (os.path.isfile(index_file) and os.path.isfile(docstore_file)):
        return None

    # SECURITY: allow_dangerous_deserialization=True lets load_local unpickle
    # the saved docstore, and unpickling can execute arbitrary code. This is
    # NOT a "safe" mode — only load a DB_PATH directory this notebook wrote.
    return FAISS.load_local(DB_PATH, embedding_model, allow_dangerous_deserialization=True)

In [40]:
import hashlib

def compute_document_hash(text):
    """Return the hex MD5 digest of `text` (UTF-8 encoded).

    The digest serves as a content fingerprint for duplicate detection, not as
    a security measure.
    """
    digest = hashlib.md5()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()

def add_to_db(docs):
    """Add chunks to the FAISS store, skipping any whose content is already stored.

    Every chunk is fingerprinted with compute_document_hash over its
    page_content, and the fingerprint is saved under the "hash" metadata key.
    A chunk is skipped when its fingerprint matches a stored chunk or an
    earlier chunk in the same `docs` batch.

    Args:
        docs: List of Document objects to index. Their metadata dicts are
            updated in place with the "hash" key.

    Returns:
        None. Progress is reported via print.
    """
    if not docs:
        print("⚠️ No documents to add to the database.")
        return

    db = create_or_load_db()

    if db is None:
        known_hashes = set()
    else:
        # Stored chunks without a "hash" (written before hashes were recorded
        # for the first batch) are fingerprinted from their content so they
        # still take part in duplicate detection.
        known_hashes = {
            stored.metadata.get("hash") or compute_document_hash(stored.page_content)
            for stored in db.docstore._dict.values()
        }

    new_docs = []
    for doc in docs:
        doc_hash = compute_document_hash(doc.page_content)
        if doc_hash in known_hashes:
            continue
        known_hashes.add(doc_hash)  # also filters repeats inside this batch
        doc.metadata["hash"] = doc_hash
        new_docs.append(doc)

    if not new_docs:
        print("ℹ️ No new documents to add.")
        return

    if db is None:
        print("ℹ️ No existing database found. Creating a new one.")
        # The first batch is hashed too, so re-indexing the same file later is
        # recognised as a duplicate.
        db = FAISS.from_documents(new_docs, embedding_model)
    else:
        db.add_documents(new_docs)

    db.save_local(DB_PATH)
    print(f"✅ Indexed {len(new_docs)} new chunks.")

In [12]:
def query_db(query, k=3):
    """Run a similarity search against the saved store and print each hit.

    Args:
        query: Text to search for.
        k: Maximum number of chunks to retrieve.

    Returns:
        The list of matching Document objects; an empty list (after printing a
        warning) when no database has been saved yet.
    """
    db = create_or_load_db()

    # create_or_load_db returns None when nothing has been indexed yet; without
    # this guard the search below would raise AttributeError.
    if db is None:
        print("⚠️ No database found. Please add documents first.")
        return []

    results = db.similarity_search(query, k=k)

    for i, doc in enumerate(results, start=1):
        # .get() so a chunk stored without "source" metadata cannot abort the listing.
        source = doc.metadata.get("source", "Unknown")
        print(f"\n🔹 Result {i} (Source: {source}):\n{doc.page_content}")

    return results

In [13]:
def process_and_store(file_path):
    """Load a file with the loader matching its extension and index its chunks.

    Supported extensions (case-insensitive): pdf, docx, json, md/markdown,
    html/htm. Anything else prints an error and returns without indexing.

    Args:
        file_path: Path of the file to index.

    Returns:
        None. Progress is reported via print.
    """
    # splitext only yields a real extension, so an extensionless file that
    # happens to be called "md" or "pdf" is no longer mistaken for that type.
    ext = os.path.splitext(file_path)[1][1:].lower()

    if ext == "pdf":
        docs = load_pdf(file_path)
    elif ext == "docx":
        docs = load_docx(file_path)
    elif ext == "json":
        docs = load_json(file_path)
    elif ext in ("md", "markdown"):
        docs = load_md(file_path)
    elif ext in ("html", "htm"):
        docs = load_html(file_path)
    else:
        print("❌ Unsupported file format!")
        return

    add_to_db(docs)

    # add_to_db may skip some or all chunks as duplicates and reports the number
    # actually indexed itself, so this line only states how many chunks the file
    # produced — and stays silent when it produced none.
    if docs:
        print(f"✅ Processed {len(docs)} chunks from {file_path}")

In [14]:
def retrieve_relevant_paragraphs(query, k=3):
    """Build a context string from the `k` chunks most similar to `query`.

    Returns:
        One "Source: <source>\\n<chunk text>" section per hit, separated by
        blank lines. When there is no saved database, or the search returns
        nothing, a warning message starting with "⚠️" is returned instead.
    """
    store = create_or_load_db()
    if store is None:
        return "⚠️ No database found. Please add documents first."

    hits = store.similarity_search(query, k=k)
    if not hits:
        return "⚠️ No relevant results found in the database."

    sections = []
    for hit in hits:
        sections.append(f"Source: {hit.metadata['source']}\n{hit.page_content}")

    return "\n\n".join(sections)

In [15]:
def chat_with_rag(query, k=3):
    """Answer `query` with the module-level `llm`, grounded in retrieved chunks.

    Args:
        query: The user's question.
        k: Number of chunks to retrieve as context (default 3, the value
            retrieve_relevant_paragraphs used implicitly before).

    Returns:
        The model's response, or the warning message from
        retrieve_relevant_paragraphs when no context could be retrieved.
    """
    context = retrieve_relevant_paragraphs(query, k=k)

    # Warning messages start with "⚠️", whereas real context starts with
    # "Source: ". Testing the prefix rather than "⚠️" anywhere in the text
    # keeps a chunk that merely contains the emoji from being returned raw.
    if context.startswith("⚠️"):
        return context

    prompt = f"""You are a helpful assistant using retrieval-augmented generation (RAG). Answer the following question based on the retrieved documents.

    Context:
    {context}

    Question: {query}

    Answer:"""

    response = llm.invoke(prompt)

    return response

In [22]:
# OllamaLLM from langchain_ollama (the package the embeddings already come from)
# replaces the deprecated langchain_community.llms.Ollama class.
from langchain_ollama import OllamaLLM

# Text-generation model used by chat_with_rag via llm.invoke(prompt).
# The tag "llama3" selects Llama 3; switch the tag (e.g. "llama3.2") if
# Llama 3.2 is the intended model.
llm = OllamaLLM(model="llama3")

Example:

In [23]:
# Index the sample DOCX file: load it, chunk it, and add the chunks to the store.
process_and_store("ISACA Risk interview responses.docx")

✅ Successfully indexed 17 chunks from ISACA Risk interview responses.docx


In [24]:
# Ask a question over the indexed content; as the cell's last expression, the
# returned answer is displayed as the cell output.
chat_with_rag("What is the main topic of the document?")

'According to the provided document, "ISACA Risk interview responses. docx", the main topic of the document is the upcoming presentation at the ISACA Risk Event titled "The Double-Edged Sword: Risks of AI Language Models in Cybersecurity".'

In [26]:
# Follow-up question; printed so that line breaks in the answer are rendered.
query = "Who will give the presentation?"
print(chat_with_rag(query))

Based on the retrieved documents (ISACA Risk interview responses), I found that the answer to your question is:

The presentation will be given by ISACA.

Additionally, according to the retrieved documents, some practical steps businesses can take to prevent becoming too dependent on AI and complement AI solutions with human expertise are:

1. Establish clear governance and oversight: Define roles and responsibilities for AI deployment and ensure accountability.
2. Develop diverse AI training data: Incorporate a wide range of scenarios, cases, and perspectives to reduce biases and improve AI decision-making.
3. Implement transparent AI decision-making processes: Ensure AI-driven decisions are explainable, auditable, and subject to human review.
4. Foster collaboration between humans and AI: Encourage joint decision-making and continuous learning from each other's strengths and limitations.
5. Continuously monitor and evaluate AI performance: Regularly assess AI performance, identify bi

# ↑ Wrong: the answer above is incorrect, and it also appends material unrelated to the question.

In [41]:
# Index a Markdown file (northwave_site.md).
process_and_store("northwave_site.md")

ℹ️ No new documents to add.
✅ Successfully indexed 48 chunks from northwave_site.md


In [42]:
# Question about the services described in the indexed Northwave content.
query = "From the Northwave site, what are the services provided by northwave?"
print(chat_with_rag(query))

Based on the retrieved documents, I found that the Northwave site offers Cyber Security Services. The specific services are not explicitly listed in the provided context, but you can visit their website at https://northwave-cybersecurity.com/cyber-security-services to learn more about what they have to offer.


In [44]:
# Question about Northwave's address.
query = "What is the address of Northwave?"
print(chat_with_rag(query))

Based on the retrieved documents, I found that the address of Northwave can be found at the following link:

https://northwave-cybersecurity.com/cyber-security-services?hsLang=en

This is where you can find their Cyber Security Services page.


In [45]:
# Question about Northwave's location.
# NOTE(review): the query text is ungrammatical ("Where is Northwave located?"
# was presumably intended); it is left unchanged so it still matches the
# recorded output.
query = "What is Northwave is located?"
print(chat_with_rag(query))

Based on the retrieved documents, I found that there is no explicit information about the location of Northwave in the provided sources (northwave_ site.md). However, it's possible to infer from the links provided that Northwave may be a cyber security company with an online presence. If you're looking for more specific information about their location, I recommend checking out other credible sources or reaching out to them directly for clarification.


In [46]:
# Question about Northwave's phone number.
query = "What is Northwave's phone number?"
print(chat_with_rag(query))

Based on the retrieved documents, Northwave's phone number is +31 (0)30 303 1240.


In [47]:
# Question about the country Northwave is located in.
query = "In what country Northwave is located?"
print(chat_with_rag(query))

Based on the retrieved documents, I found that Northwave Cybersecurity has multiple language options for their website, including English, Deutsch (German), Nederlands (Dutch), and Svenska (Swedish). However, there is no explicit mention of the country where Northwave is located.

However, considering the language options provided, it's likely that Northwave is a company based in Europe or has strong ties to European countries.


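# ↑ Wrong again: the phone-number query shows the site content is indexed, yet the address/country questions go unanswered — possibly a retrieval miss (chunking or top-k); needs investigation.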

# ===================

In [43]:
def count_db_entries():
    """Summarise the saved FAISS store as a two-line human-readable string.

    "Total Chunks" is the number of vectors in the index. "Total Documents" is
    the number of distinct "source" metadata values across stored chunks.
    Returns a warning message instead when no database has been saved yet.
    """
    store = create_or_load_db()
    if store is None:
        return "⚠️ No database found. Please add documents first."

    # One vector is stored per chunk.
    chunk_total = store.index.ntotal

    # Distinct sources; chunks lacking a "source" key all count as one (None).
    distinct_sources = {entry.metadata.get("source") for entry in store.docstore._dict.values()}
    document_total = len(distinct_sources)

    return f"📄 Total Documents: {document_total}\n🔹 Total Chunks: {chunk_total}"

# Report how many distinct sources and how many chunks the saved index holds.
print(count_db_entries())

📄 Total Documents: 2
🔹 Total Chunks: 243
