In [6]:
# imports
import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_chroma import Chroma

# environment variables
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = "..."
os.environ["LANGCHAIN_PROJECT"] = "..."

# llm
# LLM = ChatOllama(base_url="...", model="llama3:70b", num_thread=96)
LLM = ChatOllama(model="llama3b")

In [7]:
all_splits = []
directory_path = "documents"

for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)
    
    if os.path.isfile(file_path):
        try:
            print(f"=== loading document: {file_path} ===")
            # 1. Load document
            loader = PyMuPDFLoader(file_path, extract_images=True)
            docs = loader.load()

            # 2. Split document
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
            all_splits.extend(text_splitter.split_documents(docs))

        except Exception as e:
            print(f"Error processing file {file_path}: {e}")

=== loading document: documents/eternalblue3.pdf ===
=== loading document: documents/eternalblue7.pdf ===
=== loading document: documents/eternalblue4.pdf ===
=== loading document: documents/eternalblue1.pdf ===
=== loading document: documents/eternalblue5.pdf ===
=== loading document: documents/eternalblue2.pdf ===
=== loading document: documents/eternalblue6.pdf ===


In [8]:
# 3. store documents
print("=== storing documents ===")
# oembed = OllamaEmbeddings(base_url="...", model="nomic-embed-text", num_thread=96)
oembed = OllamaEmbeddings(model="nomic-embed-text")
vectorstore = Chroma.from_documents(documents=all_splits, embedding=oembed)

# 4. retrieve documents
retriever = vectorstore.as_retriever(search_type="similarity")

=== storing documents ===


In [9]:
system_prompt = (
    "You are a helpful assistant. Answer my questions to your best ability and keep the responses to 500 words"
    "\n\n"
    "{context}"
)

final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

# 7. RAG chain
question_answer_chain = create_stuff_documents_chain(LLM, final_prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [10]:
print("=== generating output ===")
results = rag_chain.invoke({"input": "What is eternalblue"})
print(results['answer'])

=== generating output ===
EternalBlue is a hacking exploit developed by the National Security Agency (NSA) in the United States. It was leaked online in April 2017 by a group of hackers known as The Shadow Brokers.

EternalBlue is a remote access exploit that targets a vulnerability in Windows operating systems, specifically in the SMBv1 (Server Message Block version 1) protocol. This vulnerability, known as MS17-010, was patched by Microsoft in March 2017.

The EternalBlue exploit allows attackers to gain unauthorized access to Windows computers without needing to trick users into opening malicious emails or clicking on links. It can be used to spread malware, ransomware, and other types of cyber threats.

EternalBlue gained notoriety in May 2017 when it was used as part of the WannaCry ransomware attack, which affected over 200,000 computers worldwide. The exploit was also used in the NotPetya malware outbreak later that year.

The EternalBlue exploit has been widely attributed to th