## Import

In [None]:
import requests
import bs4
import re
import os
import hashlib
import tqdm
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.runnables import RunnablePassthrough
from sentence_transformers import SentenceTransformer, util

In [2]:
# ANSI escape codes for colors
PINK = '\033[95m'
CYAN = '\033[96m'
YELLOW = '\033[93m'
NEON_GREEN = '\033[92m'
RESET_COLOR = '\033[0m'

## Collect Report

In [3]:
def scrape_dfir_report(url):
    loader = WebBaseLoader(
        web_paths=(url,),
        bs_kwargs=dict(
            parse_only=bs4.SoupStrainer(
                class_=("entry-content", "entry-title", "entry-header")
            )
        ),
    )
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    dfir_content = ""
    for split in tqdm.tqdm(splits, desc="Ingestion progress"):
        dfir_content += str(split)
    dfir_content += "\n"
    return dfir_content

def scrape_hacker_news(url):
    loader = WebBaseLoader(
        web_paths=(url,),
        bs_kwargs=dict(
            parse_only=bs4.SoupStrainer(
                class_=("content section", "articlebody", "story-title")
            )
        ),
    )
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    hn_content = ""
    for split in tqdm.tqdm(splits, desc="Ingestion progress"):
        hn_content += str(split)
    hn_content += "\n"
    return hn_content

def scrape_bleeping_computer(url):
    loader = WebBaseLoader(
        web_paths=(url,),
        bs_kwargs=dict(
            parse_only=bs4.SoupStrainer(
                class_=("article_section", "articleBody")
            )
        ),
    )
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    bc_content = ""
    for split in tqdm.tqdm(splits, desc="Ingestion progress"):
        bc_content += str(split)
    bc_content += "\n"
    return bc_content

def save_to_txt(content, filename):
    with open(filename, 'a', encoding='utf-8') as file:
        file.write(content)

while True:
    option = input("Please choose an option:\n1. Scrape DFIR Report\n2. Scrape Hacker News\n3. Scrape BleepingComputer\nEnter 'quit' to complete the task: ")
    
    if option == 'quit':
        break
    
    url = input("Please enter the URL: ")
    
    if option == '1':
        # Scrape DFIR Report
        blog_content = scrape_dfir_report(url)
    elif option == '2':
        # Scrape Hacker News
        blog_content = scrape_hacker_news(url)
    elif option == '3':
        # Scrape BleepingComputer
        blog_content = scrape_bleeping_computer(url)
    else:
        print("Invalid option. Please choose either 1, 2, or 3")
        continue
    
    # Save the content to a common file
    if blog_content:
        save_to_txt(blog_content, 'blog_post.txt')
        print("Blog content appended to 'blog_post.txt'")
    else:
        print("Failed to scrape blog content!")


Ingestion progress: 100%|██████████| 4/4 [00:00<00:00, 4009.85it/s]

Blog content appended to 'blog_post.txt'





## Normalize Text

In [5]:
def upload_txtfile():
    # Normalize whitespace and clean up text
    with open("blog_post.txt", "r", encoding="utf-8") as vault_file:
        text = vault_file.read()
        text = re.sub(r'\s+', ' ', text).strip()

        # Split text into chunks by sentences, respecting a maximum chunk size
        sentences = re.split(r'(?<=[.!?]) +', text)  # split on spaces following sentence-ending punctuation
        chunks = []
        current_chunk = ""
        for sentence in tqdm.tqdm(sentences, desc="Splitting text into chunks"):
            # Check if the current sentence plus the current chunk exceeds the limit
            if len(current_chunk) + len(sentence) + 1 < 1000:  # +1 for the space
                current_chunk += (sentence + " ").strip()
            else:
                # When the chunk exceeds 1000 characters, store it and start a new one
                chunks.append(current_chunk)
                current_chunk = sentence + " "
        if current_chunk:  # Don't forget the last chunk!
            chunks.append(current_chunk)

        # Write each chunk to its own line
        with open("vault.txt", "w", encoding="utf-8") as vault_file:
            for chunk in tqdm.tqdm(chunks, desc="Writing chunks to file"):
                vault_file.write(chunk.strip() + "\n\n")  # Two newlines to separate chunks

    # Calculate the MD5 hash of the modified vault.txt file
    md5_hash = hashlib.md5()
    with open("vault.txt", "rb") as vault_file:
        for chunk in iter(lambda: vault_file.read(4096), b""):
            md5_hash.update(chunk)
    md5_hash = md5_hash.hexdigest()

    # Save the MD5 hash to a PID file
    with open("vault.pid", "w") as pid_file:
        pid_file.write(md5_hash)

    print(NEON_GREEN + "Text file content appended to vault.txt with each chunk on a separate line. MD5 hash:" + YELLOW + md5_hash + RESET_COLOR)
    print(NEON_GREEN + "MD5 hash saved to vault.pid" + RESET_COLOR)

# Call the upload_txtfile function
upload_txtfile()

Splitting text into chunks: 100%|██████████| 543/543 [00:00<00:00, 356920.09it/s]
Writing chunks to file: 100%|██████████| 148/148 [00:00<00:00, 148470.94it/s]

[92mText file content appended to vault.txt with each chunk on a separate line. MD5 hash:[93m6716ecf284b84ce660f8fff35a50023f[0m
[92mMD5 hash saved to vault.pid[0m





## Local LLM

In [7]:
# Function to open a file and return its contents as a string
def open_file(filepath):
    with open(filepath, 'r', encoding='utf-8') as infile:
        return infile.read()

# Function to get relevant context from the vault based on user input
def get_relevant_context(user_input, vault_embeddings, vault_content, model, top_k=3):
    if vault_embeddings.nelement() == 0:  # Check if the tensor has any elements
        return []
    input_embedding = model.encode([user_input])
    cos_scores = util.cos_sim(input_embedding, vault_embeddings)[0]
    top_k = min(top_k, len(cos_scores))
    top_indices = torch.topk(cos_scores, k=top_k)[1].tolist()
    relevant_context = [vault_content[idx].strip() for idx in top_indices]
    return relevant_context

# Function to chat with the Ollama model
def ollama_chat(user_input, system_message, vault_embeddings, vault_content, model, model_name="mistral"):
    relevant_context = get_relevant_context(user_input, vault_embeddings, vault_content, model)
    
    if relevant_context:
        context_str = "\n".join(relevant_context)
        user_input_with_context = context_str + "\n\n" + user_input
    else:
        print(CYAN + "No relevant context found." + RESET_COLOR)
        user_input_with_context = user_input

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_input_with_context}
    ]

    response = requests.post(
        "http://localhost:11434/v1/chat/completions",
        json={
            "model": model_name,
            "messages": messages,
            "stream": False
        }
    )

    if response.status_code != 200:
        raise Exception(f"Ollama Error: {response.status_code} - {response.text}")
    
    return response.json()["choices"][0]["message"]["content"]

# Load the SentenceTransformer model
model = SentenceTransformer("all-MiniLM-L6-v2")
file_name = "embeddings.pt"
vault_content = []

# Function to create embeddings and save to disk
def create_embeddings(vault_content, model):
    vault_embeddings = model.encode(vault_content) if vault_content else []
    vault_embeddings_tensor = torch.tensor(vault_embeddings)
    save_embeddings(vault_embeddings_tensor)
    return vault_embeddings_tensor

# Save embeddings to a file
def save_embeddings(embeddings):
    torch.save(embeddings, file_name)

# Calculate the hash of vault.pid
with open("vault.pid", "rb") as vault_file:
    vault_hash = hashlib.sha256(vault_file.read()).hexdigest()

# Check hash to decide whether to reuse or regenerate embeddings
if os.path.exists("hash.pid") and open("hash.pid", "r").read() == vault_hash:
    with open("vault.txt", "r", encoding='utf-8') as vault_file:
        vault_content = vault_file.readlines()
    vault_embeddings_tensor = torch.load(file_name)
else:
    with open("vault.txt", "r", encoding='utf-8') as vault_file:
        vault_content = vault_file.readlines()
        vault_embeddings_tensor = create_embeddings(vault_content, model)
    with open("hash.pid", "w") as hash_file:
        hash_file.write(vault_hash)
    torch.save(vault_embeddings_tensor, "embeddings.pt")

# Main chat loop
while True:
    user_input = input(YELLOW + "Ask a question about your documents, or type 'quit' to end the chat: " + RESET_COLOR)
    
    if user_input.lower() == 'quit':
        break

    system_message = "You are a helpful assistant that is an expert at extracting the most useful information from a given text."
    try:
        response = ollama_chat(user_input, system_message, vault_embeddings_tensor, vault_content, model, model_name="deepseek-coder-v2:16b")
        print(NEON_GREEN + "deepseek-coder-v2:16b Response:\n\n" + response + RESET_COLOR)
    except Exception as e:
        print(PINK + f"Error: {e}" + RESET_COLOR)


[92mdeepseek-coder-v2:16b Response:

 {
  "IOCs": [
    {
      "Name": "config.cfg",
      "File Size": "27392 Byte(s) SHA256: 28a9982cf2b4fc53a1545b6ed0d0c1788ca9369a847750f5652ffa0ca7f7b7d3"
    },
    {
      "Name": "config.cfg",
      "File Size": "28268 Byte(s) SHA256: 8afd6c0636c5d70ac0622396268786190a428635e9cf28ab23add939377727b0"
    },
    {
      "Domain": "bunch-balance-councils[.]trycloudflare[.]com",
      "Domain": "ferrari-rolling-facilities-lounge[.]trycloudflare[.]com",
      "Domain": "galleries-physicians-psp-wv[.]trycloudflare[.]com",
      "Domain": "evidence-deleted-procedure-bringing[.]trycloudflare[.]com",
      "Domain": "nowhere-locked-manor-hs[.]trycloudflare[.]com",
      "Domain": "ranked-accordingly-ab-hired[.]trycloudflare[.]com"
    },
    {
      "IP": "64[.]95[.]12[.]71",
      "IP": "184[.]95[.]51[.]165"
    }
  ]
}[0m
[92mdeepseek-coder-v2:16b Response:

 To provide accurate help, I need to see the text you mentioned. Please share it so I can a