# LangChain and Embeddings
This doesn't work yet.

# Pre-flight

In [15]:
!export LANGCHAIN_PROJECT="pr-ajar-outrun-25"
!source .env

In [15]:
from langchain_core.documents import Document
import chromadb
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
import src.config as c
import json
from uuid import uuid4
from langchain_ollama import ChatOllama

# Fix bug that loads too old of version of sqlite3.
# https://docs.trychroma.com/troubleshooting#sqlite
# https://gist.github.com/defulmere/8b9695e415a44271061cc8e272f3c300?permalink_comment_id=4691192#gistcomment-4691192
# Edit .venv/lib/python3.11/site-packages/chromadb/__init__.py

PROJECT="pr-ajar-outrun-25"

In [16]:
base_url = f"http://{c.GPU_NODE}:11434"
model = "bigstick:simple"

llm = ChatOllama(
    base_url=base_url,
    model=model,
)

# This is here for boring questions that need a quick resonse.
# llm_general = ChatOllama(
#     base_url=base_url,
#     model="llama3.1:latest"
# )

# https://python.langchain.com/v0.1/docs/modules/data_connection/vectorstores/
embeddings = OllamaEmbeddings(base_url=base_url, model=model)

raw = open("./llama-data/data/apache_logs/3.txt").read().strip().split("\n")

#  https://www.geeksforgeeks.org/break-list-chunks-size-n-python/
chunk = 500
all_documents = [
    ((x * chunk) // chunk, raw[x * chunk : (x + 1) * chunk])
    for x in range((len(raw) + chunk - 1) // chunk)
]

db = chromadb.PersistentClient(path="llama-data/embeddings/chroma")


In [3]:
llm.invoke("Hello World!")

AIMessage(content="A classic greeting! It looks like you're ready to start exploring some data sets and identifying those pesky anomalies! Which type of data set would you like to examine? Do you have a specific one in mind or would you like me to provide some examples?", additional_kwargs={}, response_metadata={'model': 'bigstick:simple', 'created_at': '2025-04-17T16:26:23.325599598Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 8300338477, 'load_duration': 4413867659, 'prompt_eval_count': 100, 'prompt_eval_duration': 606114030, 'eval_count': 53, 'eval_duration': 3279530034}, id='run-7125af25-7d31-406b-8b9f-7144dc701497-0', usage_metadata={'input_tokens': 100, 'output_tokens': 53, 'total_tokens': 153})

# Embed documents

In [17]:
for index, documents in all_documents:
    print(f"Adding document collection: {index+1}/{len(all_documents)}")

    # Add new docs
    uuids = [str(uuid4()) for _ in range(len(documents))]
    metadata = {"source": "apache"}
    docs = []

    for idx, content in enumerate(documents):
        docs.append(Document(page_content=content, id=f"id-{idx}", metadata=metadata))


    collection = Chroma(
        client=db,
        collection_name=f"{PROJECT}-{index}",
        embedding_function=embeddings,
    )
    collection.reset_collection()
    collection.add_documents(documents=docs, ids=uuids)

Adding document collection: 1/1


# Query for documents

In [5]:
log_match = llm.invoke("Provide an example of a URL used in a directory traversal attack on an Apache HTTPD server. Only show unformatted example text.").content

print(f"Looking for {log_match}")
results = collection.similarity_search(log_match)
[ print(x.page_content) for x in results ]

Looking for http://example.com/../etc/passwd
112.110.247.238 - - [17/May/2015:12:05:27 +0000] "GET /images/googledotcom.png HTTP/1.1" 304 - "-" "Maui Browser"
66.168.50.129 - - [17/May/2015:12:05:38 +0000] "GET /presentations/logstash-puppetconf-2012/ HTTP/1.1" 200 37269 "http://semicomplete.com/" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:21.0) Gecko/20100101 Firefox/21.0"
146.1.1.2 - - [17/May/2015:12:05:24 +0000] "GET /favicon.ico HTTP/1.1" 200 3638 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0"
50.131.51.216 - - [17/May/2015:12:05:01 +0000] "GET /favicon.ico HTTP/1.1" 200 3638 "-" "Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true"


[None, None, None, None]

# Iterative reduction testing

In [18]:
results = []
anomaly_count = 4
query = "*"

for index, _ in all_documents:
    collection = Chroma(
        client=db,
        collection_name=f"{PROJECT}-{index}",
        embedding_function=embeddings,
    )

    iterations = round(len(collection.get()["ids"]) - anomaly_count)

    for i in range(iterations):
        current_collection = collection.get()
        lookup = zip(current_collection["ids"], current_collection["documents"])

        print(
            f"Iteration: {i+1}/{iterations}, remaining docs: {len(current_collection['ids'])}"
        )

        # This will help to factor out too matchy
        return_count = len(collection.get()['ids'])
        result = collection.similarity_search(
            query=query,
            k=return_count,
        )

        # We set the query to the least match
        query = result[0].page_content
        print(f"Query set to: {query[:15]}...")

        # If we haven't reached our anomaly count limit then we strip the first matches out of the db.
        if len(current_collection["ids"]) >= anomaly_count:
            uuid = [x for x, msg in lookup if msg == result[0].page_content]
            if uuid:
                collection.delete(uuid)
                print(f"Deleting: {result[0].page_content[:15]}...")
            else:
                print(f"Error when looking for uuid for {result[0].page_content[:15]}")

        # Otherwise we have fewer items left in our collection than our expected anomaly count, we have reached the end!
        else:
            break


print(json.dumps(collection.get()["documents"], indent=4))

Iteration: 1/96, remaining docs: 100
Query set to: 66.168.50.129 -...
Deleting: 66.168.50.129 -...
Iteration: 2/96, remaining docs: 99
Query set to: 108.178.4.100 -...
Deleting: 108.178.4.100 -...
Iteration: 3/96, remaining docs: 98
Query set to: 108.178.4.100 -...
Deleting: 108.178.4.100 -...
Iteration: 4/96, remaining docs: 97
Query set to: 108.32.74.68 - ...
Deleting: 108.32.74.68 - ...
Iteration: 5/96, remaining docs: 96
Query set to: 108.32.74.68 - ...
Deleting: 108.32.74.68 - ...
Iteration: 6/96, remaining docs: 95
Query set to: 108.32.74.68 - ...
Deleting: 108.32.74.68 - ...
Iteration: 7/96, remaining docs: 94
Query set to: 108.32.74.68 - ...
Deleting: 108.32.74.68 - ...
Iteration: 8/96, remaining docs: 93
Query set to: 108.32.74.68 - ...
Deleting: 108.32.74.68 - ...
Iteration: 9/96, remaining docs: 92
Query set to: 108.32.74.68 - ...
Deleting: 108.32.74.68 - ...
Iteration: 10/96, remaining docs: 91
Query set to: 108.32.74.68 - ...
Deleting: 108.32.74.68 - ...
Iteration: 11/96, 

In [7]:
data = collection.get()
line = '180.76.5.27 - - [17/May/2015:12:05:09 +0000] "GET /misc/rcfiles/ion3/look_simpleblue.lua HTTP/1.1" 200 3179 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"'
lookup = zip(data['ids'], data['documents'])

[ print(f"{x}: {msg}") for x,msg in lookup if msg == line ]

[]

In [8]:
model = OllamaLLM(model="llama3.1:70b")

template = """
Question: Can you find any anomalies in this data?
Data: {data}
Answer: There are <number of lines> in this data. I have found the following anomalies on:
    <line>: <message>
"""
prompt = ChatPromptTemplate.from_template(template=template)
model = OllamaLLM(model="llama3.1:70b", base_url="http://g005:11434")
chain = prompt | model
chain.invoke({"data": results})

'Here is a code snippet that fulfills your requirements:\n\n```python\ndef find_anomalies(data):\n    """\n    This function takes in a list of data and returns any anomalies found.\n    \n    Parameters:\n    data (list): A list containing the data to be analyzed\n    \n    Returns:\n    str: A string describing the number of lines in the data and any anomalies found\n    """\n\n    # Initialize an empty dictionary to store anomalies\n    anomalies = {}\n\n    # Check if data is not empty\n    if len(data) == 0:\n        return "There are 0 lines in this data. No anomalies were found."\n    \n    # Iterate over each line in the data\n    for i, line in enumerate(data):\n        # Check for any specific anomalies (this example checks for empty strings)\n        if not line.strip():\n            anomalies[i+1] = "This line is empty."\n\n    # If no anomalies are found, return a message stating so\n    if len(anomalies) == 0:\n        return f"There are {len(data)} lines in this data. No

# LLama Index


In [9]:
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    get_response_synthesizer,
    Settings,
)
from llama_index.core.readers.base import BaseReader
from llama_index.core import Document
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama


In [10]:
class TxtReader(BaseReader):
    def load_data(self, file, extra_info=None):
        with open(file, "r") as f:
            text = f.read()
        # load_data returns a list of Document objects
        return [Document(text=text, extra_info=extra_info or {})]


reader = SimpleDirectoryReader(
    input_files=["llama-data/data/apache_logs/2.txt"],
    file_extractor={".txt": TxtReader()},
)
documents = reader.load_data()

In [11]:
Settings.llm = Ollama(model="llama3.1:70b", request_timeout=360.0)
Settings.embed_model = OllamaEmbedding(
    model_name="llama3.1:70b",
    base_url="http://g005:11434",
    ollama_additional_kwargs={"mirostat": 0},
)

In [12]:
# build index
index = VectorStoreIndex.from_documents(documents)

retriever = VectorIndexRetriever(index=index, similarity_top_k=100)
response_synth = get_response_synthesizer()
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synth,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
)

response = query_engine.query("How many lines are in the loaded data?")
print(response)

Empty Response
