# LangChain and Embeddings
**Status:** work in progress — this notebook does not work end to end yet.

# Pre-flight

In [15]:
import os

# `!export ...` and `!source .env` each run in a throwaway subshell, so nothing
# they set ever reaches the kernel process. Set the variables on os.environ
# instead so the libraries imported below can actually see them.
os.environ["LANGCHAIN_PROJECT"] = "pr-ajar-outrun-25"

# Minimal stand-in for `source .env`: load plain KEY=VALUE lines (optionally
# prefixed with `export`). Blank lines, comments and lines without `=` are
# skipped. Like sourcing the file, values found here override existing ones.
if os.path.isfile(".env"):
    with open(".env") as env_file:
        for raw_line in env_file:
            entry = raw_line.strip()
            if not entry or entry.startswith("#") or "=" not in entry:
                continue
            if entry.startswith("export "):
                entry = entry[len("export "):].lstrip()
            key, _, value = entry.partition("=")
            os.environ[key.strip()] = value.strip().strip("'\"")

In [25]:
from langchain_core.documents import Document
import chromadb
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
import src.config as c
import json
from uuid import uuid4
from langchain_ollama import ChatOllama

# Workaround for chromadb loading a version of sqlite3 that is too old.
# Background and the patch itself:
# https://docs.trychroma.com/troubleshooting#sqlite
# https://gist.github.com/defulmere/8b9695e415a44271061cc8e272f3c300?permalink_comment_id=4691192#gistcomment-4691192
# The workaround is applied by editing .venv/lib/python3.11/site-packages/chromadb/__init__.py

# Project name; used below as the prefix of every Chroma collection name.
PROJECT="pr-ajar-outrun-25"

In [26]:
# Ollama endpoint on the GPU node, and the model name that is passed to both
# the chat client and the embedding client below.
base_url = f"http://{c.GPU_NODE}:11434"
model = "bigstick:simple"

llm = ChatOllama(
    base_url=base_url,
    model=model,
)

# This is here for boring questions that need a quick response.
# llm_general = ChatOllama(
#     base_url=base_url,
#     model="llama3.1:latest"
# )

# https://python.langchain.com/v0.1/docs/modules/data_connection/vectorstores/
embeddings = OllamaEmbeddings(base_url=base_url, model=model)

# Read the log through a context manager so the file handle is closed as soon
# as the contents are in memory (the bare open(...).read() left it dangling).
with open("./llama-data/data/apache_logs/3.txt") as log_file:
    raw = log_file.read().strip().split("\n")

#  https://www.geeksforgeeks.org/break-list-chunks-size-n-python/
# Split the log lines into (collection index, lines) pairs holding at most
# `chunk` lines each; the index later becomes the collection-name suffix.
chunk = 500
all_documents = [
    (index, raw[start : start + chunk])
    for index, start in enumerate(range(0, len(raw), chunk))
]

# On-disk Chroma client shared by every collection created in this notebook.
db = chromadb.PersistentClient(path="llama-data/embeddings/chroma")


In [19]:
# Smoke test: send a trivial prompt to `llm` to confirm the model responds.
llm.invoke("Hello World!")

AIMessage(content="A classic greeting!\n\nSo, you're interested in learning more about anomalies in data sets? That's a fascinating topic!\n\nAnomalies can be tricky to identify, but they often indicate interesting or important patterns in the data. By ranking each item on a scale of 1-10, you can prioritize your analysis and focus on the most suspicious items first.\n\nWhat kind of data are you working with? Is it numerical, categorical, or something else? Do you have any specific questions about anomaly detection or would you like me to share some general tips and techniques?", additional_kwargs={}, response_metadata={'model': 'bigstick:simple', 'created_at': '2024-10-13T14:39:48.59609906Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 7789897793, 'load_duration': 16788524, 'prompt_eval_count': 100, 'prompt_eval_duration': 477166000, 'eval_count': 114, 'eval_duration': 7172460000}, id='run-fe21e219-e350-4b10-b537-555c2f29e617

# Embed documents

In [27]:
# Embed every chunk of log lines into its own Chroma collection,
# named "<PROJECT>-<chunk index>".
for index, documents in all_documents:
    print(f"Adding document collection: {index+1}/{len(all_documents)}")

    # One fresh UUID per log line. The same value is stored on the Document and
    # passed to add_documents, so the two ids can no longer disagree (the
    # Document used to carry "id-<n>" while the store was given a UUID).
    uuids = [str(uuid4()) for _ in documents]

    # Every Document gets its own metadata dict so a later in-place change to
    # one document's metadata cannot leak into all the others.
    docs = [
        Document(page_content=content, id=doc_id, metadata={"source": "apache"})
        for doc_id, content in zip(uuids, documents)
    ]

    collection = Chroma(
        client=db,
        collection_name=f"{PROJECT}-{index}",
        embedding_function=embeddings,
    )
    # Start from an empty collection so re-running this cell does not pile up
    # duplicates of the same log lines.
    collection.reset_collection()
    # Nothing to embed for an empty chunk; skip the call rather than pass [].
    if docs:
        collection.add_documents(documents=docs, ids=uuids)

Adding document collection: 1/1


# Query for documents

In [41]:
# Ask the LLM for an example attack URL and use its answer as the probe text
# for a similarity search over the embedded log lines.
log_match = llm.invoke("Provide an example of a URL used in a directory traversal attack on an Apache HTTPD server. Only show unformatted example text.").content

print(f"Looking for {log_match}")
# NOTE: `collection` is whatever an earlier cell last bound it to, so only that
# single collection is searched here.
results = collection.similarity_search(log_match)
# Plain loop instead of a list comprehension: the comprehension was used only
# for its print side effect and left a stray [None, ...] as the cell output.
for match in results:
    print(match.page_content)

Looking for http://example.com/../../../etc/passwd
page_content='112.110.247.238 - - [17/May/2015:12:05:27 +0000] "GET /images/googledotcom.png HTTP/1.1" 304 - "-" "Maui Browser"' metadata={'source': 'apache'}
page_content='66.168.50.129 - - [17/May/2015:12:05:38 +0000] "GET /presentations/logstash-puppetconf-2012/ HTTP/1.1" 200 37269 "http://semicomplete.com/" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:21.0) Gecko/20100101 Firefox/21.0"' metadata={'source': 'apache'}
page_content='50.131.51.216 - - [17/May/2015:12:05:01 +0000] "GET /favicon.ico HTTP/1.1" 200 3638 "-" "Mozilla/5.0 (Linux; U; en-us; KFAPWI Build/JDQ39) AppleWebKit/535.19 (KHTML, like Gecko) Silk/3.13 Safari/535.19 Silk-Accelerated=true"' metadata={'source': 'apache'}
page_content='146.1.1.2 - - [17/May/2015:12:05:24 +0000] "GET /favicon.ico HTTP/1.1" 200 3638 "-" "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0"' metadata={'source': 'apache'}


[None, None, None, None]

# Iterative reduction testing

In [110]:
# Iteratively strip the "most ordinary" log lines out of each collection until
# only `anomaly_count` documents are left. Each pass searches with the current
# query, takes the closest match, deletes it, and uses its text as the next
# query, so runs of near-identical lines are peeled off one after another.
results = []
anomaly_count = 2
query = "*"

for index, _ in all_documents:
    collection = Chroma(
        client=db,
        collection_name=f"{PROJECT}-{index}",
        embedding_function=embeddings,
    )

    # Exactly one document is removed per pass, so this many passes leave
    # `anomaly_count` documents behind (a negative value gives an empty range).
    iterations = len(collection.get()["ids"]) - anomaly_count

    for i in range(iterations):
        # Fetch the collection once per pass and reuse it for the count, the
        # search size and the id lookup.
        current_collection = collection.get()
        remaining = len(current_collection["ids"])

        # Never shrink the collection below the requested number of anomalies
        # (the previous `>=` check still deleted when exactly that many were left).
        if remaining <= anomaly_count:
            break

        print(
            f"Iteration: {i+1}/{iterations}, remaining docs: {remaining}"
        )

        # Ask for every remaining document, ranked by similarity to the query.
        result = collection.similarity_search(
            query=query,
            k=remaining,
        )
        if not result:
            print("Similarity search returned no documents; stopping.")
            break

        # result[0] is the top-ranked (closest) match; it becomes the next query.
        query = result[0].page_content
        print(f"Query set to: {query[:15]}...")

        # Identical log lines are stored under several ids. Delete only one id
        # per pass so the pass count above stays valid; any remaining duplicate
        # is the closest match on the next pass and is removed then.
        matching_ids = [
            doc_id
            for doc_id, msg in zip(
                current_collection["ids"], current_collection["documents"]
            )
            if msg == query
        ]
        if matching_ids:
            collection.delete(ids=matching_ids[:1])
            print(f"Deleting: {query[:15]}...")
        else:
            print(f"Error when looking for uuid for {query[:15]}")


# Whatever survived in the last processed collection is the anomaly candidate set.
print(json.dumps(collection.get()["documents"], indent=4))

Iteration: 1/10, remaining docs: 12
Query set to: 108.178.4.100 -...
Deleting: 108.178.4.100 -...
Iteration: 2/10, remaining docs: 11
Query set to: 108.178.4.100 -...
Deleting: 108.178.4.100 -...
Iteration: 3/10, remaining docs: 10
Query set to: 111.199.235.239...
Deleting: 111.199.235.239...
Iteration: 4/10, remaining docs: 9
Query set to: 111.199.235.239...
Deleting: 111.199.235.239...
Iteration: 5/10, remaining docs: 8
Query set to: 111.199.235.239...
Deleting: 111.199.235.239...
Iteration: 6/10, remaining docs: 7
Query set to: 111.199.235.239...
Deleting: 111.199.235.239...
Iteration: 7/10, remaining docs: 6
Query set to: 111.199.235.239...
Deleting: 111.199.235.239...
Iteration: 8/10, remaining docs: 5
Query set to: 173.192.238.41 ...
Deleting: 173.192.238.41 ...
Iteration: 9/10, remaining docs: 4
Query set to: 107.170.41.69 -...
Deleting: 107.170.41.69 -...
Iteration: 10/10, remaining docs: 3
Query set to: 93.164.60.142 -...
Deleting: 93.164.60.142 -...
[
    "50.16.19.13 - - [17

In [27]:
# Print the id of every stored document whose text is exactly `line`; more
# than one hit means the same log line was embedded under several ids.
data = collection.get()
line = '180.76.5.27 - - [17/May/2015:12:05:09 +0000] "GET /misc/rcfiles/ion3/look_simpleblue.lua HTTP/1.1" 200 3179 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"'
lookup = zip(data['ids'], data['documents'])

# Plain loop instead of a list comprehension: the comprehension was used only
# for its print side effect and left a stray [None, ...] as the cell output.
for doc_id, msg in lookup:
    if msg == line:
        print(f"{doc_id}: {msg}")

5c457ed5-3015-48b1-a8e1-c80c853234dc: 180.76.5.27 - - [17/May/2015:12:05:09 +0000] "GET /misc/rcfiles/ion3/look_simpleblue.lua HTTP/1.1" 200 3179 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
7a9e85ea-55e5-4371-bfb5-fa93639a1b1e: 180.76.5.27 - - [17/May/2015:12:05:09 +0000] "GET /misc/rcfiles/ion3/look_simpleblue.lua HTTP/1.1" 200 3179 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
e7662175-9833-486b-a76d-7391868c734a: 180.76.5.27 - - [17/May/2015:12:05:09 +0000] "GET /misc/rcfiles/ion3/look_simpleblue.lua HTTP/1.1" 200 3179 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"


[None, None, None]

In [5]:
# Prompt that hands the collected data to the model and primes the shape of
# the answer (a line count followed by "<line>: <message>" entries).
template = """
Question: Can you find any anomalies in this data?
Data: {data}
Answer: There are <number of lines> in this data. I have found the following anomalies on:
    <line>: <message>
"""
prompt = ChatPromptTemplate.from_template(template=template)

# A single client pointed at the GPU host. The earlier extra
# OllamaLLM(model="llama3.1:70b") without base_url was overwritten before it
# was ever used, so it has been dropped.
# NOTE: `model` is rebound here from the model-name string set in the setup
# cell to an LLM object.
model = OllamaLLM(model="llama3.1:70b", base_url="http://g005:11434")
chain = prompt | model

# `results` must already have been populated by an earlier cell.
chain.invoke({"data": results})

'There are 16 lines in this data. I have found the following anomalies on:\n\nLine 9: The request came from a user-agent identifying itself as "Tiny Tiny RSS/1.11", which is unusual because this software is typically used for reading RSS feeds, not making HTTP requests.\n\nLine 15: The request came from an IP address that resolves to a domain name belonging to the Yandex search engine, but the User-Agent string does not match any known Yandex crawler.\n\nNote that these anomalies may not necessarily indicate malicious activity, and further investigation would be needed to determine the nature of these requests.'

# LlamaIndex


In [33]:
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    get_response_synthesizer,
    Settings,
)
from llama_index.core.readers.base import BaseReader
from llama_index.core import Document
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama


In [37]:
class TxtReader(BaseReader):
    """Reader that turns one text file into a single Document.

    `Document` here is the llama_index class imported in the LlamaIndex import
    cell; it shadows the LangChain `Document` imported at the top of the notebook.
    """

    def load_data(self, file, extra_info=None):
        """Read `file` in full and wrap its contents in one Document.

        Args:
            file: Path of the text file to read.
            extra_info: Optional metadata passed through to the Document; an
                empty dict is used when it is missing or empty.

        Returns:
            A one-element list containing the Document.
        """
        metadata = extra_info if extra_info else {}
        with open(file, "r") as handle:
            contents = handle.read()
        # The reader contract is a list of Documents, even for a single file.
        return [Document(text=contents, extra_info=metadata)]


# Load a single Apache log file, routing ".txt" files through TxtReader.
apache_log_files = ["llama-data/data/apache_logs/2.txt"]
txt_extractors = {".txt": TxtReader()}

reader = SimpleDirectoryReader(
    input_files=apache_log_files,
    file_extractor=txt_extractors,
)
documents = reader.load_data()

In [38]:
# Global LlamaIndex defaults: the LLM used to write answers and the model used
# to embed documents and queries.
# NOTE(review): no base_url is passed to the LLM here, unlike the embedding
# model below, so the two may be served by different hosts — confirm intended.
answer_llm = Ollama(model="llama3.1:latest", request_timeout=360.0)
Settings.llm = answer_llm

embedding_model = OllamaEmbedding(
    model_name="llama3.1:70b",
    base_url="http://g005:11434",
    ollama_additional_kwargs={"mirostat": 0},
)
Settings.embed_model = embedding_model

In [39]:
# Retrieval knobs, named so they are easy to find and tune.
similarity_top_k = 100
similarity_cutoff = 0.7

# Build an in-memory vector index over the loaded documents.
index = VectorStoreIndex.from_documents(documents)

# Assemble the query engine from its parts: retriever, synthesizer and a
# post-processor that filters retrieved nodes by similarity score.
retriever = VectorIndexRetriever(index=index, similarity_top_k=similarity_top_k)
response_synth = get_response_synthesizer()
# NOTE(review): the recorded output of this cell is "Empty Response"; possibly
# the 0.7 cutoff filters out every retrieved node — confirm.
cutoff_filter = SimilarityPostprocessor(similarity_cutoff=similarity_cutoff)
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synth,
    node_postprocessors=[cutoff_filter],
)

response = query_engine.query("How many lines are in the loaded data?")
print(response)

Empty Response
