In [1]:
from typing import Any, Dict

from langchain.embeddings import HuggingFaceInstructEmbeddings, HuggingFaceEmbeddings
from langchain.embeddings.base import Embeddings


def get_embeddings(config: Dict[str, Any]) -> Embeddings:
    """Build the embedding model described by ``config["embeddings"]``.

    The section's ``"model"`` entry is handed to the provider as
    ``model_name``; every other entry is forwarded unchanged as a keyword
    argument. Model names starting with ``"hkunlp/"`` are loaded with
    ``HuggingFaceInstructEmbeddings``, all others with
    ``HuggingFaceEmbeddings``.

    Args:
        config: Full configuration mapping; only the ``"embeddings"`` section
            is read, and it is not modified.

    Returns:
        The instantiated embeddings provider.

    Raises:
        KeyError: If the ``"embeddings"`` section or its ``"model"`` entry is
            missing.
    """
    # Work on a shallow copy so renaming the key never touches the caller's dict.
    provider_kwargs = dict(config["embeddings"])
    provider_kwargs["model_name"] = provider_kwargs.pop("model")

    uses_instruct_model = provider_kwargs["model_name"].startswith("hkunlp/")
    provider_cls = (
        HuggingFaceInstructEmbeddings if uses_instruct_model else HuggingFaceEmbeddings
    )
    return provider_cls(**provider_kwargs)

In [2]:
from typing import Any, Dict, List

from chromadb.config import Settings
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain.vectorstores.base import VectorStore


def get_vectorstore(config: Dict[str, Any]) -> VectorStore:
    """Open a Chroma vectorstore using the ``"chroma"`` section of ``config``.

    The embedding function comes from ``get_embeddings(config)``. The whole
    ``"chroma"`` section is expanded into chromadb ``Settings``, and its
    ``"persist_directory"`` entry is additionally passed to ``Chroma`` itself.

    Args:
        config: Full configuration mapping with ``"embeddings"`` and
            ``"chroma"`` sections.

    Returns:
        The ``Chroma`` vectorstore instance.
    """
    embedding_fn = get_embeddings(config)
    chroma_cfg = config["chroma"]
    persist_dir = chroma_cfg["persist_directory"]
    client_settings = Settings(**chroma_cfg)
    return Chroma(
        persist_directory=persist_dir,
        embedding_function=embedding_fn,
        client_settings=client_settings,
    )


def get_vectorstore_from_documents(
    config: Dict[str, Any],
    documents: List[Document],
) -> VectorStore:
    """Create a Chroma vectorstore populated with ``documents``.

    The embedding function comes from ``get_embeddings(config)``. The whole
    ``"chroma"`` section is expanded into chromadb ``Settings``, and its
    ``"persist_directory"`` entry is additionally passed to
    ``Chroma.from_documents``.

    Args:
        config: Full configuration mapping with ``"embeddings"`` and
            ``"chroma"`` sections.
        documents: Documents to embed and store.

    Returns:
        The vectorstore returned by ``Chroma.from_documents``.
    """
    embedding_fn = get_embeddings(config)
    chroma_cfg = config["chroma"]
    persist_dir = chroma_cfg["persist_directory"]
    client_settings = Settings(**chroma_cfg)
    return Chroma.from_documents(
        documents,
        embedding_fn,
        persist_directory=persist_dir,
        client_settings=client_settings,
    )

In [3]:
from langchain.llms.base import LLM


def get_llm(config: Dict[str, Any]) -> tuple[LLM, Any]:
    """Load the ExLlama model named in ``config["exllama"]["model"]``.

    The configuration is only read, never modified, so the same ``config``
    can be passed to this function more than once.

    Args:
        config: Full configuration mapping; only ``config["exllama"]["model"]``
            (the model path) is used. Sampling and context parameters are
            fixed in the body below.

    Returns:
        A ``(llm, handler)`` pair: the ``Exllama`` instance and a
        ``BasicStreamingHandler``. The handler is returned to the caller but
        is not registered as a callback on the LLM (that line is commented
        out below).

    Raises:
        KeyError: If the ``"exllama"`` section or its ``"model"`` entry is
            missing.
    """
    # NOTE(review): `importer` is not referenced again; presumably it is
    # imported for a side effect that makes `langchain_extras` importable.
    # Confirm before removing.
    import importer
    from langchain_extras.llms.exllama import Exllama, BasicStreamingHandler

    handler = BasicStreamingHandler()
    # Read the path instead of popping it: popping removed the key from the
    # caller's dict and made any second call fail with KeyError.
    model_path = config["exllama"]["model"]

    print(f"Loading exllama llm from {model_path}")

    llm = Exllama(
        model_path=model_path,
        lora_path=None,
        temperature=0.7,
        beams=1,
        beam_length=40,
        stop_sequences=["Human:", "User:", "AI:"],
        # callbacks=[handler],
        verbose=False,
        max_seq_len=4096,
        alpha_value=4.0,  # For use with any models
        compress_pos_emb=4.0,  # For use with superhot
        # set_auto_map = "3, 2" #Gpu split, this will split 3gigs/2gigs
    )
    return llm, handler

In [4]:
from typing import Any, Callable, Dict, Optional
from typing import Any, Callable, Dict, Optional

from langchain.chains import RetrievalQA


def get_retrieval_qa(config, *, callback: Optional[Callable[[str], None]] = None):
    """Assemble a ``RetrievalQA`` chain from the vectorstore and the LLM.

    The retriever is created from the vectorstore with the keyword arguments
    in ``config["retriever"]``. The chain is built with
    ``return_source_documents=True``, and the streaming handler returned by
    ``get_llm`` is attached to the chain's inner LLM chain.

    Args:
        config: Full configuration mapping, forwarded to ``get_vectorstore``
            and ``get_llm``; its ``"retriever"`` section supplies the
            retriever keyword arguments.
        callback: Accepted for interface compatibility; the body does not
            use it.

    Returns:
        The configured ``RetrievalQA`` chain.
    """
    retriever = get_vectorstore(config).as_retriever(**config["retriever"])
    llm, stream_handler = get_llm(config)

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,
    )
    stream_handler.set_chain(qa_chain.combine_documents_chain.llm_chain)
    return qa_chain

In [5]:
import os
import glob
from langchain.document_loaders import DirectoryLoader, TextLoader, JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


def does_vectorstore_exist(persist_directory: str) -> bool:
    """Report whether ``persist_directory`` holds a usable vectorstore.

    All of the following must be true:

    * an ``index`` entry exists inside ``persist_directory``;
    * both ``chroma-collections.parquet`` and ``chroma-embeddings.parquet``
      exist inside ``persist_directory``;
    * the ``index`` directory contains more than three ``*.bin`` / ``*.pkl``
      files in total.

    Args:
        persist_directory: Directory to inspect.

    Returns:
        ``True`` when every condition above holds, otherwise ``False``.
    """
    if not os.path.exists(os.path.join(persist_directory, "index")):
        return False

    for parquet_name in ("chroma-collections.parquet", "chroma-embeddings.parquet"):
        if not os.path.exists(os.path.join(persist_directory, parquet_name)):
            return False

    index_files = []
    for pattern in ("index/*.bin", "index/*.pkl"):
        index_files.extend(glob.glob(os.path.join(persist_directory, pattern)))

    # Strictly more than three index files are required.
    return len(index_files) > 3


def get_priority_human(priority: str) -> str:
    """Translate a priority digit string into its short name.

    ``"0"`` through ``"7"`` map to ``emerg``, ``alert``, ``crit``, ``err``,
    ``warning``, ``notice``, ``info`` and ``debug`` respectively. Any other
    value yields ``"unknown"``.

    Args:
        priority: Priority as a string, e.g. ``"3"``.

    Returns:
        The matching name, or ``"unknown"`` when there is no match.
    """
    names = ("emerg", "alert", "crit", "err", "warning", "notice", "info", "debug")
    # Compare by equality (not hashing) so unhashable input still falls
    # through to "unknown" instead of raising.
    for level, name in enumerate(names):
        if priority == str(level):
            return name
    return "unknown"


def get_log_message_metadata(message: dict, metadata: dict) -> dict:
    """Copy selected fields of a log record into ``metadata``.

    Adds four keys to ``metadata`` (in place):

    * ``priority`` - the record's ``PRIORITY`` value, ``""`` if absent;
    * ``priority_human`` - that value passed through ``get_priority_human``;
    * ``facility`` - the record's ``FACILITY`` value, ``""`` if absent;
    * ``unit`` - ``UNIT`` if present, else ``USER_UNIT``, else ``""``.

    Args:
        message: The log record to read from.
        metadata: Metadata mapping to extend; it is mutated and returned.

    Returns:
        The same ``metadata`` object, with the four keys set.
    """
    raw_priority = message.get("PRIORITY", "")
    unit = message.get("UNIT", message.get("USER_UNIT", ""))

    metadata.update(
        priority=raw_priority,
        priority_human=get_priority_human(raw_priority),
        facility=message.get("FACILITY", ""),
        unit=unit,
    )
    return metadata


def process_documents(source_directory: str):
    """Load every ``*.json`` log file under ``source_directory`` as documents.

    Files are discovered recursively (``**/*.json``) and each is read with
    ``JSONLoader`` in JSON-lines mode: the ``MESSAGE`` field of each record
    becomes the document content, and ``get_log_message_metadata`` adds the
    metadata. Documents are returned as loaded, without any text splitting.

    Args:
        source_directory: Root directory to search for JSON files.

    Returns:
        The list of documents produced by ``DirectoryLoader.load()``.
    """
    print(f"Loading documents from {source_directory}")
    # No single file is special-cased here: loading and printing one
    # hard-coded log file would crash when that file is absent and would
    # flood the cell output.
    loader = DirectoryLoader(
        source_directory,
        glob="**/*.json",
        show_progress=True,
        use_multithreading=True,
        loader_cls=JSONLoader,
        loader_kwargs={
            "jq_schema": ".",
            "content_key": "MESSAGE",
            "metadata_func": get_log_message_metadata,
            "json_lines": True,
        },
    )
    documents = loader.load()
    print(f"Loaded {len(documents)} new documents from {source_directory}")
    return documents


def load_documents(config: Dict[str, Any], source_directory: str) -> None:
    """Build and persist the vectorstore unless one already exists.

    If ``does_vectorstore_exist`` reports a store at
    ``config["chroma"]["persist_directory"]``, nothing is done. Otherwise the
    documents under ``source_directory`` are loaded, embedded into a new
    vectorstore, and persisted. Existing stores are never appended to.

    Args:
        config: Full configuration mapping, forwarded to
            ``get_vectorstore_from_documents``.
        source_directory: Directory containing the documents to ingest.
    """
    persist_directory = config["chroma"]["persist_directory"]
    if not does_vectorstore_exist(persist_directory):
        print("Creating new vectorstore")
        documents = process_documents(source_directory)
        print("Creating embeddings. May take a few minutes...")
        store = get_vectorstore_from_documents(config, documents)
        store.persist()
        print("Finished processing documents")
        # Release the local reference to the store once it has been persisted.
        del store

In [6]:
config = {
    # Expanded into chromadb Settings; "persist_directory" is also passed to Chroma.
    "chroma": {
        "persist_directory": "db",
        "chroma_db_impl": "duckdb+parquet",
    },
    # Path of the model loaded by get_llm.
    "exllama": {
        "model": "../models/TheBloke_GPT4All-13B-Snoozy-SuperHOT-8K-GPTQ",
    },
    # Keyword arguments forwarded to the vectorstore's as_retriever().
    "retriever": {
        "search_kwargs": {"k": 4},
    },
    # "model" is handed to the embeddings provider as model_name.
    "embeddings": {
        "model": "all-mpnet-base-v2",
    },
}

# Ingest the logs (skipped when a vectorstore already exists), then build the QA chain.
load_documents(config, source_directory="service-logs")
qa = get_retrieval_qa(config)

Creating new vectorstore
Loading documents from service-logs


100%|██████████| 13/13 [00:01<00:00,  7.91it/s]


Loaded 5566 new documents from service-logs
Creating embeddings. May take a few minutes...


2023-07-14 16:18:47.339988: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-14 16:18:47.516123: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-07-14 16:18:48.052971: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-07-14 16:18:48.053072: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinf

Finished processing documents
/home/jovyan/work/importer.py
Loading exllama llm from ../models/TheBloke_GPT4All-13B-Snoozy-SuperHOT-8K-GPTQ


In [7]:
def query(question: str):
    """Run ``question`` through the module-level ``qa`` chain and print the outcome.

    Prints, in order: the retrieved source documents, the query followed by a
    blank line, and the answer followed by two blank lines.

    Args:
        question: The question to pass to ``qa``.
    """
    answer = qa(question)
    print(answer["source_documents"])
    print(answer["query"], end="\n\n")
    print(answer["result"], end="\n\n\n")


for question in (
    "Give me any mentions of NVIDIA in the source documents",
    "Give me any errors that occurred in the source documents",
):
    query(question)

[Document(page_content="<info>  [1688041770.1671] device (wlp0s20f3): Activation: (wifi) connection 'fiorinet' has security, and secrets exist.  No new secrets needed.", metadata={'source': '/var/home/joe/Projects/code/ml-playground/System Monitor/service-logs/system/NetworkManager.json', 'seq_num': 2161, 'priority': '6', 'priority_human': 'info', 'facility': '', 'unit': ''}), Document(page_content="<info>  [1688130077.2424] device (wlp0s20f3): Activation: (wifi) connection 'fiorinet' has security, and secrets exist.  No new secrets needed.", metadata={'source': '/var/home/joe/Projects/code/ml-playground/System Monitor/service-logs/system/NetworkManager.json', 'seq_num': 2386, 'priority': '6', 'priority_human': 'info', 'facility': '', 'unit': ''}), Document(page_content="<info>  [1688041290.2263] device (wlp0s20f3): Activation: (wifi) connection 'fiorinet' has security, and secrets exist.  No new secrets needed.", metadata={'source': '/var/home/joe/Projects/code/ml-playground/System Mo