# HoneyBee Test File

This file exists only to exercise the HoneyBee functions while they are being written. It will be deleted once the functions work properly and a more formal test suite has been added to the CI/CD pipeline.

## Medical Data Loaders

1. Loads the data based on the data type
1. After validating the data, returns an array of embeddings generated with the provided model
1. The embeddings can then be:
     - stored locally
     - stored in a vector database
     - used for further processing, e.g. building a dataframe with additional columns for other metadata or data
     - uploaded to Hugging Face Datasets

In [1]:
from honeybee.loaders import SVSLoader, DICOMLoader, PDFLoader, MINDSLoader

# Load an SVS file with the default loader settings
svs_loader = SVSLoader()
svs_data = svs_loader.load("path/to/svs_file.svs")

# Load a DICOM file; this loader is configured with an embedding model path
dicom_loader = DICOMLoader(embedding_model_path="path/to/embedding_model")
dicom_data = dicom_loader.load("path/to/dicom_file.dcm")

# Load a PDF document
pdf_loader = PDFLoader()
pdf_data = pdf_loader.load("path/to/pdf_file.pdf")

# Create a loader for MINDS data.
# NOTE(review): the loader is only constructed here; nothing is loaded from it
# in this cell.
minds_loader = MINDSLoader(data_dir="path/to/minds_data")

# Show what each loader returned (the recorded output of this cell shows the
# input paths being echoed back).
print(svs_data)
print(dicom_data)
print(pdf_data)


path/to/svs_file.svs
path/to/dicom_file.dcm
path/to/pdf_file.pdf


## Data Distribution (VectorDB, Local, Hugging Face)

## Basic Applications (RAG, Instruction Tuning, PEFT)

## Advanced Applications (Advanced RAG, Federated Learning)

---

In [None]:
import ollama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings

# loader = PyPDFLoader("example_data/layout-parser-paper.pdf")
# pages = loader.load_and_split()


def _build_retriever(docs):
    """Split documents into chunks, embed them, and return a retriever.

    Shared by the web and PDF entry points so that the chunking and embedding
    settings cannot drift apart between the two sources.

    Args:
        docs: Documents as returned by a LangChain loader's ``load()``.

    Returns:
        The retriever obtained from ``Chroma.from_documents(...).as_retriever()``.
    """
    # 1000-character chunks with a 200-character overlap between neighbours.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    embeddings = OllamaEmbeddings(model="mistral")
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
    return vectorstore.as_retriever()


def web_loader_and_retrieve_docs(url):
    """Load a web page and return a retriever over its embedded chunks.

    Args:
        url (str): Address of the page to load.

    Returns:
        A retriever over the page's chunks (see ``_build_retriever``).
    """
    loader = WebBaseLoader(web_paths=(url,), bs_kwargs={})
    return _build_retriever(loader.load())


def pdf_loader_and_retrieve_docs(path):
    """Load a PDF and return a retriever over its embedded chunks.

    Args:
        path (str): Location of the PDF, passed straight to ``PyPDFLoader``.

    Returns:
        A retriever over the PDF's chunks (see ``_build_retriever``).
    """
    loader = PyPDFLoader(path)
    return _build_retriever(loader.load())


def format_docs(docs):
    """Concatenate the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        str: The contents in iteration order, separated by a blank line.
    """
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)


def ollama_llm(question, context):
    """Ask the local "mistral" model a question together with its context.

    Args:
        question (str): The question to answer.
        context (str): Supporting text placed after the question in the prompt.

    Returns:
        The ``content`` field of the ``message`` in the chat response.
    """
    prompt = f"Question: {question}\n\nContext: {context}"
    # Single-turn conversation: the whole prompt is sent as one user message.
    messages = [{"role": "user", "content": prompt}]
    reply = ollama.chat(model="mistral", messages=messages)
    return reply["message"]["content"]


def rag_chain(url, question):
    """Answer a question using context retrieved from a web page or a PDF.

    Args:
        url (str): Source location. Values ending in ".pdf" (in any letter
            case) are handled by the PDF loader; everything else is handled
            by the web loader.
        question (str): The question to answer; also used as the retrieval
            query.

    Returns:
        The text content of the model's reply, as returned by ``ollama_llm``.
    """
    # Compare case-insensitively so that e.g. "REPORT.PDF" is not sent to the
    # web loader.
    if url.lower().endswith(".pdf"):
        retriever = pdf_loader_and_retrieve_docs(url)
    else:
        retriever = web_loader_and_retrieve_docs(url)

    retrieved_docs = retriever.invoke(question)
    formatted_context = format_docs(retrieved_docs)
    # Delegate to the shared helper instead of rebuilding the same prompt and
    # chat call here.
    return ollama_llm(question, formatted_context)


# Use the RAG chain on a web page (rag_chain selects the loader from the
# ".pdf" suffix of its first argument)
result = rag_chain("http://www.espn.com", "what is the top news on espn?")
# Alternative: run the same chain against a local PDF instead
# result = rag_chain("/mnt/f/Projects/HoneyBee/data/llama2.pdf", "What is llama 2?")
print(result)

---

In [None]:
from datasets import load_dataset
import pandas as pd
import ollama
import pymongo


def get_mongo_client(mongo_uri):
    """Connect to MongoDB and verify that the server is actually reachable.

    ``pymongo.MongoClient`` connects lazily: constructing it does not contact
    the server, so a successful constructor call says nothing about
    connectivity. A cheap ``ping`` command is issued to force a round trip
    before success is reported.

    Args:
        mongo_uri (str): MongoDB connection URI.

    Returns:
        pymongo.MongoClient | None: A connected client, or ``None`` if the
        server could not be reached.

    Note:
        The ping may block until the driver's server selection timeout
        expires. Exceptions other than ``ConnectionFailure`` propagate to the
        caller.
    """
    client = None
    try:
        client = pymongo.MongoClient(mongo_uri)
        # Force a round trip so connection problems surface here, not later.
        client.admin.command("ping")
    except pymongo.errors.ConnectionFailure as e:
        print(f"Connection failed: {e}")
        if client is not None:
            # Release the background monitor threads of the unusable client.
            client.close()
        return None
    print("Connection to MongoDB successful")
    return client


def get_embedding(text):
    """Return the "mistral" embedding vector for a piece of text.

    Args:
        text: The text to embed. Anything that is not a non-empty ``str``
            is rejected.

    Returns:
        The ``embedding`` field of the Ollama response, or ``None`` when the
        input is rejected or the embedding call fails.
    """
    # Check the type before the truthiness: some non-string values (NumPy
    # arrays, pandas.NA, ...) raise when evaluated as a boolean.
    if not isinstance(text, str) or not text:
        return None
    try:
        response = ollama.embeddings(model="mistral", prompt=text)
        return response["embedding"]
    except Exception as e:
        # Best effort by design: report the failure and let the caller decide
        # what to do with a missing embedding.
        print(f"Error in get_embedding: {e}")
        return None


def vector_search(user_query, collection):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.

    Returns:
    list: A list of matching documents. The list is empty when no embedding
    could be generated for the query, so callers can always iterate the
    result.
    """

    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    if query_embedding is None:
        # Keep the diagnostic, but honour the documented return type: a bare
        # string here would be iterated character by character by callers.
        print("Invalid query or embedding generation failed.")
        return []

    # Define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "plot_embedding_optimised",
                "numCandidates": 150,  # Number of candidate matches to consider
                "limit": 5,  # Return top 5 matches
            }
        },
        {
            "$project": {
                "_id": 0,  # Exclude the _id field
                "plot": 1,  # Include the plot field
                "title": 1,  # Include the title field
                "genres": 1,  # Include the genres field
                "score": {
                    "$meta": "vectorSearchScore"  # Include the search score
                },
                # NOTE(review): this returns the full embedding vector with
                # every hit; confirm that callers actually need it.
                "plot_embedding_optimised": 1,
            }
        },
    ]

    # Execute the search and materialise the cursor
    return list(collection.aggregate(pipeline))


def handle_user_query(query, collection):
    """Answer a user query with the model, grounded in vector-search results.

    Args:
        query (str): The user's question.
        collection: MongoDB collection handed to ``vector_search``.

    Returns:
        tuple: ``(completion, search_result)`` where ``completion`` is the
        streaming response of ``ollama.chat(..., stream=True)`` and
        ``search_result`` is the context text (one "Title/Plot" line per
        match) that was sent to the model.
    """
    get_knowledge = vector_search(query, collection)

    # Defensive: tolerate a plain-string error message in place of a list of
    # documents. Iterating a string would yield characters, which have no
    # ``.get`` and would crash below.
    if isinstance(get_knowledge, str):
        get_knowledge = []

    search_result = "".join(
        f"Title: {result.get('title', 'N/A')}, Plot: {result.get('plot', 'N/A')}\n"
        for result in get_knowledge
    )

    completion = ollama.chat(
        model="mistral",
        messages=[
            {"role": "system", "content": "You are a movie recommendation system."},
            {
                "role": "user",
                "content": "Answer this user query: "
                + query
                + " with the following context: "
                + search_result,
            },
        ],
        stream=True,
    )

    return completion, search_result

In [None]:
# Fetch the movie dataset and keep only rows that have a plot to embed.
dataset = load_dataset("AIatMongoDB/embedded_movies")
dataset_df = (
    pd.DataFrame(dataset["train"])
    .dropna(subset=["plot"])
    # Discard the embeddings shipped with the dataset; they are recomputed
    # below with the local model.
    .drop(columns=["plot_embedding"])
)
# One embedding per plot; get_embedding yields None for plots it cannot embed.
dataset_df["plot_embedding_optimised"] = dataset_df["plot"].apply(get_embedding)

In [None]:
# NOTE(review): the credentials are hard-coded for a local development
# instance; do not reuse this URI outside of local testing.
mongo_client = get_mongo_client(
    "mongodb://root:root@localhost:27778/?directConnection=true"
)
# get_mongo_client signals failure by returning None; stop with a clear
# message instead of an opaque "'NoneType' object is not subscriptable".
if mongo_client is None:
    raise RuntimeError("Could not connect to MongoDB; aborting data ingestion.")
db = mongo_client["movies"]
collection = db["movie_collection"]
# Start from an empty collection so that re-running this cell does not
# insert the same documents twice.
collection.delete_many({})
documents = dataset_df.to_dict("records")
collection.insert_many(documents)

In [None]:
# Run a query and retrieve the sources that were used as context
query = "What is the best romantic movie to watch?"
response, source_information = handle_user_query(query, collection)

# The response is streamed: print each chunk's text as it arrives, without
# inserting line breaks between chunks
for chunk in response:
    print(chunk["message"]["content"], end="", flush=True)

# Show the titles and plots that were passed to the model as context
print(f"Source Information: \n{source_information}")