In [None]:
!pip install pymongo
!pip install dotenv
!pip install chromadb
!pip install -U sentence-transformers

Collecting sentence_transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence_transformers)
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting torch>=1.11.0 (from sentence_transformers)
  Downloading torch-2.7.0-cp39-cp39-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting scikit-learn (from sentence_transformers)
  Downloading scikit_learn-1.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence_transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting networkx (from torch>=1.11.0->sentence_transformers)
  Downloading networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.6.77-py3-non

In [1]:
import os
import uuid
from datetime import datetime
from typing import List, Dict

import chromadb
from chromadb.utils import embedding_functions
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import OllamaLLM
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.documents import Document
from langchain_ollama import OllamaEmbeddings
from langchain.vectorstores import Chroma
from pymongo import MongoClient
from dotenv import load_dotenv


In [36]:
load_dotenv()
class MongoConnectionHandler:
    def __init__ (self):
        self.client = MongoClient(os.getenv('MONGO_URL'))
        self.db=self.client.get_database(os.getenv('MONGO_DB'))
    
    def get_related_documents(self) -> List[Dict]:
        orders = list(self.db.orders.find())
        client_ids = [order['orderCreatedBy'] for order in orders]
        item_ids = [item_id for order in orders for item_id in order['orderItems']]
        
        """ GET CLIENTS AND ORDER ITEMS """
        clients = {str(c['_id']) : c for c in self.db.clients.find({'_id': {'$in': client_ids}})}
        items = {str(i['_id']) : i for i in self.db.order_items.find({'_id': {'$in': item_ids}})}

        combined_orders = []
        for order in orders:
            client = clients.get(str(order['orderCreatedBy']))
            order_items = [items[str(item_id)] for item_id in order['orderItems'] if str(item_id) in items]

            if client and order_items:
                combined_orders.append({
                    "order": order,
                    "client": client,
                    "items": order_items
                })
        return combined_orders

python-dotenv could not parse statement starting at line 1
python-dotenv could not parse statement starting at line 7


In [37]:

class DocumentProcessor:
    
    def create_chunks(self, data: List[Dict]) -> List[Dict]:
        return [self._create_order_chunk(rec["order"], rec["client"], rec["items"]) for rec in data]

    def _create_order_chunk(self, order: Dict, client: Dict, items: List[Dict]) -> Dict:
        chunk_text = f"""
        [Order {order['_id']}] = 
        Status: "{order.get('status', 'N/A')}" ,
        Created Date: "{self._format_date(order.get('createdAt'))}" ,
        Contract: "{order.get('orderContractType', 'N/A')}" |
        
        [Client {client['_id']}] =
        Company: "{client.get('legalNameOfCompany', 'N/A')}" ,
        Client Name: "{client.get('contactFirstName', '')} {client.get('contactLastName', '')}" ,
        Email: "{client.get('contactEmail', 'N/A')}" ,
        Company Address: "{client.get('physicalAddressOfCompany', {}).get("address", "")}"
        Company City: "{client.get('physicalAddressOfCompany', {}).get("city", "")}"
        Preferences: ({', '.join(client.get('preferredCoffeeTypes', []))}) |
        
        [Items] = 
        {self._format_items(items)}
        """
        
        return {
            "text": self._clean_text(chunk_text),
            "metadata": {
                "order_id": str(order["_id"]),
                "client_id": str(client["_id"]),
                "status": order.get("status", "UNKNOWN"),
                "total_value": sum(item.get("price", 0) * item.get("totalAmount", 0) for item in items),
                "creation_date": str(order.get("createdAt", "")),
                "client_name": client.get("ownersFirstName", "") + " " + client.get("ownersLastName", "")
                 # "coffee_types": client.get("preferredCoffeeTypes", [])
            }
        }

    def _format_items(self, items: List[Dict]) -> str:
        return '\n'.join([
            f"Item {idx + 1} ID: \"{item.get('r_id', 'N/A')}\" , "
            f"Total Amount: \"{item.get('totalAmount', 0)}lbs\" , "
            f"Price: \"${item.get('price', 0)}\" , "
            f"Status: \"{item.get('status', 'N/A')}\" , "
            f"Last Updated: \"{self._format_date(item.get('updatedAt'))}\","
            for idx, item in enumerate(items)
        ])

    def _clean_text(self, text: str) -> str:
        return ' '.join(text.strip().split())

    def _format_date(self, date_str: str) -> str:
        try:
            return datetime.fromisoformat(date_str.replace('Z', '')).strftime("%Y-%m-%d %H:%M")
        except:
            return "Date not available"

In [30]:
data = MongoConnectionHandler().get_related_documents()
# data
processor = DocumentProcessor()
chunks = processor.create_chunks(data)
chunks

[{'text': '[Order 65cdf655acb24b003b673350] = Status: "CANCELLED" , Created Date: "Date not available" , Contract: "N/A" | [Client 65c22c47b062b6003c932497] = Company: "Cafe Kreyol, LLC" , Client Name: "Joseph Stazzone" , Email: "joey@cafekreyol.com" , Company Address: "10439 Balls Ford Road" Company City: "Manassas" Preferences: (Natural, Washed, Honey, Anerobic, Carbonic Maceration) | [Items] = Item 1 ID: "1-OrdIt1" , Total Amount: "0lbs" , Price: "$0" , Status: "REJECTED" , Last Updated: "Date not available", Item 2 ID: "1-OrdIt5" , Total Amount: "0lbs" , Price: "$0" , Status: "REJECTED" , Last Updated: "Date not available", Item 3 ID: "1-OrdIt4" , Total Amount: "0lbs" , Price: "$0" , Status: "REJECTED" , Last Updated: "Date not available", Item 4 ID: "1-OrdIt3" , Total Amount: "0lbs" , Price: "$0" , Status: "REJECTED" , Last Updated: "Date not available", Item 5 ID: "1-OrdIt0" , Total Amount: "0lbs" , Price: "$0" , Status: "REJECTED" , Last Updated: "Date not available", Item 6 ID:

In [None]:
class VectorStore:
    def __init__(self):
        self.embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-mpnet-base-v2"
        )
        self.client = chromadb.PersistentClient(path="./chroma_dbs")
        self.collection = self.client.get_or_create_collection(
            name="coffee_orders_processing",
            embedding_function=self.embedder,
            
        )
    
    def index_documents(self, chunks: List[Dict]):
        ids = [str(uuid.uuid4()) for _ in chunks]
        documents = [chunk['text'] for chunk in chunks]
        metadatas = [chunk['metadata'] for chunk in chunks]
        
        self.collection.add(
            ids=ids,
            documents=documents,
            metadatas=metadatas
        )
    
    def query(self, query: str, filters: Dict = None, n_results: int = 10) -> List[Dict]:
        results = self.collection.query(
            query_texts=[query],
            where=filters,
            n_results=n_results,
            include=["metadatas", "documents"]
        )
        return [{
            "text": doc,
            "metadata": meta
        } for doc, meta in zip(results["documents"][0], results["metadatas"][0])]


In [None]:


from sentence_transformers import SentenceTransformer
model=SentenceTransformer('all-mpnet-base-v2')
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]
embeddings = model.encode(sentences)
embeddings
# from chromadb.utils import embedding_functions
# df_e = embedding_functions.SentenceTransformerEmbeddingFunction(model_name='all-mpnet-base-v2')
# value = df_e(['Hello world'])
# value

Collecting sentence-transformers
  Using cached sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.7.0-cp39-cp39-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.6.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting networkx (from torch>=1.11.0->sentence-transformers)
  Using cached networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.6.77-

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

array([[-0.05405934, -0.01718493, -0.03553742, ...,  0.02126809,
         0.04306812, -0.0107799 ],
       [-0.06346931, -0.0316185 , -0.01481563, ...,  0.03250144,
        -0.02075411,  0.00200078],
       [-0.03902554,  0.03801628,  0.00174587, ..., -0.01257586,
        -0.05982951, -0.04102634]], dtype=float32)

In [45]:
class CoffeeRAG:
    def __init__(self):
        self.mongo = MongoConnectionHandler()
        self.processor = DocumentProcessor()
        self.vector_store = VectorStore()
        self.llm = OllamaLLM(model="gemma3")
        
        self.prompt = ChatPromptTemplate.from_template(
            """As a coffee order analyst, answer using this context:
            Today: {current_date}
            Context:
            {context}
            
            Question: {question}
            
            Format response with appropriately
            And if no answer is available respond with "Answer not found, contact our support personal via hello@arkenacoffee.com" """
        )
    
    def initialize(self):
        """Load data from MongoDB and index it"""
        data = self.mongo.get_related_documents()
        chunks = self.processor.create_chunks(data)
        self.vector_store.index_documents(chunks)
    
    def retrieve(self, query: str) -> List[Dict]:
        return self.vector_store.query(query)
    
    def generate(self, query: str) -> str:
        context = self.retrieve(query)
        print(f"Context extracted: {context}")
        context_str = "\n\n---\n\n".join([f"{r['text']}\nMetadata: {r['metadata']}" for r in context])
        
        chain = (
            {"context": lambda _: context_str,
             "question": RunnablePassthrough(),
             "current_date": lambda _: datetime.now().strftime("%Y-%m-%d %H:%M")}
            | self.prompt
            | self.llm
            | StrOutputParser()
        )
        return chain.invoke(query)

In [None]:
if __name__ == "__main__":
    # Initialize components
    processor = DocumentProcessor()
    rag = CoffeeRAG()
    
    # Load and process data (run once)
    raw_data = MongoConnectionHandler().get_related_documents()
    chunks = processor.create_chunks(raw_data)
    rag.vector_store.index_documents(chunks)
    
    # Example queries
    queries = [
        "Which client has the most Completed order?"
    ]
    
    for query in queries:
        print(f"\nQuery: {query}")
        print("Answer:")
        print(rag.generate(query))
        print("\n" + "="*50)


Query: Find orders where the company is based in the US
Answer:
Context extracted: [{'text': '[Order 65d238d4acb24b003b6733a4] = Status: "CANCELLED" , Created Date: "Date not available" , Contract: "N/A" | [Client 65c22c47b062b6003c932497] = Company: "Cafe Kreyol, LLC" , Client Name: "Joseph Stazzone" , Email: "joey@cafekreyol.com" , Company Address: "10439 Balls Ford Road" Company City: "Manassas" Preferences: (Natural, Washed, Honey, Anerobic, Carbonic Maceration) | [Items] = Item 1 ID: "2-OrdIt1" , Total Amount: "0lbs" , Price: "$0" , Status: "REJECTED" , Last Updated: "Date not available", Item 2 ID: "2-OrdIt0" , Total Amount: "0lbs" , Price: "$0" , Status: "REJECTED" , Last Updated: "Date not available", Item 3 ID: "2-OrdIt2" , Total Amount: "0lbs" , Price: "$0" , Status: "REJECTED" , Last Updated: "Date not available",', 'metadata': {'total_value': 0, 'client_name': 'Joseph Stazzone', 'client_id': '65c22c47b062b6003c932497', 'status': 'CANCELLED', 'order_id': '65d238d4acb24b003b