In [1]:
import fitz  # PyMuPDF
import yaml
import nltk
import uuid
import openai
import os
import chromadb

from chromadb.config import Settings
from openai import AzureOpenAI  #
from chromadb import PersistentClient
from nltk.tokenize import sent_tokenize


from openai import AzureOpenAI  # Make sure this import is correct
from sentence_transformers import SentenceTransformer
from nltk.tokenize.punkt import PunktSentenceTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI




nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\E_Jairath\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import nltk
print("Trying to locate 'punkt'...")
try:
    nltk.data.find('tokenizers/punkt')
    print("‚úÖ 'punkt' is installed correctly.")
except LookupError:
    print("‚ùå 'punkt' not found. Downloading now...")
    nltk.download('punkt')


Trying to locate 'punkt'...
‚ùå 'punkt' not found. Downloading now...


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\E_Jairath\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
yaml_file_path = "API_Cred-copy.yaml"

with open(yaml_file_path, "r") as file:
    config = yaml.safe_load(file)
# openai
API_KEY = config['Open_ai_credentails']['API_KEY']
RESOURCE_ENDPOINT = config['Open_ai_credentails']['RESOURCE_ENDPOINT']
MODEL = config['Open_ai_credentails']['MODEL']
API_VERSION = config['Open_ai_credentails']['API_VERSION']

In [4]:
# --- Step 1: Extract text from PDF ---
def extract_text_from_pdf(file_path):
    doc = fitz.open(file_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text


# --- Step 2: Chunk the text into manageable pieces ---
def split_text(text, max_chars=500, overlap=50):
    chunks = []
    start = 0
    while start < len(text):
        end = start + max_chars
        chunk = text[start:end]
        chunks.append(chunk)
        start += max_chars - overlap
    return chunks

def smart_split_text(text, chunk_size=500, chunk_overlap=50):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""]
    )
    return splitter.split_text(text)


def semantic_split_text(text, max_chars=500, overlap=50):
    from nltk.tokenize.punkt import PunktSentenceTokenizer
    tokenizer = PunktSentenceTokenizer()
    sentences = tokenizer.tokenize(text)

    chunks = []
    chunk = ""
    for sentence in sentences:
        if len(chunk) + len(sentence) <= max_chars:
            chunk += " " + sentence
        else:
            chunks.append(chunk.strip())
            chunk = sentence
    if chunk:
        chunks.append(chunk.strip())
    return chunks

# --- Step 3: Embed the chunks using SentenceTransformer ---
def embed_chunks(chunks, model_name='all-MiniLM-L6-v2'):
    model = SentenceTransformer(model_name)
    embeddings = model.encode(chunks, convert_to_numpy=True)
    return embeddings


def store_in_chromadb(chunks, embeddings, db_path="./chroma_store"):
    # ‚úÖ Updated client
    client = PersistentClient(path=db_path)

    # Get or create a collection
    collection = client.get_or_create_collection("pdf_docs")

    # Add data to the collection
    collection.add(
        documents=chunks,
        embeddings=[e.tolist() for e in embeddings],
        metadatas=[{"source": "my_pdf"} for _ in chunks],
        ids=[str(uuid.uuid4()) for _ in chunks]
    )

    print("‚úÖ Successfully stored into ChromaDB (new API)")

In [5]:
# --- MAIN PIPELINE ---
pdf_path = "Documentation of PPI (Customer useage patterns).pdf"  

print("üîç Extracting text...")
text = extract_text_from_pdf(pdf_path)

print("‚úÇÔ∏è Splitting into chunks...")
chunks = semantic_split_text(text)

print("üß† Embedding chunks...")
embeddings = embed_chunks(chunks)

print("üíæ Storing into ChromaDB...")
store_in_chromadb(chunks, embeddings)

print("üöÄ Done.")

üîç Extracting text...
‚úÇÔ∏è Splitting into chunks...
üß† Embedding chunks...
üíæ Storing into ChromaDB...
‚úÖ Successfully stored into ChromaDB (new API)
üöÄ Done.


In [6]:
# Load your persisted Chroma vector database
client = PersistentClient(path="./chroma_store")
# Access the collection you previously created
collection = client.get_collection("pdf_docs")

In [7]:
# Embed a user query using the same model used for indexing
embedding_model = 'multi-qa-MiniLM-L6-cos-v1' #fast, good for Q&A
model = SentenceTransformer(embedding_model)
query = "what is this document about ?"
query_embedding = model.encode([query])[0].tolist()

# Search the ChromaDB with that vector
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5  # top 3 most relevant chunks
)

# Print out the results
print("Top matching chunks:\n")
for doc, metadata in zip(results['documents'][0], results['metadatas'][0]):
    print(f"üìÑ Source: {metadata['source']}\nüß† Chunk:\n{doc}\n{'-'*50}")


Top matching chunks:

üìÑ Source: my_pdf
üß† Chunk:
Printer Performance Insights  
Customer Consumables Prediction  
 
Revision: 1  
Version: 1 
Document Number: Doc-000XXXX  
Author:  
 
 
Document Type: Technical report 
 
 
The Work In Progress watermark must be switched 
Based on Template: Doc-0011608 
 
 
on while this document is being edited. DO NOT EDIT THIS BOX 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Contents 
1.
--------------------------------------------------
üìÑ Source: my_pdf
üß† Chunk:
Printer Performance Insights  
Customer Consumables Prediction  
 
Revision: 1  
Version: 1 
Document Number: Doc-000XXXX  
Author:  
 
 
Document Type: Technical report 
 
 
The Work In Progress watermark must be switched 
Based on Template: Doc-0011608 
 
 
on while this document is being edited. DO NOT EDIT THIS BOX 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Contents 
1.
--------------------------------------------------
üìÑ Source: my_pdf
üß† Chunk:
Printer Performance Insights  
Cus

In [9]:
from openai import AzureOpenAI  # Make sure this import is correct

def answer_query(question, db_path="./chroma_store", similarity_threshold=0.85):
    # Step 1: Embed the user question
    model = SentenceTransformer(embedding_model)
    question_embedding = model.encode([question])[0].tolist()

    # Step 2: Load ChromaDB and query relevant document chunks
    client_db = PersistentClient(path=db_path)
    collection = client_db.get_collection("pdf_docs")

    results = collection.query(
        query_embeddings=[question_embedding],
        n_results=10,
        include=["documents", "distances"]
    )

    documents = results["documents"][0]
    distances = results["distances"][0]

    # Step 3: Check similarity threshold
    if len(distances) == 0 or distances[0] > similarity_threshold:
        return "No idea, this info is not in the documents."

    context = "\n\n".join(documents)

    # Step 4: Setup the chat messages
    messages = [
        {
            "role": "system",
            "content": (
                "Answer the question strictly using the context above and also tell the reference point of your answer. If the answer is not found in the context, say: 'No idea, this info is not in the documents.'"
            ),
        },
        {
            "role": "user",
            "content": f"Context:\n{context}\n\nQuestion: {question}",
            
        }
    ]

    # Step 5: Connect to Azure OpenAI
    client = AzureOpenAI(
        api_key=API_KEY,
        api_version=API_VERSION,
        azure_endpoint=RESOURCE_ENDPOINT
    )

    # Step 6: Get response from LLM
    response = client.chat.completions.create(
        model=MODEL,
        messages=messages,
        temperature=0.4,
        top_p=0.95,
        frequency_penalty=0.2,
        presence_penalty=0.1,
        max_tokens=1000
    )

    return response.choices[0].message.content

query = 'explain the procedure how the calcualtion of unique make up cartridges is done for each customer'

def main(query):
    question = query
    answer = answer_query(question)
    print("\nü§ñ Answer: ")
    print(answer)


if __name__ == "__main__":
    main(query)




ü§ñ Answer: 
The calculation of unique makeup cartridges for each customer is done through the following procedure:

1. **Data Filtering**: For each specified customer, the dataset is filtered to include only relevant data for that customer.

2. **Separation by Printer**: The filtered data is further separated by individual printers.

3. **Cumulative Count of Unique Installations**: Within each printer's data, a cumulative count of unique cartridge installations is calculated. This is achieved by detecting when a new cartridge identifier appears in the data.

4. **Marking Changes**: Each change in cartridge identifier (within the same printer and customer group) is marked accordingly.

5. **Cumulative Summation**: These changes are then cumulatively summed to compute the total number of makeup cartridges used by each printer over time.

6. **Identification of Reused Cartridges**: After establishing the baseline count, cartridges that are reused in more than one printer under the same