In [None]:
import os
import pdfplumber
import requests
import json
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain_core.documents import Document
from typing import List
from langchain_chroma import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter, SpacyTextSplitter

# Read the Hugging Face token from the environment instead of hard-coding it.
# setdefault keeps the variable defined (as "" when it was unset) but, unlike a
# plain assignment, never overwrites a real token exported before the kernel
# was started.
token = os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", "")
if not token:
    print(
        "Warning: HUGGINGFACEHUB_API_TOKEN is empty; "
        "export it before starting the kernel."
    )


def _clean_text(document_content):
    return " ".join(document_content.lower().strip().split())



In [None]:
def load_document():
    """Build one ``Document`` per entry of a hard-coded list of column names.

    Returns:
        A list of ``Document`` objects in declaration order. Each document uses
        the column name as ``page_content`` and carries metadata with the keys
        ``column_index`` (position in the list), ``column_name`` and ``source``
        (always ``"schema_definition"``).
    """
    schema_columns = (
        "pk_subscription",
        "fck_customer_account",
        "fck_subscription_plan",
        "ek_subscription_state",
        "ak_subscription_id",
        "ck_start_date",
        "end_date",
        "billing_interval",
        "subscription_price",
        "subscription_feature_price",
        "subscription_tracking_id",
        "cancellation_date",
        "cancellation_to_date",
        "created_by",
        "created_on",
        "updated_by",
        "updated_on",
    )

    # One document per column so each name is embedded and retrieved separately.
    return [
        Document(
            page_content=name,
            metadata={
                "column_index": position,
                "column_name": name,
                "source": "schema_definition",
            },
        )
        for position, name in enumerate(schema_columns)
    ]

# Build the per-column documents and bind them to the notebook-level name
# `documents`, then print the full list for inspection.
documents = load_document()
print("Documents loaded:", documents)

In [None]:
# Embedding client for the hosted sentence-transformers model; it is given the
# token read at the top of the notebook and asked for the feature-extraction task.
embeddings = HuggingFaceEndpointEmbeddings(
    huggingfacehub_api_token=token,
    task="feature-extraction",
    model="sentence-transformers/all-mpnet-base-v2",
)


In [None]:
# Show the repr of the embeddings client configured in the previous cell.
print("embeddings: ",embeddings)
# Rebuild `documents`; load_document() was already called in an earlier cell,
# so this rebinds the name to a freshly built, equivalent list.
documents = load_document()



In [None]:
# Stop with a clear error when there is nothing to index. Previously this
# branch only printed "Exiting." and carried on, so `chunked_documents` was
# never defined and the next cell failed with an unrelated NameError.
if not documents:
    raise RuntimeError("No valid documents found to process. Exiting.")

print(f"Loaded {len(documents)} documents.")

# Pass the documents through the spaCy-based splitter so the vector store
# receives chunked Document objects.
# NOTE(review): several schema names are longer than chunk_size=10
# (e.g. "pk_subscription"); confirm how the splitter treats over-long inputs
# and whether splitting single identifiers is wanted at all.
text_splitter = SpacyTextSplitter(
    chunk_size=10,    # maximum chunk size handed to the splitter
    chunk_overlap=0,  # no overlap between neighbouring chunks
)

# split_documents takes a list of Document objects and returns a new list of
# Document objects.
chunked_documents = text_splitter.split_documents(documents)
print(f"Split documents into {len(chunked_documents)} chunks.")

In [None]:
# Tell the reader up front that no persist_directory is passed, i.e. the store
# is intended to live only for the lifetime of this kernel.
print(
    "Initializing in-memory ChromaDB. Data will NOT be persisted to disk.",
    "This means the database will be cleared when the script finishes.",
    sep="\n",
)

# Embed every chunk and index it in a Chroma collection.
vectordb = Chroma.from_documents(
    embedding=embeddings,
    documents=chunked_documents,
)

print(
    "In-memory ChromaDB successfully initialized and documents are stored.",
    "You can now query your in-memory Chroma database.",
    sep="\n",
)


In [None]:
# The user's request in natural language.
query = "the cancel date is jun 15 and start date is jan 1 all are year 2025 make a json using this information"
print(f"\nPerforming similarity search for the query: '{query}'")
# Retrieve the four stored chunks closest to the query.
docs = vectordb.similarity_search(query, k=4)
print("Similarity search complete.")

if docs:
    print(f"\nFound {len(docs)} relevant document chunks:")
    for i, doc in enumerate(docs):
        print(f"\n--- Relevant Document Chunk {i+1} ---")
        print(f"Content: {doc.page_content}")

    print("\n--- SENDING DATA TO LOCAL LLM ---")

    # 1. Gather the context from the retrieved documents, one column per line.
    context = "\n".join(doc.page_content for doc in docs)

    # 2. Define the system prompt and the user query.
    system_prompt = "Your output MUST be a JSON object. Do not include any other text or explanation in your response. Based on the following database columns, identify the most relevant ones for the user's query."
    user_query = f"User Query: '{query}'\n\nRelevant Columns:\n{context}"

    # 3. Construct the full prompt for the model.
    full_prompt = f"{system_prompt}\n\n{user_query}"

    # 4. Build the JSON payload; "stream": False asks for one complete reply.
    api_url = "http://localhost:11434/api/generate"
    payload = {"model": "qwen3:0.6b", "prompt": full_prompt, "stream": False}

    print("\nSending the following payload to the local model:")
    print(json.dumps(payload, indent=2))

    # 5. Send the request. Transport/HTTP failures and an unparsable HTTP body
    #    are reported separately instead of being mislabelled as JSON-parsing
    #    errors by a single catch-all handler.
    response_json = None
    try:
        # (connect, read) timeouts: fail fast when the server is not running,
        # but leave generous time for generation. Without a timeout the cell
        # can block forever.
        response = requests.post(api_url, json=payload, timeout=(10, 600))
        response.raise_for_status()  # raises for 4xx / 5xx status codes
    except requests.exceptions.RequestException as e:
        print(f"Request to the local model failed: {e}")
    else:
        try:
            response_json = response.json()
        except ValueError as e:
            print(f"The local model returned a body that is not valid JSON: {e}")

    # 6. Process the response.
    if isinstance(response_json, dict):
        # The generated text is read from the 'response' key.
        model_output_str = response_json.get("response", "")

        print("\n--- RESPONSE FROM LOCAL LLM ---")
        print(f"Raw model output string: {model_output_str}")

        if not model_output_str:
            print("Model returned an empty response.")
        else:
            # The model may emit its reasoning inside <think>...</think> before
            # the answer; braces in that section would corrupt the extraction,
            # so only the text after the last closing tag is searched. When no
            # tag is present rpartition returns the whole string unchanged.
            answer_text = model_output_str.rpartition("</think>")[2]

            # Take the span from the first '{' to the last '}'.
            start_index = answer_text.find("{")
            end_index = answer_text.rfind("}")

            if start_index != -1 and end_index > start_index:
                json_substring = answer_text[start_index : end_index + 1]
                try:
                    structured_output = json.loads(json_substring)
                except json.JSONDecodeError:
                    print(
                        "\nError: The model's output contained a string that looked like JSON, but was invalid."
                    )
                    print(f"Attempted to parse: {json_substring}")
                else:
                    print("\nSuccessfully extracted and parsed JSON output:")
                    print(json.dumps(structured_output, indent=2))
            else:
                print(
                    "\nError: Could not find a valid JSON object within the model's output."
                )
    elif response_json is not None:
        print(
            f"Unexpected response shape from the local model: {type(response_json).__name__}"
        )

else:
    print("No relevant documents found to pass to the local model.")

print("\n--- LOCAL LLM INTERACTION COMPLETE ---")