In [None]:
%pwd

In [None]:
%cd ..

In [None]:
from app.services.RAG_service import RAGService

rag = RAGService()

In [None]:
rag.load_and_split_document(type="pdf", path = r"app\notebooks\KshitijResume.pdf")

In [None]:
len(rag.chunks)

In [None]:
import os
token = os.getenv("HF_TOKEN")
token

In [None]:
from huggingface_hub import login
import os
login()


In [None]:
from huggingface_hub import HfFolder
print(HfFolder.get_token())
token = HfFolder.get_token()

In [None]:
## downloading gemma3 270m from huggingface

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="google/gemma-3-270m-it", token = "")

In [None]:
pipe("hi? ")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-270m")
model = AutoModelForCausalLM.from_pretrained("google/gemma-3-270m")

In [None]:
from langchain_huggingface import HuggingFacePipeline

hf = HuggingFacePipeline.from_model_id(
    model_id="google/gemma-3-270m",
    task="text-generation"
)

In [None]:
llm = HuggingFacePipeline(pipeline=pipe)


In [None]:
from langchain.prompts import PromptTemplate

template = """Translate English to French: {text}"""
prompt = PromptTemplate(template=template, input_variables=["text"])

# Create a LangChain chain
chain = prompt | llm

# Run the chain
input_text = ""
response = chain.invoke({"text": "Can you give me structured outputs ?"})

print(response)


In [None]:
import requests

# Replace with your actual Space URL
url = "https://kshitijk20-ollama.hf.space/api/generate"

payload = {
    "model": "gemma3:270m",
    "prompt": "What are the benefits of small language models?",
    "stream": True
}

response = requests.post(url, json=payload)
# print(response.json().get("response"))
response


In [None]:


headers = {
    "Authorization": f"Bearer {hf_token}",
    "Content-Type": "application/json"
}

payload = {
    "model": "gemma3:270m",
    "prompt": "Hi",
    "stream": False  # Set to False for simple testing
}

response = requests.post(url, json=payload, headers=headers)

# CRITICAL: Print the text to see if the error is from Hugging Face or Ollama
if response.status_code == 404:
    print("Error Details:", response.text)
else:
    print(response.json().get("response"))


In [None]:
from langchain_community.llms import Ollama
import os

HF_TOKEN = os.getenv("HF_TOKEN")

llm = Ollama(
    base_url="https://kshitijk20-ollama.hf.space",
    model="gemma3:270m",
    headers={
        "Authorization": f"Bearer {HF_TOKEN}"
    }
)

print(llm.invoke("hi"))


In [None]:
from langchain_community.llms import Ollama
import os

HF_TOKEN = os.getenv("HF_TOKEN")

llm = Ollama(
    base_url="https://kshitijk20-ollama.hf.space",
    model="gemma3:1b",
    headers={
        "Authorization": f"Bearer {HF_TOKEN}"
    }
)

print(llm.invoke("hi"))


In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm2 = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    api_key=os.getenv("GEMINI_API_KEY")
)
llm.invoke("hi")


In [None]:
from langchain.chat_models import init_chat_model
from os import getenv
from dotenv import load_dotenv

load_dotenv()

# Initialize the model with OpenRouter's base URL
llm3 = init_chat_model(
    model="google/gemma-3-27b-it:free",
    model_provider="openai",
    base_url="https://openrouter.ai/api/v1",
    api_key=getenv("OPENROUTER_API_KEY"),
)

# Example usage
response = model.invoke("hi?")
print(response.content)


In [None]:
%pwd

In [None]:
%cd ..

In [None]:
from app.utils.model_loader import ModelLoader
from app.ingestion.file_loader import FileLoader
from app.ingestion.text_splitter import splitting_text
from app.retrieval.retriever import Retriever
from app.embedding.embeder import QueryEmbedding
from app.embedding.vectore_store import VectorStore
from app.metadata_extraction.metadata_ext import MetadataExtractor
from app.utils.metadata_utils import MetadataService
from langchain_core.documents import Document
import json
from langchain_community.retrievers import BM25Retriever
from langchain.schema import Document

# Global model instances (loaded once)
_embedding_model = None

def get_models():
    global  _embedding_model
    if _embedding_model is None:
        print("Loading models (one-time initialization)...")
        embedding_loader = ModelLoader(model_provider="huggingface")
        _embedding_model = embedding_loader.load_llm()
    return _embedding_model

class RAGService: 
    def __init__(self):
        print("[RAGService] Initializing service...")
        self._init_models()
        self.Docuement_Type = None 
        self.Pinecone_index = None
        self.Document_path = None
        self.Document_Type = None
        self.DocumentTypeScheme = None
        self.url = None
        self.chunks = None
        self.vector_store = None
        self.index = None
        self.namespace = None
        self.retriever = None
        self.metadataservice = MetadataService()
        print("[RAGService] Initialization complete.")

    def _init_models(self):
        """Initialize LLM and embedding Models"""
        print("[RAGService] Loading LLM model (openrouter)...")
        self.model_loader = ModelLoader(model_provider="openrouter")
        self.llm = self.model_loader.load_llm()
        # self.llm = llm3
        print("[RAGService] LLM model loaded.")
        print("[RAGService] Loading embedding model (huggingface)...")
        # self.model_loader = ModelLoader(model_provider="huggingface")
        self.embedding_model = get_models()
        print("[RAGService] Embedding model loaded.")

    def load_and_split_document(self, type:str, path:str= None, url:str = None):
        """Load and chunk document from local path or URL"""
        print(f"[RAGService] Loading document. Type: {type}, Path: {path}, URL: {url}")
        file_loader = FileLoader(llm = self.llm)
        if type == "pdf":
            if path:
                print(f"[RAGService] Loading PDF from path: {path}")
                doc = file_loader.load_pdf(path)
            elif url:
                print(f"[RAGService] Loading PDF from URL: {url}")
                doc = file_loader.load_documents_from_url(url)
            else:
                print("[RAGService] Error: Either path or url must be provided for PDF.")
                raise ValueError("Either path or url must be provided for PDF.")
        elif type == "word":
            if path:
                print(f"[RAGService] Loading Word document from path: {path}")
                doc = file_loader.load_word_document(path)
            elif url:
                print("[RAGService] Error: URL loading not supported for Word documents.")
                raise ValueError("URL loading not supported for Word documents.")
            else:
                print("[RAGService] Error: Path must be provided for Word document.")
                raise ValueError("Path must be provided for Word document.")
        else:
            print("[RAGService] Error: Unsupported document type.")
            raise ValueError("Unsupported document type. Use 'pdf' or 'word'.")
        
        print("[RAGService] Detecting document type scheme...")
        self.DocumentTypeScheme = file_loader.detect_document_type(doc[0:2])
        print(f"[RAGService] Document type scheme detected: {self.DocumentTypeScheme}")
        self.Document_Type = self.metadataservice.Return_document_model(self.DocumentTypeScheme)
        print(f"[RAGService] Document type model: {self.Document_Type}")
        ## 
        from datetime import datetime
        self.splitter = splitting_text(documentTypeSchema=self.Document_Type, llm=self.llm, embedding_model=self.embedding_model)
        print("[RAGService] Splitting document into chunks...")
        start_time = datetime.now()
        self.chunks = self.splitter.text_splitting(doc)
        end_time = datetime.now()
        print(f"[RAGService] Time taken to extract metadata with splitter: {end_time - start_time}")
        print(f"[RAGService] Total chunks created: {len(self.chunks)}")

    def create_query_embedding(self, query: str):
        print("[RAGService] Creating query embedding...")
        self.query = query
        self.query_embedder = QueryEmbedding(query=query, embedding_model=self.embedding_model)
        self.query_embedding = self.query_embedder.get_embedding()
        print(f"[RAGService] Query embedding created: {self.query_embedding}")
        langchain_doc = Document(page_content=query)
        print("[RAGService] Extracting metadata for the query...")
        self.metadataExtractor = MetadataExtractor(llm=self.llm)
        with open(self.splitter.Keywordsfile_path, "r") as f:
            known_keywords = json.load(f)
        raw_metadata = self.metadataExtractor.extractMetadata_query(self.Document_Type,langchain_doc, known_keywords = known_keywords)
        print(f"[RAGService] Query metadata extracted: {raw_metadata}")
        # Convert to dictionary and format for Pinecone
        metadata_dict = raw_metadata.model_dump(exclude_none=True)
        formatted_metadata = self.metadataservice.format_metadata_for_pinecone(metadata_dict)
        
        # Remove problematic fields that cause serialization issues
        self.query_metadata = {
            k: v for k, v in formatted_metadata.items() 
            if k not in ["obligations", "exclusions", "notes", "added_new_keyword"]
        }
    
        print(f"[RAGService] Query metadata type: {type(self.query_metadata)}")
        print(f"[RAGService] Query metadata: {self.query_metadata}")

    def create_vector_store(self):
        print("[RAGService] Creating vector store...")
        self.vector_store_class_instance = VectorStore(self.chunks, self.embedding_model)
        self.index, self.namespace, self.vector_store = self.vector_store_class_instance.create_vectorestore()
        print(f"[RAGService] Vector store created. Index: {self.index}, Namespace: {self.namespace}")
        ### Sparse Retriever(BM25)
        self.sparse_retriever=BM25Retriever.from_documents(self.chunks)
        self.sparse_retriever.k=3 ##top- k documents to retriever

        

    def retrive_documents(self, raw_query: str):
        print("[RAGService] Retrieving documents from vector store...")
        self.create_query_embedding(raw_query)
        
        self.retriever = Retriever(self.index,raw_query,self.query_metadata, self.namespace, self.vector_store,sparse_retriever = self.sparse_retriever,llm = self.llm)
        self.result = self.retriever.retrieval_from_pinecone_vectoreStore()
        # self.result = self.retriever.invoke(raw_query)
        # print(f"[RAGService] Retrieval result: {self.result}")
    
    def answer_query(self, raw_query:str) -> str:
        """Answer user query using retrieved documents and LLM"""
        print(f"[RAGService] Answering query: {raw_query}")
        # top_clause = self.result['matches']
        # top_clause_dicts = [r.to_dict() for r in top_clause]
        # self.top_clauses = top_clause_dicts
        # keys_to_remove = {"file_path", "source", "producer", "keywords", "subject", "added_new_keyword", "author", "chunk_id"}
        # for r in top_clause_dicts:
        #     meta = r.get("metadata", {})
        #     for k in keys_to_remove:
        #         meta.pop(k, None)

        # context_clauses = json.dumps(top_clause_dicts, separators=(",", ":"))
        context_clauses = [doc.page_content for doc in self.result]

        print(f"context_clauses: {context_clauses}")

        prompt = f"""
        You are a legal/insurance domain expert and policy analyst. 
        Use the following extracted clauses from policy documents to answer the question.  
        If you can't find the answer, say "I don't know".
        Context clauses:
        {"".join(context_clauses)}
        Question: {raw_query}
        """
        print("[RAGService] Invoking LLM with prompt...")
        response = self.llm.invoke(prompt)
        print(f"[RAGService] LLM response: {response}")
        
        # Extract string content from response object
        if hasattr(response, 'content'):
            return response.content
        elif isinstance(response, str):
            return response
        else:
            return str(response)

In [None]:
rag = RAGService()
rag.load_and_split_document(type="pdf", path = r"app\notebooks\KshitijResume.pdf")

In [None]:
print(rag.chunks[1].metadata)

In [22]:
chunk = """National Parivar Mediclaim Plus Policy
Whereas the Proposer designated in the schedule hereto has by a Proposal together with Declaration, which shall be the basis of
this contract and is deemed to be incorporated herein, has applied to National Insurance Company Ltd. (hereinafter called the
Company), for the insurance hereinafter set forth, in respect of person(s)/ family members named in the schedule hereto
(hereinafter called the Insured Persons) and has paid the premium as consideration for such insurance.
1 PREAMBLE
The Company undertakes that if during the Policy Period, any Insured Person shall suffer any illness or disease (hereinafter called
Illness) or sustain any bodily injury due to an Accident (hereinafter called Injury) requiring Hospitalisation of such Insured
Person(s) for In-Patient Care at any hospital/nursing home (hereinafter called Hospital) or for Day Care Treatment at any Day
Care Center or to undergo treatment under Domiciliary Hospitalisation, following the Medical Advice of a duly qualified Medical
Practitioner, the Company shall indemnify the Hospital or the Insured, Reasonable and Customary Charges incurred for Medically
Necessary Treatment towards the Coverage mentioned herein.
Provided further that, the amount payable under the Policy in respect of all such claims during each Policy Year of the Policy
Period shall be subject to the Definitions, Terms, Exclusions, Conditions contained herein and limits as shown in the Table of
Benefits, and shall not exceed the Floater Sum Insured in respect of the Insured family.
"""

In [3]:
!uv pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl

[2mUsing Python 3.12.4 environment at: C:\code\Claridoc\Rag_app\.venv[0m
[2mResolved [1m43 packages[0m [2min 5.85s[0m[0m
[36m[1mDownloading[0m[39m thinc [2m(1.4MiB)[0m
[36m[1mDownloading[0m[39m numpy [2m(14.8MiB)[0m
[36m[1mDownloading[0m[39m en-core-web-sm [2m(12.2MiB)[0m
 [32m[1mDownloading[0m[39m en-core-web-sm
 [32m[1mDownloading[0m[39m thinc
 [32m[1mDownloading[0m[39m numpy
[2mPrepared [1m5 packages[0m [2min 9.96s[0m[0m
[2mUninstalled [1m4 packages[0m [2min 2.28s[0m[0m
[2mInstalled [1m8 packages[0m [2min 2.00s[0m[0m
 [31m-[39m [1mblis[0m[2m==1.3.3[0m
 [32m+[39m [1mblis[0m[2m==0.7.11[0m
 [32m+[39m [1men-core-web-sm[0m[2m==3.7.1 (from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl)[0m
 [32m+[39m [1mlangcodes[0m[2m==3.5.1[0m
 [31m-[39m [1mnumpy[0m[2m==2.3.2[0m
 [32m+[39m [1mnumpy[0m[2m==1.26.4[0m
 [32m+[39m [1mshellingha

In [2]:
!uv run python -m spacy download en_core_web_sm

C:\code\Claridoc\Rag_app\.venv\Scripts\python.exe: No module named pip


In [23]:
import spacy

nlp = spacy.load("en_core_web_sm")
# nlp
# 

In [4]:
nlp

<spacy.lang.en.English at 0x1efcce195e0>

In [5]:
doc = nlp(chunk)

In [16]:
out = {
        "candidate_phrases": [],
        "parties": [],
        "jurisdiction": []
    }

In [None]:
for chunk in doc.noun_chunks:
    phrase = chunk.text.strip()
    if len(phrase) > 2 and not phrase.lower().startswith(("this", "those", "that", "these")):
        out["candidate_phrases"].append(phrase)
    
out

In [24]:
from collections import defaultdict

# how many documents processed
TOTAL_DOCS = 0

# phrase -> in how many docs it appeared
PHRASE_DOC_FREQ = defaultdict(int)


In [26]:
chunk

'National Parivar Mediclaim Plus Policy\nWhereas the Proposer designated in the schedule hereto has by a Proposal together with Declaration, which shall be the basis of\nthis contract and is deemed to be incorporated herein, has applied to National Insurance Company Ltd. (hereinafter called the\nCompany), for the insurance hereinafter set forth, in respect of person(s)/ family members named in the schedule hereto\n(hereinafter called the Insured Persons) and has paid the premium as consideration for such insurance.\n1 PREAMBLE\nThe Company undertakes that if during the Policy Period, any Insured Person shall suffer any illness or disease (hereinafter called\nIllness) or sustain any bodily injury due to an Accident (hereinafter called Injury) requiring Hospitalisation of such Insured\nPerson(s) for In-Patient Care at any hospital/nursing home (hereinafter called Hospital) or for Day Care Treatment at any Day\nCare Center or to undergo treatment under Domiciliary Hospitalisation, followi

In [27]:
import spacy
nlp = spacy.load("en_core_web_sm")


def extract_candidate_phrases(text: str):
    doc = nlp(text)
    phrases = []

    for chunk in doc.noun_chunks:
        head = chunk.root

        # ---- syntactic strength filter ----
        if head.pos_ not in {"NOUN", "PROPN"}:
            continue

        phrase = " ".join(chunk.text.split()).lower()

        # length sanity
        if len(phrase) < 4:
            continue

        phrases.append(phrase)

    return list(set(phrases))
def update_phrase_stats(phrases: list[str]):
    global TOTAL_DOCS
    TOTAL_DOCS += 1

    for p in set(phrases):
        PHRASE_DOC_FREQ[p] += 1
def is_informative_phrase(phrase: str, max_doc_ratio: float = 0.3):
    """
    Drops phrases that appear in too many documents.
    Equivalent to low-IDF phrases.
    """
    if TOTAL_DOCS == 0:
        return True

    doc_ratio = PHRASE_DOC_FREQ[phrase] / TOTAL_DOCS
    return doc_ratio <= max_doc_ratio
def filter_phrases_for_embeddings(phrases: list[str]):
    filtered = []

    for p in phrases:
        if is_informative_phrase(p):
            filtered.append(p)

    return filtered
def process_chunk(text: str):
    # 1. extract
    phrases = extract_candidate_phrases(text)

    # 2. update corpus stats
    update_phrase_stats(phrases)

    # 3. filter
    final_phrases = filter_phrases_for_embeddings(phrases)

    return final_phrases
print(process_chunk(chunk))


[]


In [None]:
from typing import Dict, List

def spacy_extract_from_chunk(text: str) -> Dict[str, List[str]]:
    """
    Extract candidate phrases + entities from ONE chunk using spaCy.
    This is meant to run before embeddings.
    """
    doc = nlp(text)

    out = {
        "candidate_phrases": [],
        "parties": [],
        "jurisdiction": []
    }

    return doc 




In [None]:
# ---- 1️⃣ Candidate phrases (noun chunks) ----
    for chunk in doc.noun_chunks:
        phrase = chunk.text.strip()
        # light cleaning
        if len(phrase) > 2 and not phrase.lower().startswith(("this", "that", "these", "those")):
            out["candidate_phrases"].append(phrase)

    # ---- 2️⃣ Named entities ----
    for ent in doc.ents:
        if ent.label_ in ("ORG", "PERSON"):
            out["parties"].append(ent.text)
        elif ent.label_ in ("GPE", "LOC"):
            out["jurisdiction"].append(ent.text)

    # ---- de-duplicate ----
    for k in out:
        out[k] = list(set(out[k]))

    return out