In [4]:
import os

from dotenv import load_dotenv

# Load .env first so the lookup below can see values defined there.
load_dotenv(dotenv_path=".env", override=True)

# Show only whether the key is present; never echo the secret into cell output.
"OPENAI_API_KEY" in os.environ

True

In [None]:
# from utils.amazon_ocr import TextractPDFProcessor

# # 1. Initialize processor
# processor = TextractPDFProcessor(
#     region_name="ap-south-1",
#     poll_interval=5
# )

# # 2. Define S3 location of MCA PDF
# bucket_name = "textract-input-happy"
# object_key = "CONCOR Agreement for operations_compressed.pdf"

# # 3. Process PDF
# pagewise_output = processor.process_pdf(
#     bucket=bucket_name,
#     key=object_key
# )

# # 4. Use the output (example)
# for page_no, lines in pagewise_output.items():
#     print(f"\n===== PAGE {page_no} =====")
#     for line in lines:
#         print(line["text"])


In [None]:
from pdf2image import convert_from_path

def pdf_to_images(pdf_path):
    """Render the PDF at ``pdf_path`` to images at 300 DPI.

    Returns whatever ``convert_from_path`` (pdf2image) produces for the file.
    """
    return convert_from_path(pdf_path, dpi=300)

import pytesseract
import cv2
import numpy as np
from PIL import Image

def preprocess_image(img: Image.Image):
    """Turn a PIL page image into a binarised grayscale array for OCR.

    Args:
        img: Page image as a PIL ``Image`` (colour channels in RGB order).

    Returns:
        A 2-D uint8 array containing only 0 and 255, thresholded with Otsu's method.
    """
    rgb = np.array(img)
    # PIL yields RGB channel order; BGR2GRAY would apply the red/blue luminance
    # weights to the wrong channels.
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
    # With THRESH_OTSU the threshold is derived from the histogram and the
    # supplied value is ignored, so pass 0 rather than a misleading constant.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary

def ocr_image(img):
    """Run Tesseract on ``img`` with ``--oem 3 --psm 6`` and return the recognised text."""
    return pytesseract.image_to_string(img, config="--oem 3 --psm 6")

def extract_text_from_scanned_pdf(pdf_path):
    """OCR every page of a scanned PDF and return the text as a single string.

    Each page's text is preceded by a ``--- Page N ---`` marker (N is 1-based).
    """
    pieces = []
    for page_no, page_img in enumerate(pdf_to_images(pdf_path), start=1):
        page_text = ocr_image(preprocess_image(page_img))
        pieces.append(f"\n\n--- Page {page_no} ---\n" + page_text)
    return "".join(pieces)


In [None]:
# Scanned agreement to OCR; the path is resolved against the notebook's working directory.
pdf_path = "CONCOR Agreement for operations_compressed.pdf"
# NOTE(review): the recorded run of this cell failed with PDFInfoNotInstalledError
# ("Is poppler installed and in PATH?") — poppler must be available before re-running.
text = extract_text_from_scanned_pdf(pdf_path)

# Persist the raw OCR text as UTF-8.
with open("output.txt", "w", encoding="utf-8") as f:
    f.write(text)

print("OCR completed.")


PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?

# high 2026 OCR

### Run the cell below to install the required libraries

In [None]:
# !pip install transformers torch
# !pip install pdf2image pytesseract opencv-python numpy pillow openai

In [None]:
from pdf2image import convert_from_path
import pytesseract, cv2, numpy as np
from PIL import Image
from openai import OpenAI
from tqdm import tqdm

client = OpenAI()

# ---------------- PDF to Images ----------------
def pdf_to_images(pdf_path):
    """Render the PDF at ``pdf_path`` to page images at 300 DPI via ``convert_from_path``."""
    page_images = convert_from_path(pdf_path, dpi=300)
    return page_images

# ---------------- Image Preprocessing ----------------
def preprocess_image(img: Image.Image):
    """Turn a PIL page image into a binarised grayscale array for OCR.

    Args:
        img: Page image as a PIL ``Image`` (colour channels in RGB order).

    Returns:
        A 2-D uint8 array containing only 0 and 255, thresholded with Otsu's method.
    """
    rgb = np.array(img)
    # PIL yields RGB channel order; BGR2GRAY would apply the red/blue luminance
    # weights to the wrong channels.
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
    # With THRESH_OTSU the threshold is derived from the histogram and the
    # supplied value is ignored, so pass 0 rather than a misleading constant.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary

# ---------------- Tesseract OCR ----------------
def ocr_image(img):
    """Run Tesseract on ``img`` and return the recognised text."""
    tesseract_flags = "--oem 3 --psm 6"
    recognised = pytesseract.image_to_string(img, config=tesseract_flags)
    return recognised

# ---------------- GPT Cleanup ----------------
def gpt_clean_page(text):
    """Ask the chat model to repair OCR noise in one page of text.

    Args:
        text: Raw OCR text for a single page.

    Returns:
        The cleaned text as a string. Empty or whitespace-only input is
        returned as-is (``""`` for ``None``) without calling the API, and an
        empty model reply yields ``""``.
    """
    # Nothing to clean: skip the network round-trip for blank pages.
    if not text or not text.strip():
        return text or ""

    prompt = f"""
You are cleaning OCR errors in a scanned legal document.
Fix spelling, broken words, wrong line breaks.
Do NOT change meaning or delete clauses.
only output the cleaned text.
no additional commentary.

TEXT:
{text}
"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        # The task forbids changing meaning, so decode deterministically
        # instead of sampling at temperature 1.
        temperature=0
    )
    # `content` can be None; callers concatenate the result, so always return str.
    return response.choices[0].message.content or ""

# ---------------- Full Pipeline ----------------
def extract_text_from_scanned_pdf(pdf_path):
    """OCR each page of a scanned PDF, clean it with ``gpt_clean_page`` and join the pages.

    Each page's cleaned text is preceded by a ``--- Page N ---`` marker
    (N is 1-based). Progress is reported through ``tqdm``.
    """
    sections = []
    pages = pdf_to_images(pdf_path)
    for page_no, page_img in enumerate(tqdm(pages, desc="Processing pages"), start=1):
        binarised = preprocess_image(page_img)
        cleaned = gpt_clean_page(ocr_image(binarised))
        sections.append(f"\n\n--- Page {page_no} ---\n" + cleaned)
    return "".join(sections)

# # ---------------- Run ----------------
# if __name__ == "__main__":
#     pdf_path = "CONCOR Agreement for operations_compressed.pdf"

#     final_text = extract_text_from_scanned_pdf(pdf_path)

#     with open("output_clean.txt", "w", encoding="utf-8") as f:
#         f.write(final_text)

#     print("OCR + GPT cleanup completed to output_clean.txt")


In [None]:
from pdf2image import convert_from_path
import pytesseract, cv2, numpy as np
from PIL import Image
from openai import OpenAI
from groq import Groq
from tqdm import tqdm

# client = OpenAI()
# Take the key from the environment (e.g. loaded from .env in the first cell)
# instead of committing a literal to the notebook; a missing key fails fast
# with KeyError.
client = Groq(api_key=os.environ["GROQ_API_KEY"])

# ---------------- PDF to Images ----------------
def pdf_to_images(pdf_path):
    """Render the PDF at ``pdf_path`` to page images using ``convert_from_path``."""
    render_dpi = 300
    return convert_from_path(pdf_path, dpi=render_dpi)

# ---------------- Image Preprocessing ----------------
def preprocess_image(img: Image.Image):
    """Turn a PIL page image into a binarised grayscale array for OCR.

    Args:
        img: Page image as a PIL ``Image`` (colour channels in RGB order).

    Returns:
        A 2-D uint8 array containing only 0 and 255, thresholded with Otsu's method.
    """
    rgb = np.array(img)
    # PIL yields RGB channel order; BGR2GRAY would apply the red/blue luminance
    # weights to the wrong channels.
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
    # With THRESH_OTSU the threshold is derived from the histogram and the
    # supplied value is ignored, so pass 0 rather than a misleading constant.
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary

# ---------------- Tesseract OCR ----------------
def ocr_image(img):
    """Return the text Tesseract recognises in ``img`` (flags: ``--oem 3 --psm 6``)."""
    options = "--oem 3 --psm 6"
    text = pytesseract.image_to_string(img, config=options)
    return text

# ---------------- GPT Cleanup ----------------
def gpt_clean_page(text, model="llama-3.1-8b-instant"):
    """Ask the chat model to repair OCR noise in one page of text.

    Args:
        text: Raw OCR text for a single page.
        model: Model name sent to the chat-completions endpoint. The default
            is the Groq-hosted model this notebook uses elsewhere; the client
            created in this cell is a Groq client, which is not an OpenAI
            endpoint for "gpt-4o".

    Returns:
        The cleaned text as a string. Empty or whitespace-only input is
        returned as-is (``""`` for ``None``) without calling the API, and an
        empty model reply yields ``""``.
    """
    # Nothing to clean: skip the network round-trip for blank pages.
    if not text or not text.strip():
        return text or ""

    prompt = f"""
You are cleaning OCR errors in a scanned legal document.
Fix spelling, broken words, wrong line breaks.
Do NOT change meaning or delete clauses.
only output the cleaned text.
no additional commentary.

TEXT:
{text}
"""
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        # The task forbids changing meaning, so decode deterministically
        # instead of sampling at temperature 1.
        temperature=0
    )
    # `content` can be None; callers concatenate the result, so always return str.
    return response.choices[0].message.content or ""

# ---------------- Full Pipeline ----------------
def extract_text_from_scanned_pdf(pdf_path):
    """OCR each page of a scanned PDF and join the raw page texts.

    Each page's text is preceded by a ``--- Page N ---`` marker (N is 1-based).
    The per-page GPT cleanup is not applied in this variant; the raw Tesseract
    output is returned.
    """
    sections = []
    for page_no, page_img in enumerate(
        tqdm(pdf_to_images(pdf_path), desc="Processing pages"), start=1
    ):
        raw_page = ocr_image(preprocess_image(page_img))
        sections.append(f"\n\n--- Page {page_no} ---\n" + raw_page)
    return "".join(sections)


# ---------------- Run ----------------
# Driver for the OCR pipeline defined above; the recorded output shows it
# processing 35 pages.
if __name__ == "__main__":
    pdf_path = "GCT Pathri_Concor_compressed.pdf"

    # Raw OCR text only; `final_text` is read again by the chunk-cleaning cell below.
    final_text = extract_text_from_scanned_pdf(pdf_path)
    # cleaned_text = gpt_clean_page(final_text)

    # with open("output_clean_fullClean.txt", "w", encoding="utf-8") as f:
    #     f.write(final_text)

    # print("OCR + GPT cleanup completed to output_clean.txt")

Processing pages: 100%|██████████| 35/35 [01:02<00:00,  1.80s/it]


In [None]:
import tiktoken

def chunk_text(text, max_tokens=2000, model="gpt-4"):
    """Split ``text`` into consecutive pieces of at most ``max_tokens`` tokens.

    The tokenizer is looked up from ``model``; if tiktoken does not recognise
    the name (it raises ``KeyError``), the ``cl100k_base`` encoding is used.
    Returns the decoded chunk strings in document order.
    """
    try:
        encoder = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model names (e.g. Groq-hosted Llama) fall back to a standard encoding.
        encoder = tiktoken.get_encoding("cl100k_base")

    token_ids = encoder.encode(text)
    return [
        encoder.decode(token_ids[offset:offset + max_tokens])
        for offset in range(0, len(token_ids), max_tokens)
    ]

def gpt_clean_chunks(text, model="gpt-4"):
    """Clean OCR text chunk by chunk with the chat model and re-join the results.

    The text is split with ``chunk_text`` (2000 tokens per chunk), each chunk
    is sent with a strict "clean, do not rewrite" prompt, and the replies are
    joined with newlines.

    Args:
        text: Full OCR text of the document.
        model: Model name, used both for tokenizer lookup and for the API call.

    Returns:
        The cleaned chunks joined by ``"\\n"``.
    """
    chunks = chunk_text(text, max_tokens=2000, model=model)
    cleaned_chunks = []

    for chunk in tqdm(chunks, desc="Cleaning chunks"):
        prompt = f"""
You are cleaning OCR output from a scanned legal agreement (Master Concession Agreement).

THIS IS NOT A SUMMARY OR REWRITE TASK.

IMPORTANT CONTEXT:
- The input text MAY BE INCOMPLETE.
- The text may start or end mid-sentence or mid-clause.
- The text may contain OCR-induced structural errors.

STRUCTURAL WARNING (CRITICAL):
- ARTICLE numbers or clause numbers may be missing, incorrect, or inconsistent due to OCR.
- Do NOT renumber, reorder, normalize, or "correct" ARTICLE or clause numbering.
- Preserve numbering EXACTLY as it appears in the input.
- Do NOT infer the correct ARTICLE number even if it seems obvious.

EXAMPLE (DO NOT FIX STRUCTURE):
If the input contains:
"ARTICLES
SCOPE OF CONCESSION

4.1 Concession
...
3.1.3 The right to operate...
...
44.4 Rate of Rail Terminal"

Even if it appears that:
- This section should be "ARTICLE 3 – SCOPE OF CONCESSION"
- "44.4" likely means "4.4" or "3.4.4"

YOU MUST:
- Keep "ARTICLES SCOPE OF CONCESSION" exactly as written
- Keep clause numbers (4.1, 3.1.3, 44.4) unchanged
- Only clean spelling, broken words, and incorrect line breaks

YOU MUST NOT:
- Insert a missing ARTICLE number
- Renumber clauses
- Correct clause sequencing
- Replace "44.4" with a guessed number


PRIMARY GOAL:
Correct OCR noise while preserving the legal structure and meaning EXACTLY as provided.

DOCUMENT STRUCTURE:
- ARTICLE headings appear as: "ARTICLE 1", "ARTICLE 2", etc.
- Clauses are numbered hierarchically: 1.1, 1.1.1, 4.6.2, etc.
- Clause numbers and ARTICLE headings are authoritative anchors.

STRICT RULES (NON-NEGOTIABLE):
- Do NOT add, infer, reconstruct, or complete missing text.
- Do NOT guess how a sentence, clause, or ARTICLE continues.
- Do NOT merge content across clause boundaries.
- Do NOT split clauses into new ones.
- Do NOT change clause numbering or ordering.
- Do NOT modify defined terms or legal capitalization.

ALLOWED CLEANING ACTIONS ONLY:
- Fix OCR spelling errors.
- Join words broken due to line breaks.
- Remove incorrect line breaks inside the SAME sentence or clause.
- Preserve paragraph breaks BETWEEN clauses.
- Leave truncated sentences AS-IS without attempting to complete them.
- Remove special tokens or annotations.

OUTPUT REQUIREMENTS:
- Output ONLY the cleaned legal text.
- Preserve ARTICLE headings and clause numbers exactly.
- Do NOT include explanations, comments, or formatting markers.


TEXT TO CLEAN:
{chunk}
"""
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            # The prompt forbids any rewriting, so sample deterministically
            # (this also matches detect_clause_boundaries, which uses 0.0).
            temperature=0.0
        )
        # `content` may be None; "\n".join below needs strings.
        cleaned_chunks.append(response.choices[0].message.content or "")

    return "\n".join(cleaned_chunks)

cleaned_text = gpt_clean_chunks(final_text, model="llama-3.1-8b-instant")

Cleaning chunks: 100%|██████████| 4/4 [03:17<00:00, 49.39s/it]


In [None]:
import os

from utils.ocr_clean import OCRCleaner

ocr = OCRCleaner(
    # Read the key from the environment instead of committing a literal to the
    # notebook; a missing key fails fast with KeyError.
    groq_api_key=os.environ["GROQ_API_KEY"],
    model="llama-3.1-8b-instant"
)

final_text = ocr.run("mca_pdf/GCT Pathri_Concor_compressed.pdf")

with open("output_clean__exp1.txt", "w", encoding="utf-8") as f:
    f.write(final_text)


OCR Pages: 100%|██████████| 35/35 [01:04<00:00,  1.83s/it]
Cleaning Chunks: 100%|██████████| 10/10 [04:14<00:00, 25.46s/it]


In [None]:
with open("output_clean_chunked.txt","w", encoding="utf-8") as f:
    f.write(cleaned_text)

In [None]:
# Reload the cleaned text saved in output_clean_chunked.txt so later cells can
# run without repeating the OCR + cleanup steps.
with open("output_clean_chunked.txt", "r", encoding="utf-8") as f:
    cleaned_text = f.read()

In [None]:
# cleaned_text = gpt_clean_chunks(final_text, model="openai-gpt-oss-120b")

In [None]:
CLAUSE_BOUNDARY_PROMPT = """
You are a legal clause boundary detection engine.

TASK:
Identify clause boundaries in the given legal text.

OUTPUT REQUIREMENTS:
Return STRICTLY valid JSON.
No markdown.
No explanations.
No comments.

OUTPUT FORMAT:
{
  "clauses": [
    {
      "clause_number": "string",
      "start_index": integer,
      "end_index": integer
    }
  ]
}

RULES:
- Detect clauses using numbering patterns like:
  1
  1.1
  1.1.1
  2
  2.3
  10
- Clause numbering must match EXACTLY as in the text.
- start_index and end_index are CHARACTER OFFSETS in the input text.
- end_index must be exclusive.
- Do NOT extract clause text.
- Do NOT infer missing clauses.
- Do NOT merge clauses.
- Ignore page numbers, headers, footers, watermarks.

INPUT TEXT:
<<<TEXT>>>
"""

def build_clause_boundary_prompt(text: str) -> str:
    """Fill the ``<<<TEXT>>>`` placeholder of ``CLAUSE_BOUNDARY_PROMPT`` with ``text``."""
    # Splitting on the marker and re-joining with the text substitutes every occurrence.
    template_parts = CLAUSE_BOUNDARY_PROMPT.split("<<<TEXT>>>")
    return text.join(template_parts)


In [None]:
import tiktoken

def token_len(text, model="cl100k_base"):
    """Return the number of tokens in ``text``.

    ``model`` is passed to ``tiktoken.get_encoding``, so it names an encoding
    (default ``cl100k_base``).
    """
    encoding = tiktoken.get_encoding(model)
    token_ids = encoding.encode(text)
    return len(token_ids)

def chunk_with_overlap(text, max_tokens=3000, overlap_tokens=200):
    """Split ``text`` into token windows that overlap their predecessor.

    Args:
        text: Text to split (tokenised with the ``cl100k_base`` encoding).
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap_tokens: Tokens shared between consecutive chunks; must satisfy
            ``0 <= overlap_tokens < max_tokens``.

    Returns:
        Decoded chunk strings in document order (``[]`` for empty text).

    Raises:
        ValueError: If the window would not advance (previously an endless loop).
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be positive")
    if not 0 <= overlap_tokens < max_tokens:
        raise ValueError("overlap_tokens must satisfy 0 <= overlap_tokens < max_tokens")

    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)

    chunks = []
    stride = max_tokens - overlap_tokens
    for start in range(0, len(tokens), stride):
        end = start + max_tokens
        chunks.append(enc.decode(tokens[start:end]))
        # Stop once a window reaches the end; stepping again would only emit
        # a trailing chunk made entirely of already-covered overlap.
        if end >= len(tokens):
            break

    return chunks


In [None]:
import json
import re

def _clause_sort_key(clause_number):
    """Natural-order key: numeric parts compare as ints and sort before text parts."""
    # Tagging each part keeps int and str from ever being compared directly,
    # which raises TypeError in Python 3 (e.g. "4.a" against "4.1").
    return tuple(
        (0, int(part)) if part.isdecimal() else (1, part)
        for part in str(clause_number).split(".")
    )


def deduplicate_and_sort_clauses(clauses):
    """Drop duplicate clause records and order them by clause number.

    Args:
        clauses: Iterable of dicts with at least ``clause_number`` and
            ``start_index``. Entries that are not dicts or lack either key
            (possible in LLM output) are skipped.

    Returns:
        A list holding the first record seen for each
        ``(clause_number, start_index)`` pair, sorted so that "4.2" precedes
        "4.10" and numeric parts precede non-numeric ones.
    """
    seen = {}
    for c in clauses:
        if not isinstance(c, dict) or "clause_number" not in c or "start_index" not in c:
            continue
        key = (str(c["clause_number"]), c["start_index"])
        seen.setdefault(key, c)

    return sorted(seen.values(), key=lambda c: _clause_sort_key(c["clause_number"]))


def extract_json_from_response(text):
    """Pull the JSON payload out of an LLM reply.

    Tried in order: an object inside a fenced code block (with or without a
    ``json`` tag), then the widest ``{...}`` span in the reply. If neither is
    found the reply is returned unchanged.
    """
    fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
    if fenced is not None:
        return fenced.group(1)

    bare = re.search(r'\{.*\}', text, re.DOTALL)
    return bare.group(0) if bare is not None else text


def detect_clause_boundaries(text, client, model):
    """Detect clause boundaries in ``text`` by prompting an LLM chunk by chunk.

    The text is split with ``chunk_with_overlap``; each chunk is sent with the
    clause-boundary prompt and the JSON reply is parsed. The prompt asks for
    offsets relative to the text it was shown (the chunk), so each chunk's
    offsets are shifted by the chunk's position in ``text`` before merging.

    Args:
        text: Full document text.
        client: Chat client exposing ``chat.completions.create``.
        model: Model name passed to the client.

    Returns:
        The merged clause dicts after ``deduplicate_and_sort_clauses``.
        Chunks whose reply is not parseable JSON are reported and skipped.
    """
    chunks = chunk_with_overlap(text)
    all_clauses = []
    search_from = 0  # lower bound for locating the next chunk inside `text`

    for i, chunk in enumerate(tqdm(chunks,desc="processing the chunks in document")):
        chunk_offset = text.find(chunk, search_from)
        if chunk_offset == -1:
            # Token-boundary decoding can alter a chunk's edge characters.
            print(f"Warning: Could not locate chunk {i} in the source text; its offsets stay chunk-relative.")
            chunk_offset = 0
        else:
            # Chunks overlap, so the next one starts after this start, not after this end.
            search_from = chunk_offset + 1

        prompt = build_clause_boundary_prompt(chunk)

        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0
        )

        # `content` may be None; the regex/JSON helpers need a string.
        response_text = response.choices[0].message.content or ""

        try:
            # Try to extract JSON from response
            json_str = extract_json_from_response(response_text)
            chunk_result = json.loads(json_str)
        except json.JSONDecodeError as e:
            print(f"Warning: Could not parse JSON from chunk {i}. Error: {str(e)[:100]}")
            print(f"Response preview: {response_text[:200]}")
            continue

        # Valid JSON is not necessarily the requested object shape.
        if not isinstance(chunk_result, dict):
            continue

        for c in chunk_result.get("clauses") or []:
            if not isinstance(c, dict):
                continue
            rebased = dict(c)
            for field in ("start_index", "end_index"):
                if isinstance(rebased.get(field), int):
                    rebased[field] += chunk_offset
            all_clauses.append(rebased)

    return deduplicate_and_sort_clauses(all_clauses)

In [None]:
final_text01 = detect_clause_boundaries(cleaned_text, client, model="llama-3.1-8b-instant")

Response preview: {
  "clauses": [
    {
      "clause_number": "1",
      "start_index": 0,
      "end_index": 5
    },
    {
      "clause_number": "1",
      "start_index": 6,
      "end_index": 12
    },
    {
    
Response preview: {
  "clauses": [
    {
      "clause_number": "1",
      "start_index": 0,
      "end_index": 5
    },
    {
      "clause_number": "1.1",
      "start_index": 6,
      "end_index": 11
    },
    {
  
Response preview: {
  "clauses": [
    {
      "clause_number": "1",
      "start_index": 0,
      "end_index": 6
    },
    {
      "clause_number": "2",
      "start_index": 7,
      "end_index": 13
    },
    {
    


In [None]:
cleaned_text[1291:1296]

'argo '

In [None]:
final_text01

[{'clause_number': '1', 'start_index': 0, 'end_index': 5},
 {'clause_number': '2', 'start_index': 6, 'end_index': 13},
 {'clause_number': '3', 'start_index': 246, 'end_index': 249},
 {'clause_number': '3.1', 'start_index': 1045, 'end_index': 1050},
 {'clause_number': '4', 'start_index': 1101, 'end_index': 1104},
 {'clause_number': '4.1', 'start_index': 1105, 'end_index': 1110},
 {'clause_number': '4.1.1', 'start_index': 1121, 'end_index': 1126},
 {'clause_number': '4.1.2', 'start_index': 1131, 'end_index': 1136},
 {'clause_number': '4.1.3', 'start_index': 1141, 'end_index': 1146},
 {'clause_number': '4.2', 'start_index': 1151, 'end_index': 1156},
 {'clause_number': '4.2.1', 'start_index': 1163, 'end_index': 1168},
 {'clause_number': '4.2.24', 'start_index': 1231, 'end_index': 1236},
 {'clause_number': '4.3', 'start_index': 1241, 'end_index': 1246},
 {'clause_number': '4.5', 'start_index': 1261, 'end_index': 1266},
 {'clause_number': '4.5.1', 'start_index': 1281, 'end_index': 1286},
 {'

In [None]:
# NOTE(review): `struct_text` is not assigned in any visible cell of this
# notebook; on a fresh kernel this raises NameError — confirm where it comes from.
with open("struct_text.txt","w", encoding="utf-8") as f:
    f.write(struct_text)

# .txt to json structured output

In [None]:
def load_text(path):
    """Read ``path`` as UTF-8 text and return its contents.

    Bytes that are not valid UTF-8 are dropped (``errors="ignore"``).
    """
    with open(path, mode="r", encoding="utf-8", errors="ignore") as source:
        contents = source.read()
    return contents
    
import re

# Matches an "ARTICLE <roman numeral / digits / #>" heading (case-insensitive).
# Group 1 is the heading; group 2 is everything up to the next heading or end of text.
ARTICLE_PATTERN = re.compile(
    r"(ARTICLE\s+[IVX0-9#]+)(.*?)((?=ARTICLE\s+[IVX0-9#]+)|\Z)",
    flags=re.IGNORECASE | re.DOTALL,
)

def extract_articles(text):
    """Split ``text`` into articles.

    Returns one dict per ``ARTICLE_PATTERN`` match, in order, with the
    stripped heading under ``article_id`` and the stripped body under ``raw_text``.
    """
    return [
        {
            "article_id": found.group(1).strip(),
            "raw_text": found.group(2).strip(),
        }
        for found in ARTICLE_PATTERN.finditer(text)
    ]

def extract_article_title(article_text):
    """Return the first all-uppercase line among the first five lines, or ``None``.

    Lines are compared after stripping whitespace; a line qualifies only if it
    is uppercase and longer than five characters.
    """
    leading_lines = (raw.strip() for raw in article_text.splitlines()[:5])
    return next(
        (candidate for candidate in leading_lines
         if candidate.isupper() and len(candidate) > 5),
        None,
    )

# A clause starts with a dotted number such as 4.1 or 4.1.2 (first part one or
# two digits) followed by whitespace; its text runs until the next dotted
# number at the start of a line, or the end of the input.
CLAUSE_PATTERN = re.compile(
    r"(?P<id>\d{1,2}(\.\d+)+)\s+(?P<text>.*?)(?=(\n\d{1,2}(\.\d+)+\s)|\Z)",
    flags=re.DOTALL,
)

def extract_clauses(article_text):
    """Return one record per clause matched by ``CLAUSE_PATTERN`` in ``article_text``.

    Each record has ``clause_id``, the stripped ``raw_text``, and
    ``clean_text`` / ``confidence`` set to ``None``.
    """
    return [
        {
            "clause_id": found.group("id"),
            "raw_text": found.group("text").strip(),
            "clean_text": None,
            "confidence": None,
        }
        for found in CLAUSE_PATTERN.finditer(article_text)
    ]

def build_structured_doc(text, document_id):
    """Assemble the article/clause structure for a document.

    Returns a dict with ``document_id`` and an ``articles`` list; each article
    holds its ``article_id``, the ``title`` found by ``extract_article_title``
    and the ``clauses`` found by ``extract_clauses``.
    """
    def _to_entry(article):
        body = article["raw_text"]
        heading = extract_article_title(body)
        found_clauses = extract_clauses(body)
        return {
            "article_id": article["article_id"],
            "title": heading,
            "clauses": found_clauses,
        }

    parsed_articles = extract_articles(text)
    return {
        "document_id": document_id,
        "articles": [_to_entry(article) for article in parsed_articles],
    }

import json

def save_json(data, path):
    """Write ``data`` to ``path`` as UTF-8 JSON, indented by two spaces.

    Non-ASCII characters are written as-is rather than escaped.
    """
    with open(path, mode="w", encoding="utf-8") as sink:
        json.dump(data, fp=sink, ensure_ascii=False, indent=2)

# Parse the cleaned OCR text into articles/clauses and save the result as JSON.
# NOTE(review): no active cell in this notebook writes "output_clean.txt" (the
# only write to that name is commented out) — confirm the file exists.
if __name__ == "__main__":
    text = load_text("output_clean.txt")
    structured_doc = build_structured_doc(
        text=text,
        document_id="Rail_MCA_2007"
    )
    save_json(structured_doc, "rail_mca_structured.json")



In [None]:
# law-ai/InLegalBERT

from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModel
tokenizer = AutoTokenizer.from_pretrained("law-ai/InLegalBERT")
model = AutoModel.from_pretrained("law-ai/InLegalBERT", use_safetensors=True)

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/happy/Desktop/iimmu/cag_rake_uti/cag_env/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/happy/Desktop/iimmu/cag_rake_uti/cag_env/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/happy/Desktop/iimmu/cag_rake_uti/

In [None]:
import torch
import numpy as np
from sklearn.preprocessing import normalize

def embed_text(text, model, tokenizer, device="cpu"):
    """Return a normalised embedding vector for a single text.

    The text is tokenised (truncated to 512 tokens), run through ``model``
    without gradients, and the first-token ([CLS]) row of the last hidden
    state is passed through ``normalize``.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        model_out = model(**encoded, output_hidden_states=True)

    # Position 0 of the sequence axis is the [CLS] token.
    cls_rows = model_out.last_hidden_state[:, 0, :]
    return normalize(cls_rows.cpu().numpy())[0]

ModuleNotFoundError: No module named 'sklearn'

In [None]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained('law-ai/InLegalBERT')
model = AutoModel.from_pretrained('law-ai/InLegalBERT', use_safetensors=True)

def embed_text(text, model, tokenizer, device="cpu"):
    """Return an L2-normalised [CLS] embedding for a single text.

    This redefinition previously ended without a ``return``, so it shadowed
    the working version and every caller received ``None``.

    Args:
        text: Text to embed; tokens beyond 512 are truncated.
        model: Encoder whose output exposes ``last_hidden_state``.
        tokenizer: Tokenizer matching ``model``.
        device: Device the input tensors are moved to. The model itself is not
            moved here — the caller must place it on the same device.

    Returns:
        A 1-D NumPy array: the unit-length first-token vector.
    """
    inputs = tokenizer(text, return_tensors="pt",truncation=True, max_length=512, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    # Position 0 of the sequence axis is the [CLS] token.
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    cls_embedding = torch.nn.functional.normalize(cls_embedding, dim=1)
    return cls_embedding.cpu().numpy()[0]

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def embed_documents(documents, model, tokenizer, device="cpu"):
    """Embed each document with ``embed_text`` and stack the results into an array."""
    vectors = [embed_text(document, model, tokenizer, device) for document in documents]
    return np.array(vectors)

In [None]:
from langchain.embeddings.base import Embeddings
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.preprocessing import normalize

class InLegalBERTEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter around a Hugging Face encoder.

    Each text is embedded on its own (no batching) as the normalised
    first-token ([CLS]) vector of the model's last hidden state.
    """

    def __init__(self, model_name="law-ai/InLegalBERT"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name, use_safetensors=True)
        # Prefer the GPU when torch can see one.
        if torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"
        self.model.to(self.device)

    def _embed_one(self, text):
        """Embed a single text; inputs are truncated to 512 tokens."""
        batch = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
        batch = {key: value.to(self.device) for key, value in batch.items()}
        with torch.no_grad():
            result = self.model(**batch)
        cls_row = result.last_hidden_state[:, 0, :].cpu().numpy()
        return normalize(cls_row)[0]

    def embed_documents(self, texts):
        """Return an array with one embedding row per text."""
        return np.array([self._embed_one(text) for text in texts])

    def embed_query(self, text):
        """Return the embedding of a single query string."""
        return self.embed_documents([text])[0]

# Use it
embeddings = InLegalBERTEmbeddings()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from langchain.embeddings.base import Embeddings
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.preprocessing import normalize

class InLegalBERTEmbeddings(Embeddings):
    """LangChain ``Embeddings`` implementation backed by law-ai/InLegalBERT.

    Texts are embedded one at a time as the normalised first-token ([CLS])
    vector of the model's last hidden state; results are float32.
    """

    def __init__(self):
        checkpoint = "law-ai/InLegalBERT"
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.model = AutoModel.from_pretrained(checkpoint, use_safetensors=True)
        # Prefer the GPU when torch can see one.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def embed_documents(self, texts):
        """Embed every text (with a tqdm progress bar) and return a float32 array."""
        vectors = []
        for text in tqdm(texts):
            encoded = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True,
            )
            on_device = {name: tensor.to(self.device) for name, tensor in encoded.items()}

            with torch.no_grad():
                model_out = self.model(**on_device)

            # Position 0 of the sequence axis is the [CLS] token.
            cls_row = model_out.last_hidden_state[:, 0, :].cpu().detach().numpy()
            vectors.append(normalize(cls_row)[0])

        return np.array(vectors, dtype=np.float32)

    def embed_query(self, text):
        """Embed one query by delegating to ``embed_documents``."""
        query_vectors = self.embed_documents([text])
        return query_vectors[0]

# Reinitialize embeddings
embeddings = InLegalBERTEmbeddings()
print("✓ Embeddings loaded successfully!")

✓ Embeddings loaded successfully!


In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from tqdm import tqdm
import os

txt_path = "output_clean_chunked.txt"
def load_documents(path):
    """Return the full contents of the UTF-8 text file at ``path``."""
    with open(path, mode="r", encoding="utf-8") as source:
        contents = source.read()
    return contents
    
def create_vectorDB(file_path, embeddings, db_path=None, chunk_size=1000, overlap=200):
    """Build a FAISS vector store from a text file.

    The file is read with ``load_documents`` and split by a
    ``RecursiveCharacterTextSplitter``. Each chunk becomes a ``Document``
    whose metadata holds the file's base name (``source``) and the chunk's
    position (``chunk_id``).

    Args:
        file_path: Path of the text file to index.
        embeddings: Embeddings object handed to ``FAISS.from_documents``.
        db_path: If truthy, the store is also saved there with ``save_local``.
        chunk_size: ``chunk_size`` given to the splitter.
        overlap: ``chunk_overlap`` given to the splitter.

    Returns:
        The FAISS vector store.
    """
    raw_text = load_documents(file_path)
    print("Documents loaded.")

    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    )
    pieces = text_splitter.split_text(raw_text)
    print(f"Total chunks created: {len(pieces)}")

    documents = []
    for chunk_id, piece in enumerate(tqdm(pieces)):
        chunk_meta = {"source": os.path.basename(file_path), "chunk_id": chunk_id}
        documents.append(Document(page_content=piece, metadata=chunk_meta))

    print("Embedding documents...")
    store = FAISS.from_documents(documents, embeddings)
    if db_path:
        store.save_local(db_path)
        print(f"Vector DB saved to {db_path}")
    return store

In [None]:
vector_store = create_vectorDB(txt_path, embeddings, db_path="faiss_inlegalbert_db")

Documents loaded.
Total chunks created: 80


100%|██████████| 80/80 [00:00<00:00, 7931.93it/s]


Embedding documents...


100%|██████████| 80/80 [00:37<00:00,  2.11it/s]


Vector DB saved to faiss_inlegalbert_db


In [None]:
# Same indexing flow as above, through the project helper in utils/vecDb.
# NOTE(review): InLegalMCAStore is defined outside this notebook; the behaviour
# of build_from_file is presumed from its name and arguments — confirm in utils/vecDb.
from utils.vecDb import InLegalMCAStore

store = InLegalMCAStore()

vector_db = store.build_from_file(
    "output_clean_chunked.txt",
    db_path="faiss_mca_db"
)

# Fetch the five chunks most similar to the query.
results = vector_db.similarity_search(
    "termination payment on authority default",
    k=5
)

for r in results:
    print(r.page_content)


Embedding docs: 100%|██████████| 80/80 [00:29<00:00,  2.74it/s]

} the responsibility of accident/derailment after enquiry is fixed on the
GCTO, ART Charges shall be payable by GCTO as prescribed by
Railway from time to time. The due Charges will be deducted from
} the future payments by Railway to GCTO. However, in case no
payment is being made by Railway to GCTO, the GCTO shall pay ART
Charges to Railway without
con ducting any accident enquiry as mentioned in para 9.2.3 above shall be
jointly by the representatives of Railway and GCTO to be
the ed by DRM and GCTO respectively. The accepting authority of
f enquiry Report shall be DRM, whose decision shall be final and
binding on GCTO.
10. C&W Maintenance Facilities
10.1 Normally C&W facilities shall not be constructed at GCTs.
10.2 However, if C&W facilities are operationally required at any GCT
as per the extant instructions, only one-time capital cost for setting-up
these facilities shall be borne by the GCTO, Operational costs, Including
4.2.1 The GCTO owner, in order to expedite commissioning 




In [None]:
# NOTE(review): LegalVectorDB and SECTION_QUERIES come from utils/vecDb, which is
# not visible here; comments below describe only what this cell calls.
from utils.vecDb import LegalVectorDB, SECTION_QUERIES
# =========================
# EXAMPLE USAGE
# =========================

if __name__ == "__main__":
    """
    Example flow:
    - Load document text
    - Ingest
    - Build index
    - Retrieve section-wise chunks
    """

    vecdb = LegalVectorDB()

    # Example: load text (replace with PDF-to-text output)
    with open("extracted_text/Adani  Mundra Port agreement for operations.txt", "r", encoding="utf-8") as f:
        text = f.read()

    # Ingest the text under a source label, then build the index before querying.
    vecdb.ingest_document(text, source_name="PPP_Agreement")
    vecdb.build_index()

    # Retrieve for Section 6: Tariff & Revenue Flexibility
    section_id = 6
    query = SECTION_QUERIES[section_id]

    retrieved_chunks = vecdb.retrieve_for_section(query, top_k=10)

    # Print each hit's metadata and at most the first 1000 characters of its text.
    for doc in retrieved_chunks:
        print("SOURCE:", doc.metadata)
        print(doc.page_content[:1000])
        print("=" * 80)


SOURCE: {'source': 'PPP_Agreement', 'chunk_id': 178}
leviable to all types of container traffic as per extantirules.
2.3
Empty containers, when moved in privately owned wagons shall be charged at 65% of the
rates for loaded single deck 20 Tonnes container rute. The rates are given in Rate Table at
Annexure-L
2,4
In case of Double stack container train operation in privately owned wagons, containers in
the upper stuck, whether loaded or empty, shall be charged at 50% of the normal rate and
lower stack containers will be charged as per normal tariff.
2.5
Privately owned empty flat wagons shall be charged at 60% of the rates for
loaded single deck 20 Tonnes container rate.
2.6
There shall be no recovery from operator for maintenance of wagons.
&
Q
2.7
All extant Commercial Rules in vogue regarding levy of punitive charges for
overloading, penalty for mis-declaration, weighment etc. will be applicable to the
container traffic, provided if is not in contravention with any instruction
mentio

In [None]:
# Ad-hoc retrieval check against the FAISS store built by create_vectorDB above.
query = "what are the payment terms"
# Top three chunks by similarity to the query.
results = vector_store.similarity_search(query, k=3)
for i, res in enumerate(results):
    print(f"\n--- Result {i+1} ---")
    print(res.page_content)


--- Result 1 ---
Freight on Through Distance Basis issued on 24.09.2014 (Annexure “A” of the policy),
as modified from time to time.
13.2 All new GCTs charged on a through distance basis shall be
governed by the Engine-on-Load policy (FM Circular No.5 of 2023 dated
07.03.2013}, as modified from time to time.
13.2.1 Railways may permit commissioning of a new GCT on non-EOL
basis with the approval of DRM, If Engine-on-Load scheme Is not
operationally feasible.
13.2.2 A new GCT/existing terminal migrating to GCT policy is
approved on non-through distance basis - then the maintenance of
assets on non-Railway land (except OHE) - shall continue to be the
responsibility of GCTO [as per provision of Para 7.3.4 of the GCT policy}.
14. Charging of Commercial Staff
14.1 No cost of commercial staff will be charged from the GCTO w.e.f.
the date of issue of this policy. However, for existing Terminals (where
PFT Private Siding Agreement has already entered into between AA and

--- Result 2 ---
the 

In [None]:
!brew install langchain-groq

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m==>[0m [1mAuto-updating Homebrew...[0m
Adjust how often this is run with `$HOMEBREW_AUTO_UPDATE_SECS` or disable with
`$HOMEBREW_NO_AUTO_UPDATE=1`. Hide these hints with `$HOMEBREW_NO_ENV_HINTS=1` (see `man brew`).
Installing from the API is now the default behaviour!
You can save space and time by running:
  brew untap homebrew/core
[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 2 taps (homebrew/core and homebrew/cask).
[34m==>[0m [1mNew Formulae[0m
azure-dev: Developer CLI that provides commands for working with Azure resources
libthai: Thai language support library
pgroll: Postgres zero-downtime migrations made easy
rig-r: R Installation Manager
rv-r: Declarative R package manager
shiki: Beautiful yet powerful syntax highlighter
xcsift: Swift tool to parse xcodebuild output for coding agents
[34m==>[0m [1mNew Casks[0m
eigent: Desktop AI agent
font-zxgamut
hytale: Official Hytale Launcher
stremioservice: Companion app for Stremio Web

You have [1m7[0m outdated 

In [None]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
# The Groq chat model lives in the separate `langchain-groq` package
# (install with `%pip install langchain-groq`); `langchain` exports no `Groq` class.
from langchain_groq import ChatGroq

def setup_rag_qa_chain(vector_store, api_key, model="llama-3.1-8b-instant", temperature=0.7, k=5):
    """Build a retrieval-augmented Q&A chain over ``vector_store``.

    Args:
        vector_store: Vector store exposing ``as_retriever``.
        api_key: Groq API key passed to the chat model.
        model: Groq model name.
        temperature: Sampling temperature for the chat model.
        k: Number of chunks retrieved per question.

    Returns:
        A ``RetrievalQA`` chain of type "stuff" that also returns its source documents.
    """
    llm = ChatGroq(
        api_key=api_key,
        model=model,
        temperature=temperature
    )

    prompt_template = PromptTemplate(
        input_variables=["context", "question"],
        template="""You are a legal expert analyzing a Master Concession Agreement.
Use the provided context to answer the question accurately.
If the answer is not in the context, say "The information is not available in the document."

Context:
{context}

Question: {question}

Answer:"""
    )

    retriever = vector_store.as_retriever(search_kwargs={"k": k})

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": prompt_template},
        return_source_documents=True
    )

    return qa_chain

ModuleNotFoundError: No module named 'langchain_groq'