<a href="https://colab.research.google.com/github/krixhnaprasad/DefBotAI/blob/main/DefBotAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pymupdf pillow pytesseract


In [None]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter, ImageOps
import io
import os

# Optional: Set tesseract path manually if needed
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

def enhance_image_for_ocr(image):
    """
    Pre-process a PIL image before handing it to the OCR engine.

    The pipeline, in order:
    - Convert to mode "L" (grayscale)
    - Smooth with a median filter (default kernel size)
    - Boost contrast by a factor of 2.5
    - Invert the pixel values, then stretch them with autocontrast

    No thresholding/binarization happens here; the returned image is
    still grayscale. The input image is not modified.
    """
    grayscale = image.convert("L")
    denoised = grayscale.filter(ImageFilter.MedianFilter())

    contrast_booster = ImageEnhance.Contrast(denoised)
    boosted = contrast_booster.enhance(2.5)

    # NOTE(review): inversion swaps dark and light; presumably intended for
    # light-on-dark sources — confirm it helps for ordinary scans.
    inverted = ImageOps.invert(boosted)
    return ImageOps.autocontrast(inverted)

def extract_text_and_ocr_from_pdf(pdf_path):
    """
    Extract the text layer of every page plus OCR text of every embedded image.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.

    Returns:
        A single-line string. Each page contributes a "[Page N Text]" section
        followed by one "[Page N Image M OCR Text]" section per embedded image
        (or an "[Image OCR Failed ...]" marker when that image could not be
        processed). Double quotes are replaced by single quotes and newlines
        by spaces so the result can be embedded in a Modelfile.

    Raises:
        Whatever fitz.open() / page.get_text() raise for an unreadable PDF.
        Per-image failures are NOT raised; they are recorded in the output.
    """
    doc = fitz.open(pdf_path)
    # Collect fragments and join once instead of repeated string +=.
    sections = []

    try:
        for page_number, page in enumerate(doc, start=1):
            page_text = page.get_text().strip()
            sections.append(f"\n[Page {page_number} Text]\n{page_text}\n")

            page_images = page.get_images(full=True)
            for image_number, image_info in enumerate(page_images, start=1):
                # Best effort by design: one undecodable image must not
                # abort the whole document, so the error is logged inline.
                try:
                    xref = image_info[0]
                    image_bytes = doc.extract_image(xref)["image"]
                    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
                    ocr_text = pytesseract.image_to_string(
                        enhance_image_for_ocr(image)
                    )
                    sections.append(
                        f"\n[Page {page_number} Image {image_number} OCR Text]\n"
                        f"{ocr_text.strip()}\n"
                    )
                except Exception as e:
                    sections.append(
                        f"\n[Image OCR Failed on Page {page_number} "
                        f"Image {image_number}]: {e}\n"
                    )
    finally:
        # Release the file handle even when a page fails to load or parse.
        doc.close()

    combined_text = "".join(sections)
    return combined_text.replace('"', "'").replace('\n', ' ').strip()

def create_modelfile(pdf_path, output_file="Modelfile.txt", base_model="llama3"):
    """
    Generate an Ollama Modelfile from a PDF using both text and OCR image data.

    Args:
        pdf_path: Path of the source PDF.
        output_file: Where the Modelfile is written (overwritten if present).
        base_model: Model name placed on the FROM line.

    Side effects:
        Writes `output_file` as UTF-8 and prints progress messages.

    The assistant instructions and the extracted PDF content are emitted as
    ONE triple-quoted SYSTEM block. Emitting two separate SYSTEM instructions
    risks the later one replacing the earlier one, which would silently drop
    the assistant instructions.
    """
    print("🔍 Extracting text and OCR content from PDF...")
    context = extract_text_and_ocr_from_pdf(pdf_path)

    print("📝 Writing Modelfile content...")
    instructions = (
        "You are a highly knowledgeable and helpful assistant. The following "
        "content is extracted from a PDF document and may include both typed "
        "and scanned image-based data. Use it to answer user questions "
        "accurately and contextually."
    )
    # A double quote inside the context could terminate the triple-quoted
    # SYSTEM block early, so neutralise any that are still present.
    safe_context = context.replace('"', "'")

    # Plain (non-f) strings: the braces below are literal Go-template syntax.
    template_section = (
        'TEMPLATE """{{ if .System }}System: {{ .System }}{{ end }}\n'
        "{{ if .Prompt }}User: {{ .Prompt }}{{ end }}\n"
        'Assistant:"""\n'
    )

    modelfile_content = (
        f"FROM {base_model}\n"
        "\n"
        f'SYSTEM """{instructions}\n'
        "\n"
        f'{safe_context}"""\n'
        "\n"
        "PARAMETER temperature 0.7\n"
        "PARAMETER top_p 0.9\n"
        "PARAMETER num_ctx 4096\n"
        "\n"
        + template_section
    )

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(modelfile_content)

    print(f"✅ Modelfile created at: {os.path.abspath(output_file)}")

# --------------------------
# 📌 Run Example
# --------------------------
if __name__ == "__main__":
    # Ask for the PDF location interactively; surrounding whitespace
    # (e.g. from a pasted path) is trimmed before use.
    user_supplied_path = input("📂 Enter the path to your PDF file: ")
    create_modelfile(user_supplied_path.strip())
