In [1]:
!pip -q install pypdf

In [10]:
from pypdf import PdfReader

def extract_pdf_text(pdf_path: str) -> str:
    reader = PdfReader(pdf_path)
    all_pages = []
    for i, page in enumerate(reader.pages):
        txt = page.extract_text() or ""
        all_pages.append(f"\n\n--- PAGE {i+1} ---\n{txt}")
    return "".join(all_pages).strip()

# ---- Use it ----
pdf_path = r"C:\Users\harsh.raj\OneDrive - Aster DM Healthcare\Codes\LangGraph\Practical LangGraph\Workflows\PDF extraction\sample Rx 2.pdf"  # change to your file path
text = extract_pdf_text(pdf_path)

print("✅ Characters extracted:", len(text))
print("\n✅ Preview :\n")
print(text)


✅ Characters extracted: 14

✅ Preview :

--- PAGE 1 ---


In [11]:
!pip -q install pymupdf

In [12]:
import fitz

def pdf_diagnose(pdf_path: str):
    doc = fitz.open(pdf_path)
    total_text = 0
    total_images = 0
    for page in doc:
        total_text += len(page.get_text("text") or "")
        total_images += len(page.get_images(full=True))
    print("Pages:", len(doc))
    print("Total extracted text chars:", total_text)
    print("Total embedded images:", total_images)

pdf_diagnose(pdf_path)


Pages: 1
Total extracted text chars: 0
Total embedded images: 6


In [16]:
!pip install pdf2image pytesseract pillow



In [4]:
!pip install langchain-community langchain-ollama pymupdf langchain-core



In [None]:
import json
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_ollama import ChatOllama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# 1. Setup the Model
llm = ChatOllama(
    model="llama3.2",
    temperature=0,  # Keep strict
    format="json" 
)

# 2. Load the PDF
pdf_path1 = r"C:\Users\harsh.raj\OneDrive - Aster DM Healthcare\Codes\LangGraph\Practical LangGraph\Workflows\PDF extraction\sample Rx 2.pdf"  
loader = PyMuPDFLoader(pdf_path) 
docs = loader.load()
raw_text = "\n".join([doc.page_content for doc in docs])

# --- DEBUGGING CHECK ---
if not raw_text.strip():
    print("ERROR: No text extracted! The PDF might be an image or empty.")
    exit()
else:
    print(f"Success: Extracted {len(raw_text)} characters.")
# -----------------------

# 3. Create a Stronger Prompt
prompt_template = """
You are a medical data extraction assistant. 
Your task is to extract specific information from the provided Prescription Text below.

### STRICT RULES:
1. Extract data ONLY from the "Prescription Text" section below.
2. Do NOT invent or hallucinate names (like "John Doe"). Use the actual names found in the text.
3. If a field is missing in the text, return null or an empty string.
4. Return the output as valid JSON.

### Prescription Text:
{text}

### Desired JSON Structure:
{{
  "patient_name": "string",
  "doctor_name": "string",
  "diagnosis_codes": ["string", "string"],
  "medicines": [
    {{
      "name": "string",
      "dosage_instruction": "string",
      "duration": "string"
    }}
  ]
}}
"""

# Note: We use {{ double braces }} to escape them in f-strings or prompt templates 
# where we don't want Python to interpret them as variables.

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["text"]
)

# 4. Build the Chain
chain = prompt | llm | StrOutputParser()

# 5. Run Extraction
try:
    print("Extracting structured data...")
    result = chain.invoke({"text": raw_text})
    
    # Parse and Print
    data = json.loads(result)
    print(json.dumps(data, indent=4))
    
except Exception as e:
    print(f"Error: {e}")

ERROR: No text extracted! The PDF might be an image or empty.
Extracting structured data...
{
    "patient_name": null,
    "doctor_name": null,
    "diagnosis_codes": [],
    "medicines": []
}


: 

In [8]:
# 1. Load the PDF
pdf_path1 = r"C:\Users\harsh.raj\OneDrive - Aster DM Healthcare\Codes\LangGraph\Practical LangGraph\Workflows\PDF extraction\sample Rx 1.pdf"  
import json
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_ollama import ChatOllama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# 1. Setup the Model
llm = ChatOllama(
    model="llama3.2",
    temperature=0,  # Temperature 0 is critical for data extraction
    format="json"   # Llama 3.2 supports native JSON mode
)

# 2. Load the PDF
# PyMuPDF is the best choice here because it reads the hidden text layers 
# in your digital PDF even if the layout is messy.
loader = PyMuPDFLoader(pdf_path1) 
docs = loader.load()
raw_text = "\n".join([doc.page_content for doc in docs])

# 3. Create a Targeted Prompt
# We ask the LLM to ignore the layout noise and find specific entities.
prompt_template = """
You are a medical data assistant. Extract details from the following prescription text.
Return ONLY a valid JSON object. Do not add any conversational text.

Extract these fields:
- patient_name
- doctor_name
- diagnosis_codes (as a list)
- medicines (as a list of objects with fields: name, dosage_instruction, duration)

Prescription Text:
{text}
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["text"]
)

# 4. Build the Chain
chain = prompt | llm | StrOutputParser()

# 5. Run Extraction
try:
    print("Extracting structured data...")
    result = chain.invoke({"text": raw_text})
    
    # Parse the string result into a real Python dictionary
    data = json.loads(result)
    
    # Print pretty JSON
    print(json.dumps(data, indent=4))
    
except Exception as e:
    print(f"Error: {e}")

Extracting structured data...
{
    "patient_name": "John Doe",
    "doctor_name": "Dr. Jane Smith",
    "diagnosis_codes": [
        "I10",
        "I11"
    ],
    "medicines": [
        {
            "name": "Aspirin",
            "dosage_instruction": "2 tablets daily",
            "duration": "for 3 months"
        },
        {
            "name": "Lisinopril",
            "dosage_instruction": "1 tablet once daily",
            "duration": "for 6 months"
        }
    ]
}


In [2]:
pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
Collecting pdfminer.six==20251230 (from pdfplumber)
  Downloading pdfminer_six-20251230-py3-none-any.whl.metadata (4.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.2.0-py3-none-win_amd64.whl.metadata (67 kB)
Collecting cryptography>=36.0.0 (from pdfminer.six==20251230->pdfplumber)
  Downloading cryptography-46.0.3-cp311-abi3-win_amd64.whl.metadata (5.7 kB)
Collecting cffi>=2.0.0 (from cryptography>=36.0.0->pdfminer.six==20251230->pdfplumber)
  Downloading cffi-2.0.0-cp312-cp312-win_amd64.whl.metadata (2.6 kB)
Collecting pycparser (from cffi>=2.0.0->cryptography>=36.0.0->pdfminer.six==20251230->pdfplumber)
  Downloading pycparser-2.23-py3-none-any.whl.metadata (993 bytes)
Downloading pdfplumber-0.11.9-py3-none-any.whl (60 kB)
Downloading pdfminer_six-20251230-py3-none-any.whl (6.6 MB)
   ---------------------------------------- 0.0/6.6 MB ? eta -:--:--
   -------------

In [None]:
import os
import pdfplumber
import json
from langchain_ollama import ChatOllama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# --- CONFIGURATION ---
pdf_path = r"C:\Users\harsh.raj\OneDrive - Aster DM Healthcare\Codes\LangGraph\Practical LangGraph\Workflows\PDF extraction\sample Rx 1.pdf"  
# ---------------------

# 1. Debug: Check if file exists
if not os.path.exists(pdf_path):
    print(f"❌ ERROR: File not found at: {os.path.abspath(pdf_path)}")
    print("Please check the file name or provide the full path (e.g., C:/Users/Name/Downloads/sample Rx 1.pdf)")
    exit()

# 2. Extract Text using PDFPlumber (More robust than PyMuPDF)
print(f"Reading file: {pdf_path}...")
raw_text = ""
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if text:
            raw_text += text + "\n"

# 3. Check what we found
print(f"--- DEBUG: Extracted {len(raw_text)} characters ---")
if len(raw_text) < 50:
    print("⚠️ WARNING: Very little text found. The PDF might be scanned (image-based).")
    print("If the text below is empty, we need to use OCR.")
    print("-" * 20)
    print(raw_text)
    print("-" * 20)
    if not raw_text.strip():
        exit()

# 4. Setup LLM (Only runs if text was found)
llm = ChatOllama(
    model="llama3.2",
    temperature=0,
    format="json" 
)

# 5. Define Prompt
prompt_template = """
You are a medical data assistant. Extract details from the text below.

STRICT RULES:
1. Use ONLY the provided text.
2. If the text does not contain a name, do NOT invent "John Doe". Return null.
3. Return valid JSON.

TEXT:
{text}

JSON FORMAT:
{{
  "patient_name": "string",
  "doctor_name": "string",
  "diagnosis_codes": ["code1", "code2"],
  "medicines": [
    {{ "name": "drug name", "dosage": "dosage instructions", "duration": "duration" }}
  ]
}}
"""

prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
chain = prompt | llm | StrOutputParser()

# 6. Run Chain
try:
    print("Sending text to LLM for structuring...")
    result = chain.invoke({"text": raw_text})
    parsed = json.loads(result)
    print(json.dumps(parsed, indent=4))
except Exception as e:
    print(f"Error: {e}")

  from .autonotebook import tqdm as notebook_tqdm


Reading file: C:\Users\harsh.raj\OneDrive - Aster DM Healthcare\Codes\LangGraph\Practical LangGraph\Workflows\PDF extraction\sample Rx 1.pdf...
--- DEBUG: Extracted 0 characters ---
If the text below is empty, we need to use OCR.
--------------------

--------------------
Sending text to LLM for structuring...
{
    "patient_name": null,
    "doctor_name": null,
    "diagnosis_codes": [],
    "medicines": []
}


: 