In [2]:
# ✅ Cell 1: Install dependencies
!pip install transformers accelerate torch torchvision torchaudio pytesseract pdf2image PyMuPDF --quiet

In [3]:
# ✅ Cell 2: Imports and model setup
import os
import fitz  # PyMuPDF
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [4]:
# Set device logic: OCR on CPU, Phi-3 on GPU if available
ocr_device = "cpu"
lm_device = "cuda" if torch.cuda.is_available() else "cpu"

model_id = "microsoft/phi-3-mini-128k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

2025-07-26 07:41:06.281241: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753515666.628076      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753515666.733298      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [5]:
# ✅ Cell 3: Text extractor (OCR + digital PDF)
def extract_text_from_file(file_path):
    ext = os.path.splitext(file_path)[1].lower()

    if ext in ['.png', '.jpg', '.jpeg']:
        print("🖼️ Extracting from image using pytesseract...")
        image = Image.open(file_path).convert("RGB")
        return pytesseract.image_to_string(image)

    elif ext == '.pdf':
        try:
            print("📄 Trying digital extraction from PDF...")
            with fitz.open(file_path) as doc:
                text = "".join([page.get_text() for page in doc])
            if text.strip():
                print("✅ Digital text extraction succeeded.")
                return text
        except:
            print("❌ Digital extraction failed.")

        print("🔁 Falling back to OCR for scanned PDF...")
        images = convert_from_path(file_path)
        return "\n".join([pytesseract.image_to_string(img.convert("RGB")) for img in images])

    raise ValueError("Unsupported file type.")


In [6]:
# ✅ Cell 4: Phi-3 structured extractor
def extract_structured_data_with_phi3(text):
    prompt = f"""
You are a receipt and invoice extraction assistant. From the following invoice text, extract the following structured JSON fields only:

- invoice_number
- invoice_date
- vendor_name
- items: list of items where each item includes:
    - description (like date range or label)
    - quantity (days/items)
    - unit_price (rate per day/item)
    - total_price (for that line)
- totals_section: include all explicitly written totals such as Net Total, VAT, Final Total, etc.
- payment_terms: include the payment due date and any specific payment instructions
- customer_details: include customer name, address, and contact information
- additional_notes: any special instructions or comments

Return only JSON. Don't add explanations.

--- START OF INVOICE TEXT ---
{text}
--- END OF TEXT ---
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(lm_device)
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_new_tokens=1024,
            temperature=0.3,
            do_sample=False
        )
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    json_start = decoded.find("{")
    return decoded[json_start:]


In [7]:
# ✅ Cell 5: Main pipeline function
def run_invoice_pipeline(file_path):
    print("🔍 Extracting text from file...")
    raw_text = extract_text_from_file(file_path)

    print("🤖 Sending to Phi-3 for structured extraction...")
    structured_json = extract_structured_data_with_phi3(raw_text)

    print("\n✅ Final Output from Phi-3:\n")
    print(structured_json)
    return structured_json

In [8]:
# ✅ Cell 6: Run on your file (replace with your file path)
file_path = "/kaggle/input/digital-image/V067996-BIR008-SB-SE02 - ASF 131744-280.pdf"  # Adjust as needed
run_invoice_pipeline(file_path)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Extracting text from file...
📄 Trying digital extraction from PDF...
✅ Digital text extraction succeeded.
🤖 Sending to Phi-3 for structured extraction...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



✅ Final Output from Phi-3:

{

  "invoice_number": "SB-SE02",

  "invoice_date": "26 July 2024",

  "vendor_name": "Samantha Bircham",

  "items": [

    {

      "description": "THERAPY SESSION - 3 July 2024",

      "quantity": "1",

      "unit_price": "£70.00",

      "total_price": "£70.00"

    },

    {

      "description": "THERAPY SESSION - 10 July 2024",

      "quantity": "1",

      "unit_price": "£70.00",

      "total_price": "£70.00"

    },

    {

      "description": "THERAPY SESSION – 17 July 2024",

      "quantity": "1",

      "unit_price": "£70.00",

      "total_price": "£70.00"

    },

    {

      "description": "PARENT REVIEW MEETING 24/07/2024",

      "quantity": "1",

      "unit_price": "£70.00",

      "total_price": "£70.00"

    }

  ],

  "totals_section": {

    "Net Total": "£280.00"

  },

  "payment_terms": {

    "due_date": "within 14 days",

    "bank_details": {

      "name": "Miss Samantha Bircham",

      "account_number": "26795108",

 

'{\n\n  "invoice_number": "SB-SE02",\n\n  "invoice_date": "26 July 2024",\n\n  "vendor_name": "Samantha Bircham",\n\n  "items": [\n\n    {\n\n      "description": "THERAPY SESSION - 3 July 2024",\n\n      "quantity": "1",\n\n      "unit_price": "£70.00",\n\n      "total_price": "£70.00"\n\n    },\n\n    {\n\n      "description": "THERAPY SESSION - 10 July 2024",\n\n      "quantity": "1",\n\n      "unit_price": "£70.00",\n\n      "total_price": "£70.00"\n\n    },\n\n    {\n\n      "description": "THERAPY SESSION – 17 July 2024",\n\n      "quantity": "1",\n\n      "unit_price": "£70.00",\n\n      "total_price": "£70.00"\n\n    },\n\n    {\n\n      "description": "PARENT REVIEW MEETING 24/07/2024",\n\n      "quantity": "1",\n\n      "unit_price": "£70.00",\n\n      "total_price": "£70.00"\n\n    }\n\n  ],\n\n  "totals_section": {\n\n    "Net Total": "£280.00"\n\n  },\n\n  "payment_terms": {\n\n    "due_date": "within 14 days",\n\n    "bank_details": {\n\n      "name": "Miss Samantha Birch