In [1]:
!pip install transformers torch pandas pillow



In [5]:
import json
import torch
import pandas as pd
from PIL import Image
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
from typing import List, Dict, Tuple

class DocumentProcessor:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.processor = LayoutLMv3Processor.from_pretrained(
            "microsoft/layoutlmv3-base", 
            apply_ocr=False,
            token=True  # Remove if using public model
        )
        self.model = LayoutLMv3ForTokenClassification.from_pretrained(
            "nielsr/layoutlmv3-finetuned-funsd"  # Public FUNSD-tuned model
        ).to(self.device)
        self.id2label = self.model.config.id2label
        print(f"Initialized processor and model on {self.device}")

    def load_document(self, json_path: str, image_path: str) -> Tuple[List[str], List[List[int]], Image.Image]:
        """Load document data from JSON and image"""
        with open(json_path) as f:
            data = json.load(f)
        
        words = data["words"]
        boxes = data["bboxes"]
        image = Image.open(image_path).convert("RGB")
        
        print(f"Loaded document with {len(words)} words from {json_path}")
        return words, boxes, image

    def preprocess(self, words: List[str], boxes: List[List[int]], image: Image.Image) -> Dict[str, torch.Tensor]:
        """Preprocess document for model input"""
        encoding = self.processor(
            image,
            words,
            boxes=boxes,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=512
        ).to(self.device)
        return encoding

    def predict(self, encoding: Dict[str, torch.Tensor]) -> List[int]:
        """Run model inference"""
        with torch.no_grad():
            outputs = self.model(**encoding)
        return outputs.logits.argmax(dim=-1).squeeze().tolist()

    def process_results(
        self, 
        words: List[str], 
        boxes: List[List[int]], 
        predictions: List[int], 
        encoding: Dict[str, torch.Tensor]
    ) -> List[Dict[str, str]]:
        """Process model predictions into structured entities"""
        tokens = self.processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"].squeeze())
        word_ids = encoding.word_ids()
        
        entities = []
        current_entity = {"text": "", "label": "", "bboxes": []}
        
        for idx, (token, word_id) in enumerate(zip(tokens, word_ids)):
            if word_id is None:
                continue
                
            label = self.id2label[predictions[idx]]
            
            # Handle entity boundaries
            if label.startswith("B-"):
                if current_entity["text"]:
                    entities.append(current_entity.copy())
                current_entity = {
                    "text": token.replace("▁", ""),
                    "label": label[2:],
                    "bboxes": [boxes[word_id]]
                }
            elif label.startswith("I-") and current_entity["label"] == label[2:]:
                current_entity["text"] += " " + token.replace("▁", "")
                current_entity["bboxes"].append(boxes[word_id])
            else:
                if current_entity["text"]:
                    entities.append(current_entity.copy())
                current_entity = {"text": "", "label": "", "bboxes": []}
        
        if current_entity["text"]:
            entities.append(current_entity)
            
        return entities

    def merge_bboxes(self, entities: List[Dict]) -> List[Dict]:
        """Calculate merged bounding boxes for multi-word entities"""
        for entity in entities:
            if entity["bboxes"]:
                x0 = min(box[0] for box in entity["bboxes"])
                y0 = min(box[1] for box in entity["bboxes"])
                x1 = max(box[2] for box in entity["bboxes"])
                y1 = max(box[3] for box in entity["bboxes"])
                entity["bbox"] = f"({x0}, {y0}, {x1}, {y1})"
            else:
                entity["bbox"] = "N/A"
            del entity["bboxes"]
        return entities

    def process_document(self, json_path: str, image_path: str) -> pd.DataFrame:
        """Full document processing pipeline"""
        # Load document
        words, boxes, image = self.load_document(json_path, image_path)
        
        # Preprocess
        encoding = self.preprocess(words, boxes, image)
        
        # Predict
        predictions = self.predict(encoding)
        
        # Process results
        entities = self.process_results(words, boxes, predictions, encoding)
        entities = self.merge_bboxes(entities)
        
        # Create DataFrame
        df = pd.DataFrame(entities)[["label", "text", "bbox"]]
        df.columns = ["Entity Type", "Text Content", "Bounding Box"]
        
        print(f"Extracted {len(df)} entities from document")
        return df

    def save_results(self, df: pd.DataFrame, output_path: str = "results.csv"):
        """Save results to CSV"""
        df.to_csv(output_path, index=False)
        print(f"Results saved to {output_path}")

if __name__ == "__main__":
    # Initialize processor
    processor = DocumentProcessor()
    
    # Process document
    json_file = "document.json"
    image_file = r"C:\Users\HP\Desktop\Bank Project\Invoice Images Bank Statement\Arul paul invoice.png"
    result_df = processor.process_document(json_file, image_file)
    
    # Display and save results
    print("\nExtracted Entities:")
    print(result_df)
    processor.save_results(result_df, "document_entities.csv")

LocalTokenNotFoundError: Token is required (`token=True`), but no token found. You need to provide a token or be logged in to Hugging Face with `hf auth login` or `huggingface_hub.login`. See https://huggingface.co/settings/tokens.

In [6]:
!pip install transformers datasets torch torchvision pytesseract Pillow




In [8]:
import json
import pytesseract
from PIL import Image
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
import torch

# 🔧 Set up paths
image_path = r"C:\Users\HP\Desktop\Bank Project\Invoice Images Bank Statement\Book 201.png"  # Replace with your actual image path
output_json_path = r"C:\Users\HP\Desktop\Bank Project\Json Output\output.json"

# 📷 Load image
image = Image.open(image_path).convert("RGB")

# 🧠 Load processor and model
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=True)
model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")

# 🧾 Process image
encoding = processor(image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**encoding)

# 📊 Get predictions
logits = outputs.logits
predicted_class = logits.argmax(-1)

# 📦 Convert to structured JSON
tokens = processor.tokenizer.convert_ids_to_tokens(encoding.input_ids[0])
boxes = encoding.bbox[0].tolist()
labels = predicted_class[0].tolist()

# 🧮 Build JSON structure
results = []
for token, box, label in zip(tokens, boxes, labels):
    if token not in processor.tokenizer.all_special_tokens:
        results.append({
            "text": token,
            "bbox": box,
            "label": model.config.id2label[label]
        })

# 💾 Save to JSON
with open(output_json_path, "w") as f:
    json.dump(results, f, indent=2)

# 📣 Print output in VS Code terminal
print(json.dumps(results, indent=2))


Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[
  {
    "text": "\u0120Z",
    "bbox": [
      178,
      133,
      198,
      149
    ],
    "label": "LABEL_0"
  },
  {
    "text": "\u0120E",
    "bbox": [
      228,
      129,
      370,
      159
    ],
    "label": "LABEL_1"
  },
  {
    "text": "leg",
    "bbox": [
      228,
      129,
      370,
      159
    ],
    "label": "LABEL_1"
  },
  {
    "text": "ant",
    "bbox": [
      228,
      129,
      370,
      159
    ],
    "label": "LABEL_1"
  },
  {
    "text": "\u0120IN",
    "bbox": [
      669,
      121,
      894,
      153
    ],
    "label": "LABEL_0"
  },
  {
    "text": "VO",
    "bbox": [
      669,
      121,
      894,
      153
    ],
    "label": "LABEL_0"
  },
  {
    "text": "ICE",
    "bbox": [
      669,
      121,
      894,
      153
    ],
    "label": "LABEL_0"
  },
  {
    "text": "\u0120Inv",
    "bbox": [
      682,
      170,
      731,
      177
    ],
    "label": "LABEL_0"
  },
  {
    "text": "oice",
    "bbox": [
      682,
      170,


In [13]:
!pip install --upgrade transformers


Collecting transformers
  Downloading transformers-4.55.0-py3-none-any.whl.metadata (39 kB)
Downloading transformers-4.55.0-py3-none-any.whl (11.3 MB)
   ---------------------------------------- 0.0/11.3 MB ? eta -:--:--
    --------------------------------------- 0.3/11.3 MB ? eta -:--:--
   --- ------------------------------------ 1.0/11.3 MB 3.0 MB/s eta 0:00:04
   ------ --------------------------------- 1.8/11.3 MB 3.4 MB/s eta 0:00:03
   --------- ------------------------------ 2.6/11.3 MB 3.2 MB/s eta 0:00:03
   ------------ --------------------------- 3.4/11.3 MB 3.5 MB/s eta 0:00:03
   ------------- -------------------------- 3.9/11.3 MB 3.3 MB/s eta 0:00:03
   ---------------- ----------------------- 4.7/11.3 MB 3.4 MB/s eta 0:00:02
   ------------------- -------------------- 5.5/11.3 MB 3.4 MB/s eta 0:00:02
   ---------------------- ----------------- 6.3/11.3 MB 3.4 MB/s eta 0:00:02
   ------------------------- -------------- 7.1/11.3 MB 3.5 MB/s eta 0:00:02
   -------------

In [18]:
import json
import pytesseract
from PIL import Image
from transformers import (
    LayoutLMv3Processor,
    LayoutLMv3ForTokenClassification,
    Trainer,
    TrainingArguments
)
from datasets import Dataset
import torch

# 🔧 Paths
image_path = r"C:\Users\HP\Desktop\Bank Project\Invoice Images Bank Statement\Book 201.png"
output_json_path = r"C:\Users\HP\Desktop\Bank Project\Json Output\output.json"

# 🧠 Label setup
label_list = ["O", "ACCOUNT_NUMBER", "DATE", "AMOUNT"]
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

from datasets import Dataset

# ✅ Replace ellipsis with actual dummy values
train_data = {
    "input_ids": [[101, 1234, 5678, 102]],  # token IDs
    "attention_mask": [[1, 1, 1, 1]],       # attention mask
    "bbox": [[[0, 0, 50, 50], [60, 0, 100, 50], [110, 0, 160, 50], [0, 0, 0, 0]]],  # bounding boxes
    "labels": [[0, 1, 0, 0]]                # label IDs
}

train_dataset = Dataset.from_dict(train_data)


# 🧠 Load processor and model
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=True)
model = LayoutLMv3ForTokenClassification.from_pretrained(
    "microsoft/layoutlmv3-base",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# 🏋️ Training setup
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./layoutlmv3-finetuned",
    per_device_train_batch_size=2,
    num_train_epochs=5,
    logging_dir="./logs",
    save_steps=500,
    logging_steps=100,
    save_total_limit=2,
    remove_unused_columns=False
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=processor.tokenizer
)

# 🚀 Train the model
trainer.train()

# 📷 Load image for inference
image = Image.open(image_path).convert("RGB")

# 🔍 Inference
encoding = processor(image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**encoding)

logits = outputs.logits
predicted_class = logits.argmax(-1)

# 📦 Convert to structured JSON
tokens = processor.tokenizer.convert_ids_to_tokens(encoding.input_ids[0])
boxes = encoding.bbox[0].tolist()
labels = predicted_class[0].tolist()

results = []
for token, box, label in zip(tokens, boxes, labels):
    if token not in processor.tokenizer.all_special_tokens:
        results.append({
            "text": token,
            "bbox": box,
            "label": model.config.id2label[label]
        })

# 💾 Save to JSON
with open(output_json_path, "w") as f:
    json.dump(results, f, indent=2)

# 📣 Print output
print(json.dumps(results, indent=2))


Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss




[
  {
    "text": "\u0120Z",
    "bbox": [
      178,
      133,
      198,
      149
    ],
    "label": "O"
  },
  {
    "text": "\u0120E",
    "bbox": [
      228,
      129,
      370,
      159
    ],
    "label": "O"
  },
  {
    "text": "leg",
    "bbox": [
      228,
      129,
      370,
      159
    ],
    "label": "O"
  },
  {
    "text": "ant",
    "bbox": [
      228,
      129,
      370,
      159
    ],
    "label": "O"
  },
  {
    "text": "\u0120IN",
    "bbox": [
      669,
      121,
      894,
      153
    ],
    "label": "O"
  },
  {
    "text": "VO",
    "bbox": [
      669,
      121,
      894,
      153
    ],
    "label": "O"
  },
  {
    "text": "ICE",
    "bbox": [
      669,
      121,
      894,
      153
    ],
    "label": "O"
  },
  {
    "text": "\u0120Inv",
    "bbox": [
      682,
      170,
      731,
      177
    ],
    "label": "O"
  },
  {
    "text": "oice",
    "bbox": [
      682,
      170,
      731,
      177
    ],
    "label": "O"
  }