In [None]:

# Install the notebook's dependencies into the *kernel's* environment.
# `%pip` always targets the interpreter this notebook is running on, whereas
# `!pip3` runs whichever pip3 happens to be first on the shell PATH.
# `transformers` is listed explicitly because the OCR cell imports it.
%pip install -q ultralytics transformers



##  Microsoft TrOCR Model Comparison

| Feature                          | TrOCR Small                  | TrOCR Base                  | TrOCR Large                  |
|----------------------------------|------------------------------|-----------------------------|------------------------------|
| **Model Size (parameters)**     | ~62M                         | ~334M                       | ~558M                        |
| **Inference Speed**             | 🚀 Fast                      | ⚖️ Medium                   | 🐢 Slow                    |
| **Example Dataset Support**     | IAM, FUNSD, IIIT5K           | IAM, FUNSD, IIIT5K          | IAM, FUNSD, IIIT5K           |


In [None]:
from ultralytics import YOLO
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
from datetime import datetime
import re
from ultralytics import YOLO
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
from datetime import datetime
import re
import locale

# Note: no locale is actually set here; the French month abbreviations are hard-coded in clean_and_format_date

# Function to clean and format date
def clean_and_format_date(raw_date: str) -> str:
    """Turn an OCR'd day-month-year string into ``"DD <French month abbr.> YYYY"``.

    The input is expected to be a date written as day, month, four-digit year,
    optionally broken up by spaces or by ``/``, ``-`` or ``.`` separators
    (e.g. ``"19 01 2016"``, ``"19/01/2016"``).

    Args:
        raw_date: Text recognised in the date field.

    Returns:
        The formatted date (e.g. ``"19 Janv 2016"``) when the text parses as a
        valid calendar date. Otherwise the space-stripped input is returned,
        with any run of eight digits rewritten as ``DD/MM/YYYY``.
    """
    # Fallback text: only plain spaces are removed, everything else is kept
    # so unparseable input stays recognisable to the reader.
    cleaned = raw_date.replace(" ", "")

    # For parsing, additionally drop other whitespace and the usual date
    # separators so "19/01/2016" or "19-01-2016" are accepted too. Letters are
    # deliberately kept: stripping them would let "19Jan2016" parse as a
    # different (wrong) date.
    compact = re.sub(r"[\s/.\-]", "", cleaned)

    try:
        parsed_date = datetime.strptime(compact, "%d%m%Y")
    except ValueError:
        # Not a valid DDMMYYYY date: best-effort slash formatting of the digits.
        return re.sub(r"(\d{2})(\d{2})(\d{4})", r"\1/\2/\3", cleaned)

    month_map = {
        1: "Janv", 2: "Févr", 3: "Mars", 4: "Avr", 5: "Mai", 6: "Juin",
        7: "Juil", 8: "Août", 9: "Sept", 10: "Oct", 11: "Nov", 12: "Déc"
    }
    return f"{parsed_date.day:02d} {month_map[parsed_date.month]} {parsed_date.year}"

# --- Configuration ----------------------------------------------------------
# Everything a reader might want to change lives here, so the processor and
# the model can never be loaded from two different checkpoint ids by accident.
YOLO_WEIGHTS = "best.pt"  # field-detector weights, relative to the working directory
# NOTE(review): "stage1" looks like the pre-training-only TrOCR checkpoint; a
# fine-tuned "-handwritten" / "-printed" variant may read fields better — confirm.
TROCR_CHECKPOINT = "microsoft/trocr-large-stage1"
image_path = "output.jpg"

# --- Models -----------------------------------------------------------------
yolo_model = YOLO(YOLO_WEIGHTS)

processor = TrOCRProcessor.from_pretrained(TROCR_CHECKPOINT)
trocr_model = VisionEncoderDecoderModel.from_pretrained(TROCR_CHECKPOINT)

# --- Detection --------------------------------------------------------------
# The detector is called on a single path; [0] takes the result for that image.
results = yolo_model(image_path)[0]

# Class-id -> class-name mapping reported by the detector for this result.
label_map = results.names

# Same image, loaded as RGB, used as the source for the per-field crops.
image = Image.open(image_path).convert("RGB")

# Field name -> extracted value. If the detector reports the same label more
# than once, the later detection overwrites the earlier one.
structured_data = {}

# One pass over the detections: crop each detected field, OCR the crop with
# TrOCR, then post-process the text according to the field's label.
# (This loop used to be pasted twice, which OCR'd every crop a second time
# only to store the same values again.)
for box in results.boxes:
    cls_id = int(box.cls[0].item())
    label = label_map[cls_id].lower()

    # Box corners truncated to ints, passed to crop as (left, upper, right, lower).
    x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
    cropped_img = image.crop((x1, y1, x2, y2))

    pixel_values = processor(images=cropped_img, return_tensors="pt").pixel_values
    generated_ids = trocr_model.generate(pixel_values)
    predicted_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

    # 📌 Handle fields based on label
    if label == "signature":
        # Stored as a flag: True whenever OCR produced any text for the crop.
        structured_data["signature"] = bool(predicted_text)
    elif label == "date":
        structured_data["date"] = clean_and_format_date(predicted_text)
    elif label == "amount_in_numbers":
        cleaned = predicted_text.replace(" ", "")
        # Drop a trailing "1-".
        # NOTE(review): presumably an OCR misread of a closing "/-" — confirm.
        cleaned = re.sub(r"1-$", "", cleaned)
        structured_data["amount_in_numbers"] = cleaned
    else:
        # Any other label keeps the raw OCR text under its own name.
        structured_data[label] = predicted_text

# 8️⃣ Print final structured result
print("\n📦 Structured Output:")
for key, value in structured_data.items():
    print(f"{key}: {value}")

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 16,
  "num_channels": 3,
  "num_hidden_layers": 24,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 1024,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.50.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_head


image 1/1 /content/output.jpg: 320x640 1 payee, 1 amount_in_words, 1 amount_in_numbers, 1 date, 1 bank_name, 1 signature, 9.6ms
Speed: 2.0ms preprocess, 9.6ms inference, 1.5ms postprocess per image at shape (1, 3, 320, 640)


In [None]:
# Report every extracted field on its own "name: value" line, under a header.
print("\n📦 Structured Output:")
for field_name, field_value in structured_data.items():
    print(f"{field_name}: {field_value}")


📦 Structured Output:
signature: True
bank_name: AXIS BANK
amount_in_words: Seven lasts sixty seven thousand
amount_in_numbers: 767,000
date: 19 Janv 2016
payee: B. AmuLxA Prased
