In [None]:
# Check GPU availability: print the NVIDIA driver/GPU status table for this
# runtime. A later cell requests CUDA device '0', so run this first to confirm
# a GPU is actually attached.
!nvidia-smi

In [None]:
# Install required packages (quiet mode). Versions are not pinned, so the
# resolved versions depend on when the cell is run.
!pip install -q torch torchvision
!pip install -q transformers
!pip install -q timm einops
# flash-attn is optional: its error output is discarded and, if the install
# fails, a notice is echoed instead of failing the cell.
!pip install -q flash-attn --no-build-isolation 2>/dev/null || echo "Flash attention not available, using standard attention"
!pip install -q opencv-python pillow matplotlib
!pip install -q scipy pandas

print("\n Packages installed")

In [None]:
# Clone YOLOv7 repository
%cd /content
# Delete any previous checkout so the clone always starts from a clean tree
# (this also discards patches applied to an earlier checkout).
!rm -rf yolov7
!git clone https://github.com/WongKinYiu/yolov7.git
# Stay inside the repo: the patch cell below uses repo-relative file paths.
%cd yolov7

print("\n YOLOv7 repository cloned")

In [None]:
# Fix torch.load for newer PyTorch versions
import os
import re

def _find_call_end(text, open_idx):
    """Return the index of the ``)`` that closes the ``(`` at ``open_idx``.

    Brackets inside single- or double-quoted string literals are ignored.
    Returns -1 when the call is not closed before the end of ``text``.
    """
    depth = 0
    quote = None
    i = open_idx
    n = len(text)
    while i < n:
        ch = text[i]
        if quote is not None:
            if ch == '\\':
                i += 1  # skip the escaped character
            elif ch == quote:
                quote = None
        elif ch in ('"', "'"):
            quote = ch
        elif ch in '([{':
            depth += 1
        elif ch in ')]}':
            depth -= 1
            if depth == 0:
                return i
        i += 1
    return -1


def fix_torch_load(filepath):
    """Add ``weights_only=False`` to the ``torch.load(...)`` calls in a file.

    Every ``torch.load(`` call whose argument list does not already mention
    ``weights_only`` gets ``weights_only=False`` appended as its last
    argument. The closing parenthesis is located by bracket matching, so
    nested calls such as ``map_location=torch.device('cpu')`` are handled, and
    running the function again on an already patched file changes nothing.

    Args:
        filepath: Path of the Python source file to patch in place.

    Returns:
        True if the file was rewritten, False if it does not exist or needed
        no change.
    """
    if not os.path.exists(filepath):
        return False

    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    marker = 'torch.load('
    pieces = []
    copied_upto = 0  # everything before this index is already in `pieces`
    pos = content.find(marker)
    while pos != -1:
        next_search = pos + len(marker)
        prev = content[pos - 1] if pos > 0 else ''
        # Ignore identifiers that merely end in "torch" (e.g. mytorch.load).
        if not (prev.isalnum() or prev == '_'):
            close_idx = _find_call_end(content, next_search - 1)
            if close_idx != -1:
                args = content[next_search:close_idx]
                if args.strip() and 'weights_only' not in args:
                    # Respect an existing trailing comma in the argument list.
                    separator = ' ' if args.rstrip().endswith(',') else ', '
                    pieces.append(content[copied_upto:close_idx])
                    pieces.append(separator + 'weights_only=False')
                    copied_upto = close_idx
                    next_search = close_idx
        pos = content.find(marker, next_search)
    pieces.append(content[copied_upto:])
    patched = ''.join(pieces)

    if patched == content:
        return False

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(patched)
    return True

# Repo-relative source files whose torch.load calls are patched.
files_to_fix = [
    'models/experimental.py',
    'utils/datasets.py',
    'utils/general.py'
]

# Report per file whether it was rewritten ("Fixed") or left alone ("Skipped").
for filepath in files_to_fix:
    status = "Fixed" if fix_torch_load(filepath) else "Skipped"
    print(f"{status}: {filepath}")

print("\n YOLOv7 files patched")

In [None]:
from google.colab import files
import os

print("Please upload your trained YOLOv7 weights (best.pt)...")
uploaded = files.upload()

if not uploaded:
    print(" No weights uploaded")
else:
    # Only the first uploaded file is treated as the weights file.
    weights_filename = next(iter(uploaded))
    YOLO_WEIGHTS = f'/content/yolov7/{weights_filename}'

    # Move the upload into the yolov7 directory unless a file is already
    # present at that path.
    if not os.path.exists(YOLO_WEIGHTS):
        import shutil
        shutil.move(weights_filename, YOLO_WEIGHTS)

    print(f"\n Weights uploaded: {YOLO_WEIGHTS}")

In [None]:
import sys
sys.path.insert(0, '/content/yolov7')

import torch
import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from pathlib import Path

# YOLOv7 imports
from models.experimental import attempt_load
from utils.general import check_img_size, non_max_suppression, scale_coords
from utils.torch_utils import select_device
from utils.datasets import letterbox

print(" YOLOv7 modules imported")

In [None]:
# Load the YOLOv7 detector onto CUDA device 0 and switch it to eval mode.
device = select_device('0')  # Use GPU
yolo_model = attempt_load(YOLO_WEIGHTS, map_location=device)
yolo_model.eval()

# Inference size: 640 checked against the model's largest stride.
stride = int(yolo_model.stride.max())
img_size = check_img_size(640, s=stride)

# Class names live on `.module` when the model is wrapped, else on the model.
names = getattr(yolo_model, 'module', yolo_model).names

print(
    "\n YOLOv7 model loaded!",
    f"   Classes: {names}",
    f"   Image size: {img_size}",
    f"   Device: {device}",
    sep="\n",
)

In [None]:
# Load the Florence-2 processor and model (half precision, eager attention)
from transformers import AutoProcessor, AutoModelForCausalLM

florence_model_id = "microsoft/Florence-2-base"

print("Loading Florence-2 model")
florence_processor = AutoProcessor.from_pretrained(
    florence_model_id,
    trust_remote_code=True,
)
florence_model = AutoModelForCausalLM.from_pretrained(
    florence_model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    attn_implementation="eager",
)
# Same device as the YOLOv7 model selected earlier.
florence_model = florence_model.to(device)
florence_model.eval()

print("\n Florence-2 model loaded")
print(f"   Model: {florence_model_id}")

In [None]:
def run_yolo_detection(image_path, conf_threshold=0.25, iou_threshold=0.45):
    """
    Detect objects in one image file with the globally loaded YOLOv7 model.

    Args:
        image_path: Path to the input image (read with cv2.imread)
        conf_threshold: Confidence threshold passed to NMS
        iou_threshold: IoU threshold passed to NMS

    Returns:
        (original_image, detections) where original_image is the BGR array
        returned by cv2.imread and detections is a list of dicts with keys
        'bbox' ([x1, y1, x2, y2] as ints in original-image coordinates),
        'confidence' (float), 'class_id' (int) and 'class_name'.

    Raises:
        ValueError: If the image cannot be read.
    """
    img0 = cv2.imread(image_path)  # BGR
    if img0 is None:
        raise ValueError(f"Could not load image: {image_path}")

    # letterbox output -> RGB, channels-first -> float tensor scaled by 1/255
    boxed = letterbox(img0, img_size, stride=stride)[0]
    chw_rgb = np.ascontiguousarray(boxed[:, :, ::-1].transpose(2, 0, 1))
    img = torch.from_numpy(chw_rgb).to(device).float() / 255.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)  # add the batch dimension

    with torch.no_grad():
        pred = yolo_model(img)[0]
    pred = non_max_suppression(pred, conf_threshold, iou_threshold)

    detections = []
    for det in pred:
        if not len(det):
            continue

        # Map boxes from the network input size back to the original image.
        det[:, :4] = scale_coords(img.shape[2:], det[:, :4], img0.shape).round()

        for *xyxy, conf, cls in det:
            x1, y1, x2, y2 = (int(v) for v in xyxy)
            class_id = int(cls)
            detections.append({
                'bbox': [x1, y1, x2, y2],
                'confidence': float(conf),
                'class_id': class_id,
                'class_name': names[class_id]
            })

    return img0, detections

print(" YOLOv7 detection function defined")

In [None]:
def run_florence_interpretation(image, task="<OCR>"):
    """
    Run one Florence-2 task prompt on an image and return the parsed result.

    Args:
        image: A PIL image, or a NumPy array. Three-channel arrays are
            converted from BGR to RGB before being wrapped as a PIL image.
        task: Florence-2 task prompt string, e.g. "<OCR>" or "<CAPTION>"

    Returns:
        The value returned by the processor's post_process_generation for
        this task (callers in this notebook read it with `.get(task)`).
    """
    if isinstance(image, np.ndarray):
        is_three_channel = image.ndim == 3 and image.shape[2] == 3
        pixels = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) if is_three_channel else image
        image = Image.fromarray(pixels)

    # Tokenize the prompt and preprocess the image; cast to the model's fp16.
    inputs = florence_processor(text=task, images=image, return_tensors="pt")
    inputs = inputs.to(device, torch.float16)

    # Beam search without sampling. The KV cache is disabled on purpose: it
    # was turned off as a workaround for a generation error.
    generation_kwargs = dict(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
        do_sample=False,
        use_cache=False,
    )
    with torch.no_grad():
        generated_ids = florence_model.generate(**generation_kwargs)

    # Special tokens are kept because post-processing parses them.
    decoded = florence_processor.batch_decode(generated_ids, skip_special_tokens=False)

    return florence_processor.post_process_generation(
        decoded[0],
        task=task,
        image_size=(image.width, image.height)
    )

print(" Florence-2 interpretation function updated")

In [None]:
def process_image_pipeline(image_path, conf_threshold=0.25, padding=10):
    """
    Full pipeline: YOLOv7 detection -> Florence-2 interpretation.

    Each detected box is cropped (with padding, clipped to the image), read
    with Florence-2 "<OCR>" and "<CAPTION>", and drawn on a copy of the image
    together with a text label.

    Args:
        image_path: Path to input image
        conf_threshold: Confidence threshold for YOLO
        padding: Extra pixels around detected region for cropping

    Returns:
        results: Dictionary with keys 'image_path', 'original_image' (BGR),
            'annotated_image' (BGR copy with boxes and labels) and
            'detections', a list of dicts with 'bbox', 'confidence',
            'class_name', 'crop', 'ocr_text' and 'caption'.
    """
    print(f"\n{'='*60}")
    print(f"Processing: {Path(image_path).name}")
    print('='*60)

    # Step 1: YOLOv7 Detection
    print("\n Step 1: Running YOLOv7 detection...")
    original_img, detections = run_yolo_detection(image_path, conf_threshold)
    print(f"   Found {len(detections)} door plaque(s)")

    results = {
        'image_path': image_path,
        'original_image': original_img,
        'detections': [],
        'annotated_image': original_img.copy()
    }

    if len(detections) == 0:
        print("   No plaques detected.")
        return results

    # Step 2: Process each detection with Florence-2
    print("\n Step 2: Running Florence-2 interpretation...")

    h, w = original_img.shape[:2]
    annotated = results['annotated_image']

    for i, det in enumerate(detections):
        x1, y1, x2, y2 = det['bbox']

        # Add padding and clip to image bounds
        x1_pad = max(0, x1 - padding)
        y1_pad = max(0, y1 - padding)
        x2_pad = min(w, x2 + padding)
        y2_pad = min(h, y2 + padding)

        # Crop detected region
        crop = original_img[y1_pad:y2_pad, x1_pad:x2_pad]

        print(f"\n   Plaque {i+1}:")
        print(f"   - BBox: [{x1}, {y1}, {x2}, {y2}]")
        print(f"   - Confidence: {det['confidence']:.2%}")

        # OCR
        ocr_result = run_florence_interpretation(crop, "<OCR>")
        ocr_text = ocr_result.get('<OCR>', '')
        print(f"   - OCR Text: {ocr_text}")

        # Caption (for additional context)
        caption_result = run_florence_interpretation(crop, "<CAPTION>")
        caption = caption_result.get('<CAPTION>', '')
        print(f"   - Caption: {caption}")

        # Store results
        results['detections'].append({
            'bbox': det['bbox'],
            'confidence': det['confidence'],
            'class_name': det['class_name'],
            'crop': crop,
            'ocr_text': ocr_text,
            'caption': caption
        })

        # Draw on annotated image
        cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Label: OCR text truncated to 30 characters, or a fallback name
        label = f"{ocr_text[:30]}..." if len(ocr_text) > 30 else ocr_text
        label = label if label else f"Plaque {i+1}"

        (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)

        # The label normally sits directly above the box. When the box is too
        # close to the top edge there is no room above it, so the label is
        # drawn just inside the box instead of off-image.
        label_top = y1 - th - 10
        if label_top < 0:
            label_top = y1
        label_bottom = label_top + th + 10

        # Filled background, then the text on top of it
        cv2.rectangle(annotated,
                      (x1, label_top), (x1 + tw + 10, label_bottom),
                      (0, 255, 0), -1)
        cv2.putText(annotated, label,
                    (x1 + 5, label_bottom - 5), cv2.FONT_HERSHEY_SIMPLEX,
                    0.6, (0, 0, 0), 2)

    print(f"\n Processing complete")
    return results

print(" Full pipeline function defined")

In [None]:
def visualize_results(results, figsize=(16, 10)):
    """
    Visualize pipeline results.

    Shows the annotated image on the left half of the figure and one small
    panel per detected crop (titled with its OCR text) on the right half,
    then prints a per-plaque text summary. With no detections, only the
    original image is shown.

    Args:
        results: Dictionary with 'original_image', 'annotated_image' (BGR
            arrays) and 'detections' (dicts with 'crop', 'ocr_text',
            'confidence', 'bbox' and 'caption').
        figsize: Figure size used when there is at least one detection.
    """
    detections = results['detections']
    n_detections = len(detections)

    if n_detections == 0:
        # Just show original image
        plt.figure(figsize=(10, 8))
        plt.imshow(cv2.cvtColor(results['original_image'], cv2.COLOR_BGR2RGB))
        plt.title('No plaques detected')
        plt.axis('off')
        plt.show()
        return

    # Create figure with subplots
    fig = plt.figure(figsize=figsize)

    # Main annotated image
    ax1 = fig.add_subplot(1, 2, 1)
    ax1.imshow(cv2.cvtColor(results['annotated_image'], cv2.COLOR_BGR2RGB))
    ax1.set_title(f'Detected Plaques ({n_detections} found)', fontsize=14)
    ax1.axis('off')

    # Right half: an empty axis that only carries the section title
    ax2 = fig.add_subplot(1, 2, 2)
    ax2.axis('off')
    ax2.set_title('OCR Results', fontsize=14)

    # Crops are laid out on an n_rows x (2 + n_cols) grid. The first two grid
    # columns of every row are reserved for the left panel, so crops may only
    # use columns 3.. of each row.
    n_cols = min(2, n_detections)
    n_rows = (n_detections + n_cols - 1) // n_cols
    grid_cols = 2 + n_cols

    for i, det in enumerate(detections):
        # Subplot indices are 1-based and row-major; offsetting by
        # row * grid_cols keeps later rows out of the reserved columns.
        row, col = divmod(i, n_cols)
        ax_crop = fig.add_subplot(n_rows, grid_cols, row * grid_cols + 3 + col)
        ax_crop.imshow(cv2.cvtColor(det['crop'], cv2.COLOR_BGR2RGB))

        ocr_text = det['ocr_text'] if det['ocr_text'] else '(no text)'
        # Truncate long text
        if len(ocr_text) > 50:
            ocr_text = ocr_text[:47] + '...'

        ax_crop.set_title(f"Plaque {i+1}\n{ocr_text}", fontsize=10, wrap=True)
        ax_crop.axis('off')

    plt.tight_layout()
    plt.show()

    # Print detailed results
    print("\n" + "="*60)
    print("DETAILED RESULTS")
    print("="*60)

    for i, det in enumerate(detections):
        print(f"\n Plaque {i+1}:")
        print(f"   Confidence: {det['confidence']:.2%}")
        print(f"   Bounding Box: {det['bbox']}")
        print(f"   OCR Text: {det['ocr_text']}")
        print(f"   Caption: {det['caption']}")

print(" Visualization function defined")

In [None]:
def process_image_pipeline_timed(image_path, conf_threshold=0.25, padding=10):
    """
    Full pipeline with detailed timing.

    Runs YOLOv7 detection, then Florence-2 "<OCR>" and "<CAPTION>" on every
    padded crop, and records how long each stage took.

    Args:
        image_path: Path to input image
        conf_threshold: Confidence threshold for YOLO
        padding: Extra pixels around detected region for cropping

    Returns:
        results: Dictionary with 'image_path', 'original_image',
            'annotated_image', 'detections' (dicts with 'bbox', 'confidence',
            'class_name', 'crop', 'ocr_text', 'caption') and 'timings', a
            dict with the keys 'yolo', 'florence_ocr' and 'florence_caption'
            holding seconds (Florence values are summed over all crops and
            are 0 when nothing was detected).
    """
    import time
    timings = {}

    print(f"\n{'='*60}")
    print(f"Processing: {Path(image_path).name}")
    print('='*60)

    print("\n Step 1: Running YOLOv7 detection...")
    # perf_counter is monotonic and high resolution, which suits intervals
    # better than wall-clock time.time().
    t0 = time.perf_counter()
    original_img, detections = run_yolo_detection(image_path, conf_threshold)
    timings['yolo'] = time.perf_counter() - t0
    print(f"   Found {len(detections)} door plaque(s) ({timings['yolo']:.3f}s)")

    # Both Florence keys always exist so callers can aggregate without checks.
    timings['florence_ocr'] = 0
    timings['florence_caption'] = 0

    results = {
        'image_path': image_path,
        'original_image': original_img,
        'detections': [],
        'annotated_image': original_img.copy(),
        'timings': timings
    }

    if len(detections) == 0:
        print("   No plaques detected.")
        return results

    print("\n Step 2: Running Florence-2 interpretation...")

    h, w = original_img.shape[:2]
    annotated = results['annotated_image']

    for i, det in enumerate(detections):
        x1, y1, x2, y2 = det['bbox']

        # Add padding and clip to image bounds
        x1_pad = max(0, x1 - padding)
        y1_pad = max(0, y1 - padding)
        x2_pad = min(w, x2 + padding)
        y2_pad = min(h, y2 + padding)

        # Crop detected region
        crop = original_img[y1_pad:y2_pad, x1_pad:x2_pad]

        print(f"\n   Plaque {i+1}:")
        print(f"   - BBox: [{x1}, {y1}, {x2}, {y2}]")
        print(f"   - Confidence: {det['confidence']:.2%}")

        # OCR with timing
        t0 = time.perf_counter()
        ocr_result = run_florence_interpretation(crop, "<OCR>")
        timings['florence_ocr'] += time.perf_counter() - t0
        ocr_text = ocr_result.get('<OCR>', '')
        print(f"   - OCR Text: {ocr_text}")

        # Caption with timing
        t0 = time.perf_counter()
        caption_result = run_florence_interpretation(crop, "<CAPTION>")
        timings['florence_caption'] += time.perf_counter() - t0
        caption = caption_result.get('<CAPTION>', '')
        print(f"   - Caption: {caption}")

        # Store results
        results['detections'].append({
            'bbox': det['bbox'],
            'confidence': det['confidence'],
            'class_name': det['class_name'],
            'crop': crop,
            'ocr_text': ocr_text,
            'caption': caption
        })

        # Draw on annotated image
        cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Label: OCR text truncated to 30 characters, or a fallback name
        label = f"{ocr_text[:30]}..." if len(ocr_text) > 30 else ocr_text
        label = label if label else f"Plaque {i+1}"

        (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2)

        # The label normally sits directly above the box. When the box is too
        # close to the top edge there is no room above it, so the label is
        # drawn just inside the box instead of off-image.
        label_top = y1 - th - 10
        if label_top < 0:
            label_top = y1
        label_bottom = label_top + th + 10

        cv2.rectangle(annotated,
                      (x1, label_top), (x1 + tw + 10, label_bottom),
                      (0, 255, 0), -1)
        cv2.putText(annotated, label,
                    (x1 + 5, label_bottom - 5), cv2.FONT_HERSHEY_SIMPLEX,
                    0.6, (0, 0, 0), 2)

    # Print timing summary for this image
    total = timings['yolo'] + timings['florence_ocr'] + timings['florence_caption']
    print(f"\n    Timing: YOLO={timings['yolo']:.2f}s | OCR={timings['florence_ocr']:.2f}s | Caption={timings['florence_caption']:.2f}s | Total={total:.2f}s")

    return results

print(" Timed pipeline function defined")

In [None]:
from google.colab import files
import zipfile
import os

print("Upload your dataset ZIP file")
uploaded = files.upload()

if uploaded:
    # Only the first uploaded file is treated as the dataset archive.
    zip_filename = list(uploaded.keys())[0]
    print(f"\n Extracting {zip_filename}...")

    # Extract
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall('/content/dataset')

    # Find actual dataset path: if the archive unpacked into a single folder,
    # use that folder as the dataset root. Archive metadata entries
    # ('__MACOSX', dot-files) are ignored so they do not hide that folder.
    dataset_contents = [
        entry for entry in os.listdir('/content/dataset')
        if entry != '__MACOSX' and not entry.startswith('.')
    ]
    if len(dataset_contents) == 1 and os.path.isdir(f'/content/dataset/{dataset_contents[0]}'):
        DATASET_PATH = f'/content/dataset/{dataset_contents[0]}'
    else:
        DATASET_PATH = '/content/dataset'

    print(f" Dataset extracted to: {DATASET_PATH}")

    # Set image paths
    TRAIN_IMAGES = os.path.join(DATASET_PATH, 'train/images')
    VALID_IMAGES = os.path.join(DATASET_PATH, 'valid/images')
    TEST_IMAGES = os.path.join(DATASET_PATH, 'test/images')

    # Get test images: match extensions case-insensitively (so .JPG and .jpeg
    # are not silently dropped) and sort for a reproducible processing order.
    import glob
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp'}
    test_images = sorted(
        path for path in glob.glob(f"{TEST_IMAGES}/*")
        if os.path.splitext(path)[1].lower() in image_extensions
    )

    print(f"\n Found {len(test_images)} test images")
    print(f"First image: {test_images[0] if test_images else 'None'}")

In [None]:
# Run pipeline on all uploaded images
# `all_results` collects one result dict per test image; it is reset here, so
# re-running the cell starts from an empty list.
all_results = []

for image_path in test_images:
    results = process_image_pipeline(
        image_path,
        conf_threshold=0.25,  # Adjust as needed
        padding=15             # Extra pixels around crops
    )
    all_results.append(results)
    # Show the figure and text summary for this image right after processing.
    visualize_results(results)

In [None]:
from tqdm import tqdm

# Re-run every test image through the timed pipeline (replaces all_results).
all_results = []

for image_path in tqdm(test_images, desc="Processing"):
    results = process_image_pipeline_timed(
        image_path,
        conf_threshold=0.25,
        padding=15
    )
    all_results.append(results)

# Aggregate timing stats (one value per image for each component)
yolo_times = [r['timings']['yolo'] for r in all_results]
ocr_times = [r['timings']['florence_ocr'] for r in all_results]
caption_times = [r['timings']['florence_caption'] for r in all_results]
total_times = [sum(r['timings'].values()) for r in all_results]

if not all_results:
    # Averages, min and max are undefined without at least one image.
    print("No images were processed; skipping timing summary.")
else:
    row_format = "{:<20} {:>9.2f}s {:>9.2f}s {:>9.2f}s {:>9.2f}s"
    component_rows = [
        ('YOLOv7', yolo_times),
        ('Florence-2 OCR', ocr_times),
        ('Florence-2 Caption', caption_times),
    ]

    print(f"\n{'='*60}")
    print(f"  TIMING SUMMARY ({len(all_results)} images)")
    print(f"{'='*60}")
    print(f"{'Component':<20} {'Total':>10} {'Average':>10} {'Min':>10} {'Max':>10}")
    print(f"{'-'*60}")
    for label, times in component_rows:
        print(row_format.format(label, sum(times), sum(times) / len(times), min(times), max(times)))
    print(f"{'-'*60}")
    print(row_format.format('TOTAL', sum(total_times), sum(total_times) / len(total_times),
                            min(total_times), max(total_times)))

    total_elapsed = sum(total_times)
    if total_elapsed > 0:
        print(f"\n Throughput: {len(all_results)/total_elapsed:.2f} images/second")

In [None]:
def process_folder(folder_path, output_dir=None, conf_threshold=0.25):
    """
    Process all images in a folder.

    Images are processed in sorted filename order. A failure on one image is
    reported and does not stop the remaining images.

    Args:
        folder_path: Path to folder containing images
        output_dir: Optional output directory for annotated images
        conf_threshold: Confidence threshold for detection

    Returns:
        results_df: DataFrame with one row per detected plaque and the
            columns 'image', 'plaque_id', 'confidence', 'bbox', 'ocr_text'
            and 'caption'. The columns are present even when nothing was
            detected.
    """
    import pandas as pd
    from pathlib import Path

    folder = Path(folder_path)
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp'}
    columns = ['image', 'plaque_id', 'confidence', 'bbox', 'ocr_text', 'caption']

    # Regular files only, in a deterministic order (iterdir order is arbitrary).
    images = sorted(
        f for f in folder.iterdir()
        if f.is_file() and f.suffix.lower() in image_extensions
    )

    print(f"Found {len(images)} images in {folder_path}")

    output_path = None
    if output_dir:
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

    all_records = []

    for img_path in images:
        try:
            results = process_image_pipeline(
                str(img_path),
                conf_threshold=conf_threshold
            )

            # Save annotated image if output_dir specified
            if output_path is not None:
                out_path = output_path / f"annotated_{img_path.name}"
                # cv2.imwrite reports failure through its return value.
                if not cv2.imwrite(str(out_path), results['annotated_image']):
                    print(f"Warning: could not write {out_path}")

            # Record results
            for i, det in enumerate(results['detections']):
                all_records.append({
                    'image': img_path.name,
                    'plaque_id': i + 1,
                    'confidence': det['confidence'],
                    'bbox': str(det['bbox']),
                    'ocr_text': det['ocr_text'],
                    'caption': det['caption']
                })

        except Exception as e:
            # Best effort: report the failure and continue with the next image.
            print(f"Error processing {img_path.name}: {e}")

    # Explicit columns keep the schema stable when no plaques were found.
    results_df = pd.DataFrame(all_records, columns=columns)

    print(f"\n Processed {len(images)} images, found {len(all_records)} plaques")

    return results_df

print(" Batch processing function defined")

In [None]:
# Try several Florence-2 task prompts on the first crop of the first image.
if not (all_results and all_results[0]['detections']):
    print("No detections available for testing.")
else:
    sample_crop = all_results[0]['detections'][0]['crop']

    print("Testing different Florence-2 tasks on a sample crop:\n")

    # (task prompt, human-readable name)
    tasks = [
        ("<OCR>", "OCR (text extraction)"),
        ("<OCR_WITH_REGION>", "OCR with regions"),
        ("<CAPTION>", "Caption"),
        ("<DETAILED_CAPTION>", "Detailed Caption"),
        ("<MORE_DETAILED_CAPTION>", "Very Detailed Caption"),
    ]

    for task_prompt, task_name in tasks:
        try:
            result = run_florence_interpretation(sample_crop, task_prompt)
        except Exception as e:
            # A failing task is reported inline; the remaining tasks still run.
            print(f" {task_name}: {e}\n")
        else:
            # Fall back to the whole result if it is not keyed by the prompt.
            output = result.get(task_prompt, result)
            print(f" {task_name}:")
            print(f"   {output}\n")

In [None]:
import json
import pandas as pd

def export_results(all_results, output_prefix='plaque_results'):
    """
    Export results to CSV and JSON files.

    Writes `<output_prefix>.csv` and `<output_prefix>.json` with one record
    per detected plaque.

    Args:
        all_results: Iterable of pipeline result dicts, each with
            'image_path' and 'detections' (dicts with 'bbox', 'confidence',
            'ocr_text' and 'caption').
        output_prefix: Path prefix (without extension) for both output files.

    Returns:
        DataFrame with the columns 'image', 'plaque_id', 'confidence', 'x1',
        'y1', 'x2', 'y2', 'ocr_text' and 'caption'. The columns (and the CSV
        header) are present even when there are no detections.
    """
    columns = ['image', 'plaque_id', 'confidence',
               'x1', 'y1', 'x2', 'y2', 'ocr_text', 'caption']
    records = []

    for result in all_results:
        image_name = Path(result['image_path']).name

        for plaque_id, det in enumerate(result['detections'], start=1):
            bbox = det['bbox']
            records.append({
                'image': image_name,
                'plaque_id': plaque_id,
                'confidence': det['confidence'],
                'x1': bbox[0],
                'y1': bbox[1],
                'x2': bbox[2],
                'y2': bbox[3],
                'ocr_text': det['ocr_text'],
                'caption': det['caption']
            })

    # Explicit columns: an empty result still yields a readable CSV header.
    df = pd.DataFrame(records, columns=columns)

    # Save CSV
    csv_path = f'{output_prefix}.csv'
    df.to_csv(csv_path, index=False)
    print(f" Saved: {csv_path}")

    # Save JSON as UTF-8 so non-ASCII OCR text stays readable in the file
    # regardless of the platform's default encoding.
    json_path = f'{output_prefix}.json'
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(records, f, indent=2, ensure_ascii=False)
    print(f" Saved: {json_path}")

    return df

# Export: runs only when at least one image result has been collected.
if all_results:
    results_df = export_results(all_results)
    display(results_df)

    # Download files. The names below match export_results' default
    # output_prefix ('plaque_results'); change both together.
    from google.colab import files
    files.download('plaque_results.csv')
    files.download('plaque_results.json')

In [None]:
%%writefile inference_pipeline.py
"""
Door Plaque Detection & Interpretation Pipeline
YOLOv7 + Florence-2

Usage:
    python inference_pipeline.py --image path/to/image.jpg --weights path/to/best.pt
"""

import argparse
import sys
import os
import torch
import cv2
import numpy as np
from PIL import Image
from pathlib import Path

def load_models(yolo_weights, device='cuda'):
    """Load YOLOv7 and Florence-2 models.

    The directory containing `yolo_weights` is put on `sys.path` and must be
    a YOLOv7 checkout, because the YOLOv7 modules are imported from it.

    Args:
        yolo_weights: Path to the YOLOv7 weights file.
        device: 'cpu', 'cuda', 'cuda:N', or a CUDA index string such as '0'.

    Returns:
        Dict with the keys 'yolo', 'florence', 'florence_processor',
        'device', 'stride', 'img_size' and 'names'.
    """
    # Add YOLOv7 to path
    yolov7_path = Path(yolo_weights).parent
    sys.path.insert(0, str(yolov7_path))

    from models.experimental import attempt_load
    from utils.general import check_img_size
    from utils.torch_utils import select_device

    # Translate torch-style names into the index form ('0', '1', ...) before
    # calling select_device.
    # NOTE(review): YOLOv7's select_device appears to accept only 'cpu' or
    # CUDA indices - confirm against utils/torch_utils.py in the checkout.
    device_spec = str(device).strip().lower()
    if device_spec == 'cuda':
        device_spec = '0'
    elif device_spec.startswith('cuda:'):
        device_spec = device_spec[len('cuda:'):]

    # Load YOLOv7
    device = select_device(device_spec)
    yolo = attempt_load(yolo_weights, map_location=device)
    yolo.eval()
    stride = int(yolo.stride.max())
    img_size = check_img_size(640, s=stride)
    # Class names live on `.module` when the model is wrapped.
    names = yolo.module.names if hasattr(yolo, 'module') else yolo.names

    # Load Florence-2 (half precision). Eager attention is requested so the
    # model does not depend on the optional flash-attn package.
    from transformers import AutoProcessor, AutoModelForCausalLM
    florence_processor = AutoProcessor.from_pretrained(
        "microsoft/Florence-2-base", trust_remote_code=True
    )
    florence = AutoModelForCausalLM.from_pretrained(
        "microsoft/Florence-2-base",
        trust_remote_code=True,
        torch_dtype=torch.float16,
        attn_implementation="eager"
    ).to(device)
    florence.eval()

    return {
        'yolo': yolo,
        'florence': florence,
        'florence_processor': florence_processor,
        'device': device,
        'stride': stride,
        'img_size': img_size,
        'names': names
    }

def _read_plaque_text(crop_bgr, models):
    """Run Florence-2 "<OCR>" on one BGR crop and return the recognized text."""
    processor = models['florence_processor']
    florence = models['florence']

    crop_pil = Image.fromarray(cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB))

    # Cast inputs to the dtype the model was loaded with instead of assuming it.
    inputs = processor(
        text="<OCR>", images=crop_pil, return_tensors="pt"
    ).to(models['device'], florence.dtype)

    with torch.no_grad():
        generated = florence.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=512,
            # KV cache disabled as a workaround for a generation error
            # NOTE(review): confirm it is still needed with the installed
            # transformers version.
            use_cache=False
        )

    # Special tokens are kept because post-processing parses them.
    text = processor.batch_decode(generated, skip_special_tokens=False)[0]
    ocr_result = processor.post_process_generation(
        text, task="<OCR>", image_size=(crop_pil.width, crop_pil.height)
    )
    return ocr_result.get('<OCR>', '')


def process_image(image_path, models, conf_threshold=0.25):
    """Process a single image through the pipeline.

    Args:
        image_path: Path to the input image.
        models: Dict returned by load_models().
        conf_threshold: Confidence threshold passed to NMS.

    Returns:
        List of dicts with 'bbox' ([x1, y1, x2, y2] ints in original-image
        coordinates), 'confidence' (float) and 'ocr_text' (str; empty when
        the box has no area).

    Raises:
        ValueError: If the image cannot be read.
    """
    from utils.general import non_max_suppression, scale_coords
    from utils.datasets import letterbox

    # Load image (cv2.imread returns None instead of raising on failure)
    img0 = cv2.imread(image_path)
    if img0 is None:
        raise ValueError(f"Could not load image: {image_path}")

    # YOLOv7 detection: letterbox output -> RGB, channels-first -> float batch
    img = letterbox(img0, models['img_size'], stride=models['stride'])[0]
    img = img[:, :, ::-1].transpose(2, 0, 1)
    img = np.ascontiguousarray(img)
    img = torch.from_numpy(img).to(models['device']).float() / 255.0
    img = img.unsqueeze(0)

    with torch.no_grad():
        pred = models['yolo'](img)[0]
    pred = non_max_suppression(pred, conf_threshold, 0.45)

    results = []
    for det in pred:
        if not len(det):
            continue

        # Map boxes from the network input size back to the original image.
        det[:, :4] = scale_coords(img.shape[2:], det[:, :4], img0.shape).round()
        for *xyxy, conf, cls in det:
            x1, y1, x2, y2 = [int(x) for x in xyxy]
            crop = img0[y1:y2, x1:x2]

            # A zero-area box gives an empty crop that cannot be converted or
            # read; keep the detection but skip OCR for it.
            ocr_text = _read_plaque_text(crop, models) if crop.size else ''

            results.append({
                'bbox': [x1, y1, x2, y2],
                'confidence': float(conf),
                'ocr_text': ocr_text
            })

    return results

if __name__ == '__main__':
    # Command-line entry point: load both models, run one image, print the
    # OCR text of every detected plaque.
    cli = argparse.ArgumentParser()
    cli.add_argument('--image', required=True, help='Path to input image')
    cli.add_argument('--weights', required=True, help='Path to YOLOv7 weights')
    cli.add_argument('--conf', type=float, default=0.25, help='Confidence threshold')
    cli.add_argument('--device', default='cuda', help='Device (cuda/cpu)')
    args = cli.parse_args()

    print("Loading models...")
    models = load_models(args.weights, args.device)

    print(f"Processing {args.image}...")
    results = process_image(args.image, models, args.conf)

    print(f"\nFound {len(results)} plaque(s):")
    for rank, hit in enumerate(results, start=1):
        print(f"  {rank}. [{hit['confidence']:.2%}] {hit['ocr_text']}")

print("\n Standalone script saved as 'inference_pipeline.py'")