# 📄 Brevet Detection with Bounding Boxes

In [11]:
import os
import pytesseract
from PIL import Image
import pandas as pd
from fuzzywuzzy import fuzz


# Point pytesseract at the Tesseract executable. This is a Windows install
# path; adjust it to wherever Tesseract lives on the machine running this.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


## 🔍 OCR Extraction with Bounding Boxes

In [12]:
def extract_ocr_data(image_path):
    """Run Tesseract OCR on one image and return its non-empty text rows.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        The DataFrame produced by ``pytesseract.image_to_data`` restricted to
        rows whose ``text`` is neither missing nor whitespace-only, with
        ``text`` cast to ``str`` and the index reset to 0..n-1.
    """
    # The context manager releases the image's file handle once OCR is done.
    with Image.open(image_path) as image:
        df = pytesseract.image_to_data(
            image, output_type=pytesseract.Output.DATAFRAME
        )

    # Drop missing text BEFORE casting: ``astype(str)`` turns NaN into the
    # literal string "nan", which would otherwise survive the blank filter
    # and pollute every downstream text block.
    df = df[df["text"].notna()].copy()
    df["text"] = df["text"].astype(str)
    df = df[df["text"].str.strip() != ""]

    return df.reset_index(drop=True)


## 🤖 Fuzzy Matching + Location-Based Detection

In [13]:
def is_fuzzy_match(word, target, threshold=70):
    """Tell whether ``word`` is close enough to ``target``.

    Both strings are upper-cased and scored with ``fuzz.ratio``; the result
    is True when that score is at least ``threshold``.
    """
    candidate, reference = word.upper(), target.upper()
    score = fuzz.ratio(candidate, reference)
    return score >= threshold

def detect_brevet_blocks_with_bbox(
    ocr_df,
    x_range=(600, 1000),
    y_range=(500, 600),
    min_words=3,
    target="BREVET",
    threshold=70,
    max_dy=50,
    max_dx=300,
):
    """Detect probable keyword blocks using bounding boxes and fuzzy matching.

    A row is a candidate when its ``left``/``top`` fall inside ``x_range`` /
    ``y_range`` (both bounds inclusive) and its ``text`` fuzzy-matches
    ``target``. For each candidate, the words on the same page whose ``top``
    and ``left`` lie within ``max_dy`` / ``max_dx`` of it are joined,
    ordered by ``left``, into one text block.

    Args:
        ocr_df: DataFrame with ``page_num``, ``left``, ``top`` and ``text``
            columns.
        x_range: ``(min, max)`` bounds on ``left`` for candidate words.
        y_range: ``(min, max)`` bounds on ``top`` for candidate words.
        min_words: Minimum number of whitespace-separated words a block must
            contain to be reported.
        target: Word to look for (defaults to the original "BREVET").
        threshold: Minimum fuzzy score passed to ``is_fuzzy_match``.
        max_dy: Exclusive vertical distance for a word to count as nearby.
        max_dx: Exclusive horizontal distance for a word to count as nearby.

    Returns:
        A list of dicts with keys ``page``, ``top``, ``left`` and
        ``block_text``, one per accepted candidate.
    """
    matches = []
    # A frame with no rows (possibly no columns at all) has nothing to scan.
    if ocr_df.empty:
        return matches

    # Vectorized region filter: only rows inside the box are iterated.
    # ``between`` is inclusive on both ends, matching ``lo <= v <= hi``.
    in_region = (
        ocr_df["left"].between(x_range[0], x_range[1])
        & ocr_df["top"].between(y_range[0], y_range[1])
    )

    for _, row in ocr_df[in_region].iterrows():
        if not is_fuzzy_match(row.text, target, threshold):
            continue

        # Neighbours are searched in the full frame, not just the region,
        # so the block can extend beyond the candidate box.
        nearby = ocr_df[
            (ocr_df["page_num"] == row.page_num)
            & ((ocr_df["top"] - row.top).abs() < max_dy)
            & ((ocr_df["left"] - row.left).abs() < max_dx)
        ]
        block_text = " ".join(nearby.sort_values(by="left")["text"].tolist())
        if len(block_text.split()) >= min_words:
            matches.append({
                "page": row.page_num,
                "top": row.top,
                "left": row.left,
                "block_text": block_text,
            })
    return matches

## 📁 Process All Page Images in a Folder

In [14]:
def process_images_in_folder(folder_path):
    """Run block detection on every image file in a folder.

    Files are visited in sorted name order; only names ending in ``.png``,
    ``.jpg`` or ``.jpeg`` (case-insensitive) are processed. A progress line
    is printed for each processed file.

    Args:
        folder_path: Directory containing the page images.

    Returns:
        A DataFrame with one row per detected block, holding the fields
        returned by ``detect_brevet_blocks_with_bbox`` plus ``image_file``.
        When nothing is detected, an empty DataFrame that still carries the
        ``page``, ``top``, ``left``, ``block_text`` and ``image_file``
        columns is returned, so downstream column access does not fail.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')
    results = []
    for file in sorted(os.listdir(folder_path)):
        if not file.lower().endswith(image_suffixes):
            continue
        path = os.path.join(folder_path, file)
        print(f"Processing {file}...")
        ocr_df = extract_ocr_data(path)
        for block in detect_brevet_blocks_with_bbox(ocr_df):
            block["image_file"] = file
            results.append(block)

    if not results:
        # ``pd.DataFrame([])`` has no columns at all; give the empty result
        # the same schema a non-empty one would have.
        return pd.DataFrame(
            columns=["page", "top", "left", "block_text", "image_file"]
        )
    return pd.DataFrame(results)

## 🚀 Run It

In [17]:
# Example usage:
# Make sure your page images are in the folder named below ('BW_12').
df = process_images_in_folder('BW_12')
# Show the first rows of the detection results in the notebook.
display(df.head())

Processing page_001.png...
Processing page_002.png...
Processing page_003.png...
Processing page_004.png...
Processing page_005.png...
Processing page_006.png...
Processing page_007.png...
Processing page_008.png...
Processing page_009.png...
Processing page_010.png...
Processing page_011.png...
Processing page_012.png...
Processing page_013.png...
Processing page_014.png...
Processing page_015.png...
Processing page_016.png...
Processing page_017.png...
Processing page_018.png...
Processing page_019.png...
Processing page_020.png...
Processing page_021.png...
Processing page_022.png...
Processing page_023.png...
Processing page_024.png...
Processing page_025.png...
Processing page_026.png...
Processing page_027.png...
Processing page_028.png...
Processing page_029.png...
Processing page_030.png...
Processing page_031.png...
Processing page_032.png...
Processing page_033.png...
Processing page_034.png...
Processing page_035.png...
Processing page_036.png...
Processing page_037.png...
P

In [19]:
# Number of detected blocks across all processed images.
len(df)

0