In [1]:
pip install PyMuPDF


Note: you may need to restart the kernel to use updated packages.


In [2]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import pandas as pd
import os
import re
from io import BytesIO

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

### Step 1: Run OCR, Get ocr_df, Concat ocr_df into text

In [3]:
def extract_page_number(filename):
    """
    Extract numeric page number from filename like 'page_001.png'
    """
    match = re.search(r'page[_\-]?(\d+)', filename, re.IGNORECASE)
    if match:
        return int(match.group(1))
    return None

def images_to_ocr_df(folder_path):
    """
    Run OCR with bbox info on all images in a folder, using filename-based page numbers.
    """
    ocr_dfs = []
    image_files = sorted([
        f for f in os.listdir(folder_path) 
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    ])

    for filename in image_files:
        image_path = os.path.join(folder_path, filename)
        page_num = extract_page_number(filename)

        img = Image.open(image_path)
        df = pytesseract.image_to_data(img, output_type=pytesseract.Output.DATAFRAME)
        df = df[df.text.notnull()]
        df["text"] = df["text"].astype(str)
        df["page_num"] = page_num if page_num is not None else -1  # fallback

        ocr_dfs.append(df)

    ocr_df = pd.concat(ocr_dfs, ignore_index=True)
    return ocr_df


In [4]:
ocr_df = images_to_ocr_df("BW_12")

In [5]:
ocr_df

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text
0,5,1,1,1,1,1,130,62,215,26,92.216522,Osterreichische
1,5,1,1,1,2,1,76,92,43,73,5.081444,[)
2,5,1,1,1,2,2,130,92,263,35,91.264214,Nationalbibliothek
3,5,1,1,1,2,3,888,100,111,22,89.556755,digitalisiert
4,5,1,1,1,2,4,1007,100,30,17,97.001877,mit
...,...,...,...,...,...,...,...,...,...,...,...,...
101861,5,475,1,1,3,1,415,759,35,62,37.457565,Lore.
101862,5,475,1,1,3,2,425,848,24,7,0.000000,‘:
101863,5,475,2,1,1,1,0,0,1069,1409,95.000000,
101864,5,476,1,1,1,1,1020,276,116,228,95.000000,


In [6]:
from fuzzywuzzy import fuzz

def ocr_df_to_text_simple(ocr_df, correct_brevet=True):
    """
    Simply concatenate OCR results into plain text per page (without layout or visual structure).
    
    Parameters:
        ocr_df (pd.DataFrame): The DataFrame returned from pytesseract.image_to_data(),
                               containing word-level OCR results with columns like 'text', 'page_num', etc.
        correct_brevet (bool): If True, apply fuzzy matching to correct words that resemble 'BREVET'.

    Returns:
        str: A long string simulating the original OCR text, organized by page and ready for regex parsing.
    """
    pages = []

    # Iterate through each page number in order
    for page in sorted(ocr_df.page_num.unique()):
        # Select rows for the current page
        page_df = ocr_df[ocr_df.page_num == page]
        words = []

        # Loop through words on the page
        for word in page_df["text"]:
            # If fuzzy correction is enabled and word is close to 'BREVET', replace it
            if correct_brevet and fuzz.ratio(word.upper(), "BREVET") >= 70:
                words.append("BREVET")
            else:
                words.append(word)

        # Combine all words into a single line of text for the page, with a page header
        page_text = f"=== Extracted Text from page_{page}.png ===\n" + " ".join(words)
        pages.append(page_text)

    # Join all pages with double line breaks between them
    return "\n\n".join(pages)




In [7]:
ocr_text = ocr_df_to_text_simple(ocr_df)

### Step 2: Regex matching and metadata extraction

In [8]:
import re
import pandas as pd

def extract_patent_data(text):
    pattern = (
        r"[\s/\\_\-]*BREVET\s+(?:D[’'`]?|DE)\s*(?P<category>\w+)\s+DE\s+(?P<duration>\w+)\s+ANS[,\\s]*\n?"
        r"(?P<content>.*?)(?=[\s/\\_\-]*BREVET|\Z)"
    )
    matches = re.finditer(pattern, text, flags=re.DOTALL | re.IGNORECASE)
    results = []

    for match in matches:
        data = match.groupdict()
        block_text = match.group(0)
        data['block_text'] = block_text.strip()
        data['word_count'] = len(block_text.split())
        data['char_count'] = len(block_text)

        # Title
        title_match = re.search(r"Pour\s+(.+?)(?=\n|,|Au sieur|Aux sieurs|À madame)", block_text, re.IGNORECASE)
        data['title'] = title_match.group(1).strip() if title_match else "Unknown"

        # Patentee
        patentee_match = re.search(r"(?:Au sieur|Aux sieurs|À madame)\s+([^.,\n]+)", block_text, re.IGNORECASE)
        data['patentee'] = patentee_match.group(1).strip() if patentee_match else "Unknown"

        # Page number
        page_match = re.search(r"=== Extracted Text from page_(\d+)", block_text)
        data['page_num'] = int(page_match.group(1)) if page_match else -1

        results.append(data)

    return pd.DataFrame(results)


In [9]:
df_raw = extract_patent_data(ocr_text)

In [10]:
len(df_raw)

59

### Step 3: Enrich metadata using bbox

In [11]:
def enrich_block_with_bbox(row, ocr_df, top_margin=60, bottom_margin=120):
    # Filter OCR data for the corresponding page
    page_df = ocr_df[ocr_df.page_num == row['page_num']].copy()
    page_df = page_df.sort_values(by='top')

    # Locate the line containing the word 'BREVET'
    brevet_idx = page_df[page_df.text.str.upper() == "BREVET"]
    if brevet_idx.empty:
        return row.to_dict()  # Keep original row if BREVET is not found

    brevet_top = brevet_idx.iloc[0].top

    # Search upward for a date line (typically above BREVET)
    above = page_df[(page_df.top < brevet_top) & (page_df.top > brevet_top - top_margin)]
    date = None
    for line_top in sorted(above.top.unique(), reverse=True):
        line_words = above[above.top == line_top].sort_values(by='left')
        line_text = " ".join(line_words.text.tolist())
        if re.search(r"\b(18|19|20)\d{2}\b", line_text):
            date = line_text
            break

    # Search downward for title (Pour ...) and patentee (Au sieur, À madame, etc.)
    below = page_df[(page_df.top > brevet_top) & (page_df.top < brevet_top + bottom_margin)]
    title = None
    patentee = None
    for line_top in sorted(below.top.unique()):
        line_words = below[below.top == line_top].sort_values(by='left')
        line_text = " ".join(line_words.text.tolist())

        if not title and re.search(r"\bPour\s", line_text, re.IGNORECASE):
            title = line_text

        if not patentee and re.search(r"(Au sieur|Aux sieurs|À madame)", line_text, re.IGNORECASE):
            patentee = line_text

    # Return the enriched block, preserving original values if not found
    return {
        "page_num": row["page_num"],
        "category": row["category"],
        "duration": row["duration"],
        "title": title or row["title"],
        "patentee": patentee or row["patentee"],
        "date": date,
        "word_count": row["word_count"],
        "char_count": row["char_count"],
        "block_text": row["block_text"]
    }


In [12]:
enriched_blocks = [enrich_block_with_bbox(row, ocr_df) for _, row in df_raw.iterrows()]
df_final = pd.DataFrame(enriched_blocks)

In [13]:
len(df_final)

59

In [14]:
print("🔍 title difference：", (df_raw["title"] != df_final["title"]).sum())
print("🔍 patentee difference：", (df_raw["patentee"] != df_final["patentee"]).sum())
print("🗓️  date difference：", df_final["date"].notna().sum())


🔍 title difference： 5
🔍 patentee difference： 0
🗓️  date difference： 1
