In [1]:

import warnings
import cv2
import os
import sys
import pytesseract
import numpy as np
from datasets import DatasetDict
from PIL import Image

# zeige keine Warnungen an
warnings.filterwarnings("ignore")

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.ocr_pipeline import OCRPreprocessor, OCRPostProcessor
from src.utils import rotate_image, pil_to_cv, from_cv_to_pil

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Datensatz initialisieren
dataset = DatasetDict.load_from_disk("../data/interim_rgb")

In [3]:
from typing import Union
class OCRPipeline:
    def __init__(self, image: Union[np.ndarray, Image.Image]):
        """OCR-Pipeline zu Vorbereitung des Dokumentes, 
        Extraktion des Textes und Aufbereitung des extrahierten Textes.

        Args:
            Args:
            image (Union[np.ndarray, Image.Image]): Das Eingangsbild als NumPy-Array oder PIL.Image.Image.
        """
        self.raw_image = image
        self.preprocessed_image = None
        self.ocr_output = ""

    def preprocess(self) -> None:
        """Initialisiert und wendet den OCRPreprocessor an, speichert das verarbeitete Bild."""
        preprocessor = OCRPreprocessor(self.raw_image)
        preprocessor.cropping(buffer_size=10)
        preprocessor.to_gray()
        preprocessor.correct_skew()
        preprocessor.sharpen(kernel_type="laplace_standard")
        preprocessor.opening(kernel=(1,1), iterations=2)
        preprocessor.power_law_transform(gamma=2)
        self.preprocessed_image = preprocessor.get_image()

    def extract_text(self) -> None:
        """Wendet PyTesseract auf das vorverarbeitete Bild an und speichert den Text."""
        self.ocr_output = pytesseract.image_to_string(self.preprocessed_image)

    def postprocess(self) -> None:
        """Initialisiert und wendet den OCRPostProcessor auf den extrahierten Text an."""
        if self.ocr_output.strip():  # Prüft, ob `ocr_output` nicht leer ist
            postprocessor = OCRPostProcessor(self.ocr_output)
            # Anwenden verschiedener Methoden
            postprocessor.identify_language()
            postprocessor.remove_special_characters()
            postprocessor.lowercase()
            postprocessor.remove_stopwords()
            postprocessor.remove_extra_spaces()
            
            # Aufbereiteten OCR-Output extrahieren
            self.ocr_output = postprocessor.get_text()
        else:
            self.ocr_output = "no text found in document image with ocr!"

    def get_output(self):
        """Gibt den aufbereiteten OCR-Output zurück."""
        return self.ocr_output

In [4]:
for split in dataset.keys():
    dataset[split] = dataset[split].add_column("text", [""]*len(dataset[split]))

In [12]:
from datasets import DatasetDict
from tqdm import tqdm
import gc
# Erstellen eines Subsets mit 50 Beispielen aus einem Split
subset = dataset["train"].select(range(50))

# Funktion zum Anwenden der OCR auf ein Subset
def apply_ocr_to_subset(subset):
    for i in tqdm(range(len(subset)), desc=f"Processing subset"):
        example = subset[i]
        
        ocr_pipeline = OCRPipeline(example["image"])
        
        ocr_pipeline.preprocess()
        ocr_pipeline.extract_text()
        ocr_pipeline.postprocess()
        
        subset = subset.map(
            lambda example, idx: {"text": ocr_pipeline.get_output()} if idx == i else example,
            with_indices=True
        )
        
        del ocr_pipeline, example
        gc.collect()
        
    return subset

In [13]:
# Anwenden der OCR auf das Subset
processed_subset = apply_ocr_to_subset(subset)

Map: 100%|██████████| 50/50 [00:01<00:00, 28.25 examples/s]
Map: 100%|██████████| 50/50 [00:02<00:00, 19.25 examples/s]7s/it]
Map: 100%|██████████| 50/50 [00:00<00:00, 182.46 examples/s]s/it]
Map: 100%|██████████| 50/50 [00:01<00:00, 31.00 examples/s]9s/it]
Map: 100%|██████████| 50/50 [00:00<00:00, 103.57 examples/s]s/it]
Map: 100%|██████████| 50/50 [00:00<00:00, 96.98 examples/s] s/it]
Map: 100%|██████████| 50/50 [00:03<00:00, 15.84 examples/s]1s/it]
Map: 100%|██████████| 50/50 [00:00<00:00, 105.46 examples/s]s/it]
Map: 100%|██████████| 50/50 [00:01<00:00, 49.04 examples/s]6s/it]
Map: 100%|██████████| 50/50 [00:00<00:00, 140.61 examples/s]s/it]
Map: 100%|██████████| 50/50 [00:00<00:00, 172.72 examples/s]7s/it]
Map: 100%|██████████| 50/50 [00:00<00:00, 146.65 examples/s]8s/it]
Map: 100%|██████████| 50/50 [00:00<00:00, 142.40 examples/s]3s/it]
Map: 100%|██████████| 50/50 [00:00<00:00, 191.22 examples/s]4s/it]
Map: 100%|██████████| 50/50 [00:00<00:00, 179.37 examples/s]1s/it]
Map: 100%|█

In [14]:
# Überprüfen der Ergebnisse
print(processed_subset[0])
print(processed_subset[1])
print(processed_subset[2])

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1728x2292 at 0xFFFF158D2910>, 'doc_category': 'Letter', 'text': 'tobacco institute 1875 1 street northwest mighael j kerrigan washington dq song vice president 202 457 9800 b00 424 0876 state activities 202 4874888 january 24 1984 dear site enclosed please find uly aligned lobbyist regis tration 1983 85 mr n dean morgan algo accordance item 9 sees sceaeta form attached current list tobacco institute members assessed association dues 500 five hundred dollars per year questions ponuerning infot mation please feel free call office sincerely ry michael j enter fmm enclosures washington state public disclosure commission 403 evergreen plaza fj 42 olympia wa 98504 tnwl 0029158'}
{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1728x2292 at 0xFFFF158D1250>, 'doc_category': 'Report', 'text': 'communicating issues 1975 tobacco institute decided try little experiment previous decade battered media surgeon general report

In [5]:
import gc
from tqdm import tqdm

def apply_ocr_to_dataset(dataset: DatasetDict) -> DatasetDict:
    """
    Diese Methode wendet die OCR (Optical Character Recognition) auf alle Bilder in jedem Split (train, validation, test) eines Huggingface-Datensatzes an und fügt ein neues Feature hinzu, das den erkannten Text enthält.
    """
    for split in dataset.keys():
        for i in tqdm(range(len(dataset[split])), desc=f"Processing {split}"):
            example = dataset[split][i]
            
            ocr_pipeline = OCRPipeline(example["image"])
            
            ocr_pipeline.preprocess()
            ocr_pipeline.extract_text()
            ocr_pipeline.postprocess()
            
            dataset[split][i]["text"] = ocr_pipeline.get_output()
            
            del ocr_pipeline, example
            gc.collect()
            
    return dataset

In [6]:
processed_dataset = apply_ocr_to_dataset(dataset)

Processing train: 100%|██████████| 2436/2436 [2:04:18<00:00,  3.06s/it]  
Processing validation: 100%|██████████| 523/523 [26:32<00:00,  3.04s/it]
Processing test: 100%|██████████| 523/523 [26:03<00:00,  2.99s/it]


In [10]:
processed_dataset["train"][0]

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1728x2292>,
 'doc_category': 'Letter',
 'text': ''}

In [9]:
# Funktion, die prüft, ob der Text leer ist
def is_empty_string(example):
    return example["text"] == ""

# Zählen der leeren Strings in jedem Split
empty_counts = {}
for split in processed_dataset.keys():
    empty_count = sum(1 for example in processed_dataset[split] if is_empty_string(example))
    empty_counts[split] = empty_count

# Ausgabe der Ergebnisse
for split, count in empty_counts.items():
    print(f"Anzahl der leeren Strings im '{split}'-Split: {count}")

Anzahl der leeren Strings im 'train'-Split: 2436
Anzahl der leeren Strings im 'validation'-Split: 523
Anzahl der leeren Strings im 'test'-Split: 523


In [8]:
processed_dataset.save_to_disk("../data/processed")

Saving the dataset (5/5 shards): 100%|██████████| 2436/2436 [01:15<00:00, 32.20 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 523/523 [00:10<00:00, 51.96 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 523/523 [00:22<00:00, 22.98 examples/s]
