**(ground_truth=.txt)Claude**

**1. docTR**

In [None]:
!pip install python-doctr



In [None]:
!pip install tf2onnx



In [None]:
import os
import time
from doctr.models import ocr_predictor
from doctr.io import DocumentFile

# Build the docTR OCR predictor once at module level (pretrained=True);
# extract_text_from_image() below reuses this single instance for every image.
ocr_model = ocr_predictor(pretrained=True)

def extract_text_from_image(image_path):
    """Run the module-level docTR predictor on one image and return its text.

    The OCR result is walked page -> block -> line -> word, and every word's
    ``value`` is joined with a single space in traversal order.
    """
    document = DocumentFile.from_images(image_path)
    ocr_result = ocr_model(document)
    words = [
        word.value
        for page in ocr_result.pages
        for block in page.blocks
        for line in block.lines
        for word in line.words
    ]
    return ' '.join(words)

def read_text_file(file_path):
    """Return the complete contents of ``file_path``, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def calculate_metrics(extracted_text, reference_text):
    """Compute word-level precision, recall and F1 of OCR output against a reference.

    Both texts are split on whitespace and compared exactly (case-sensitive).
    A word is credited as correct at most as many times as it occurs in the
    reference (multiset overlap), so ``correct_words`` never exceeds either
    word count and precision/recall stay within [0, 1].

    Args:
        extracted_text: Text produced by the OCR engine.
        reference_text: Ground-truth text.

    Returns:
        dict with keys 'precision', 'recall', 'f1', 'correct_words',
        'extracted_words' and 'reference_words'.
    """
    from collections import Counter  # function-scope import: not imported by this cell

    extracted_words = extracted_text.split()
    reference_words = reference_text.split()

    # Multiset intersection keeps min(count in extracted, count in reference)
    # per word, so a word repeated by the OCR engine is not over-credited.
    overlap = Counter(extracted_words) & Counter(reference_words)
    correct_words = sum(overlap.values())

    total_extracted = len(extracted_words)
    total_reference = len(reference_words)

    # Guard every division so empty texts yield 0.0 instead of raising.
    precision = 0.0 if total_extracted == 0 else correct_words / total_extracted
    recall = 0.0 if total_reference == 0 else correct_words / total_reference
    f1 = 0.0 if precision + recall == 0 else 2 * (precision * recall) / (precision + recall)

    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'correct_words': correct_words,
        'extracted_words': total_extracted,
        'reference_words': total_reference
    }

# Directories containing the images and text files
image_dir = r'/content/images_analyse'
text_dir = r'/content/ground_truth'

# Running totals; divided by num_files after the loop to get per-file averages
total_precision = 0.0
total_recall = 0.0
total_f1 = 0.0
num_files = 0
total_execution_time = 0.0

# Supported image extensions (matched case-insensitively below)
image_extensions = ('.jpg', '.jpeg', '.png')

# os.listdir() returns names in arbitrary order; sort them so the per-file
# report is printed in the same order on every run.
for image_filename in sorted(os.listdir(image_dir)):
    # Check if file is a supported image type
    if image_filename.lower().endswith(image_extensions):
        # The ground-truth file shares the image's base name, with a .txt extension
        base_filename = os.path.splitext(image_filename)[0]

        # Construct the full paths
        image_path = os.path.join(image_dir, image_filename)
        text_file_path = os.path.join(text_dir, f"{base_filename}.txt")

        # Only images with a matching ground-truth file are evaluated
        if os.path.exists(text_file_path):
            try:
                start_time = time.time()

                # Extract text from the image
                extracted_text = extract_text_from_image(image_path)

                # Read text from the pre-existing text file
                pre_existing_text = read_text_file(text_file_path)

                # Calculate metrics
                metrics = calculate_metrics(extracted_text, pre_existing_text)

                # Add to totals
                total_precision += metrics['precision']
                total_recall += metrics['recall']
                total_f1 += metrics['f1']
                num_files += 1

                # Elapsed time covers OCR, reading the reference and scoring
                execution_time = time.time() - start_time
                total_execution_time += execution_time

                # Print individual file results with word counts
                print(f"Processed {image_filename}:")
                print(f"Reference words: {metrics['reference_words']}")
                print(f"Extracted words: {metrics['extracted_words']}")
                print(f"Correct words: {metrics['correct_words']}")
                print(f"Precision: {metrics['precision']:.2f}")
                print(f"Recall: {metrics['recall']:.2f}")
                print(f"F1 Score: {metrics['f1']:.2f}")
                print(f"Execution Time: {execution_time:.2f} seconds\n")

            except Exception as e:
                # A failing file is reported and skipped; it does not count towards the averages
                print(f"Error processing {image_filename}: {str(e)}")
        else:
            print(f"Text file not found for image: {image_filename}")

# Calculate and output the averages (unweighted mean over processed files)
if num_files > 0:
    average_precision = total_precision / num_files
    average_recall = total_recall / num_files
    average_f1 = total_f1 / num_files
    average_execution_time = total_execution_time / num_files

    print("\nSummary Statistics:")
    print(f"Number of files processed: {num_files}")
    print(f"Average Precision: {average_precision:.2f}")
    print(f"Average Recall: {average_recall:.2f}")
    print(f"Average F1 Score: {average_f1:.2f}")
    print(f"Average Execution Time: {average_execution_time:.2f} seconds per file")
else:
    print("No matching text files found for any images.")



Processed 8.jpeg:
Reference words: 119
Extracted words: 103
Correct words: 77
Precision: 0.75
Recall: 0.65
F1 Score: 0.69
Execution Time: 14.98 seconds

Processed 17.jpeg:
Reference words: 149
Extracted words: 140
Correct words: 123
Precision: 0.88
Recall: 0.83
F1 Score: 0.85
Execution Time: 19.69 seconds

Processed 7.jpeg:
Reference words: 246
Extracted words: 226
Correct words: 196
Precision: 0.87
Recall: 0.80
F1 Score: 0.83
Execution Time: 25.49 seconds

Processed 11.jpeg:
Reference words: 193
Extracted words: 173
Correct words: 151
Precision: 0.87
Recall: 0.78
F1 Score: 0.83
Execution Time: 24.55 seconds

Processed 10.jpeg:
Reference words: 223
Extracted words: 209
Correct words: 187
Precision: 0.89
Recall: 0.84
F1 Score: 0.87
Execution Time: 24.94 seconds

Processed 2.jpeg:
Reference words: 385
Extracted words: 342
Correct words: 288
Precision: 0.84
Recall: 0.75
F1 Score: 0.79
Execution Time: 37.18 seconds

Processed 13.jpeg:
Reference words: 193
Extracted words: 176
Correct words

**EasyOCR**

In [None]:
!pip install easyocr

Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl.metadata (5.3 kB)
Downloading easyocr-1.7.2-py3-none-any.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_bidi-0.6.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (286 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.8/286.8 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collect

In [None]:
import os
import time
import easyocr

# Create the EasyOCR reader once with the 'en' and 'fr' language codes;
# extract_text_from_image() below reuses this single instance for every image.
reader = easyocr.Reader(['en', 'fr'])

def extract_text_from_image(image_path):
    """OCR one image with the module-level EasyOCR reader and return its text.

    Every item returned by ``reader.readtext`` contributes its element at
    index 1; those strings are joined with single spaces in result order.
    """
    detections = reader.readtext(image_path)
    texts = []
    for detection in detections:
        texts.append(detection[1])
    return ' '.join(texts)

def read_text_file(file_path):
    """Load ``file_path`` as UTF-8 text and return everything in it as one string."""
    with open(file_path, 'r', encoding='utf-8') as source:
        text = source.read()
    return text

def calculate_metrics(extracted_text, reference_text):
    """Compute word-level precision, recall and F1 of OCR output against a reference.

    Both texts are split on whitespace and compared exactly (case-sensitive).
    The number of correct words is the multiset overlap of the two word lists,
    i.e. each word is credited min(count in extracted, count in reference)
    times. That keeps ``correct_count`` no larger than either word count, so
    precision and recall cannot exceed 1.0.

    Args:
        extracted_text: Text produced by the OCR engine.
        reference_text: Ground-truth text.

    Returns:
        dict with keys 'precision', 'recall', 'f1_score', 'reference_count',
        'extracted_count' and 'correct_count'.
    """
    from collections import Counter  # function-scope import: not imported by this cell

    extracted_words = extracted_text.split()
    reference_words = reference_text.split()

    # Counter & Counter keeps the minimum count per word (multiset intersection).
    overlap = Counter(extracted_words) & Counter(reference_words)
    correct_words = sum(overlap.values())

    # Guard the divisions so empty texts yield 0.0 instead of raising.
    precision = correct_words / len(extracted_words) if extracted_words else 0.0
    recall = correct_words / len(reference_words) if reference_words else 0.0
    f1_score = calculate_f1(precision, recall)

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'reference_count': len(reference_words),
        'extracted_count': len(extracted_words),
        'correct_count': correct_words
    }

def calculate_f1(precision, recall):
    """Return the harmonic mean of precision and recall, or 0.0 when both are zero."""
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)

# Directories containing the images and text files
image_dir = r'/content/images_analyse'
text_dir = r'/content/ground_truth'

# Running totals; divided by num_files after the loop to get per-file averages
total_precision = 0.0
total_recall = 0.0
total_f1 = 0.0
num_files = 0
total_execution_time = 0.0

# Supported image extensions (matched case-insensitively below)
image_extensions = ('.jpg', '.jpeg', '.png')

# Store results for each file
results = []

# Store total word counts
total_reference_words = 0
total_extracted_words = 0
total_correct_words = 0

# os.listdir() returns names in arbitrary order; sort them so the per-file
# report (and best/worst tie-breaking) is the same on every run.
for image_filename in sorted(os.listdir(image_dir)):
    # Check if file is a supported image type
    if image_filename.lower().endswith(image_extensions):
        # The ground-truth file shares the image's base name, with a .txt extension
        base_filename = os.path.splitext(image_filename)[0]

        # Construct the full paths
        image_path = os.path.join(image_dir, image_filename)
        text_file_path = os.path.join(text_dir, f"{base_filename}.txt")

        # Only images with a matching ground-truth file are evaluated
        if os.path.exists(text_file_path):
            try:
                print(f"\nProcessing {image_filename}...")
                start_time = time.time()

                # Extract text from the image
                extracted_text = extract_text_from_image(image_path)

                # Read text from the pre-existing text file
                pre_existing_text = read_text_file(text_file_path)

                # Calculate metrics
                metrics = calculate_metrics(extracted_text, pre_existing_text)
                precision = metrics['precision']
                recall = metrics['recall']
                f1_score = metrics['f1_score']

                # Update totals
                total_precision += precision
                total_recall += recall
                total_f1 += f1_score
                total_reference_words += metrics['reference_count']
                total_extracted_words += metrics['extracted_count']
                total_correct_words += metrics['correct_count']
                num_files += 1

                # Elapsed time covers OCR, reading the reference and scoring
                execution_time = time.time() - start_time
                total_execution_time += execution_time

                # Store results
                results.append({
                    'filename': image_filename,
                    'precision': precision,
                    'recall': recall,
                    'f1_score': f1_score,
                    'execution_time': execution_time,
                    'reference_count': metrics['reference_count'],
                    'extracted_count': metrics['extracted_count'],
                    'correct_count': metrics['correct_count']
                })

                # Print individual file results
                print(f"Results for {image_filename}:")
                print(f"Reference words: {metrics['reference_count']}")
                print(f"Extracted words: {metrics['extracted_count']}")
                print(f"Correct words: {metrics['correct_count']}")
                print(f"Precision: {precision:.2f}")
                print(f"Recall: {recall:.2f}")
                print(f"F1 Score: {f1_score:.2f}")
                print(f"Execution Time: {execution_time:.2f} seconds")

                # Print extracted text for verification; the ellipsis is added
                # only when the text was actually cut at 100 characters.
                print("\nExtracted text (first 100 characters):")
                print(extracted_text[:100] + "..." if len(extracted_text) > 100 else extracted_text)

            except Exception as e:
                # A failing file is reported and skipped; it does not count towards the averages
                print(f"Error processing {image_filename}: {str(e)}")
        else:
            print(f"Text file not found for image: {image_filename}")

# Calculate and output the averages (unweighted mean over processed files)
if num_files > 0:
    average_precision = total_precision / num_files
    average_recall = total_recall / num_files
    average_f1 = total_f1 / num_files
    average_execution_time = sum(r['execution_time'] for r in results) / num_files

    print("\n" + "="*50)
    print("SUMMARY STATISTICS:")
    print("="*50)
    print(f"Number of files processed: {num_files}")
    print("\nTotal Word Counts:")
    print(f"Total Reference Words: {total_reference_words}")
    print(f"Total Extracted Words: {total_extracted_words}")
    print(f"Total Correct Words: {total_correct_words}")
    print(f"\nAverage Precision: {average_precision:.2f}")
    print(f"Average Recall: {average_recall:.2f}")
    print(f"Average F1 Score: {average_f1:.2f}")
    print(f"Average Execution Time: {average_execution_time:.2f} seconds per file")

    # Find best and worst performing files by F1 score
    best_f1 = max(results, key=lambda x: x['f1_score'])
    worst_f1 = min(results, key=lambda x: x['f1_score'])

    print("\nBest performing file:")
    print(f"Filename: {best_f1['filename']}")
    print(f"F1 Score: {best_f1['f1_score']:.2f}")
    print(f"Reference words: {best_f1['reference_count']}")
    print(f"Extracted words: {best_f1['extracted_count']}")
    print(f"Correct words: {best_f1['correct_count']}")

    print("\nWorst performing file:")
    print(f"Filename: {worst_f1['filename']}")
    print(f"F1 Score: {worst_f1['f1_score']:.2f}")
    print(f"Reference words: {worst_f1['reference_count']}")
    print(f"Extracted words: {worst_f1['extracted_count']}")
    print(f"Correct words: {worst_f1['correct_count']}")
else:
    print("No matching text files found for any images.")




Processing 8.jpeg...
Results for 8.jpeg:
Reference words: 119
Extracted words: 106
Correct words: 71
Precision: 0.67
Recall: 0.60
F1 Score: 0.63
Execution Time: 39.76 seconds

Extracted text (first 100 characters):
BIOCHIMIE SANGUINE analyse Resultat Unité Valeurs deréferense Résurarantérieyr 4 Sodium 137 mmoVl 13...

Processing 17.jpeg...
Results for 17.jpeg:
Reference words: 149
Extracted words: 138
Correct words: 111
Precision: 0.80
Recall: 0.74
F1 Score: 0.77
Execution Time: 34.87 seconds

Extracted text (first 100 characters):
BIOCHIMIE SANGUINE (suite) Analyse Resultat Unlté Yaleurs_de_référence Besullatantarieur O DFG estim...

Processing 7.jpeg...
Results for 7.jpeg:
Reference words: 246
Extracted words: 223
Correct words: 197
Precision: 0.88
Recall: 0.80
F1 Score: 0.84
Execution Time: 59.28 seconds

Extracted text (first 100 characters):
BIOCHIMIE SANGUINE Analyse Resultat Unite Valeurs_de_référence Résultat antérieur du: CRP 1,6 mg/l (...

Processing 11.jpeg...
Results for 1

**PaddleOCR**

In [None]:
!pip install paddlepaddle


Collecting paddlepaddle
  Downloading paddlepaddle-2.6.2-cp310-cp310-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting astor (from paddlepaddle)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting opt-einsum==3.3.0 (from paddlepaddle)
  Downloading opt_einsum-3.3.0-py3-none-any.whl.metadata (6.5 kB)
Downloading paddlepaddle-2.6.2-cp310-cp310-manylinux1_x86_64.whl (126.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading astor-0.8.1-py2.py3-none-any.whl (27 kB)
Installing collected packages: opt-einsum, astor, paddlepaddle
  Attempting uninstall: opt-einsum
    Found existing installation: opt_einsum 3.4.0
    Uninstalling opt_einsum-3.4.0:
      Successfully uninstalled opt_einsum-3.4.0
Successful

In [None]:
!pip install paddleocr

Collecting paddleocr
  Downloading paddleocr-2.9.1-py3-none-any.whl.metadata (8.5 kB)
Collecting pyclipper (from paddleocr)
  Downloading pyclipper-1.3.0.post6-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (9.0 kB)
Collecting lmdb (from paddleocr)
  Downloading lmdb-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting rapidfuzz (from paddleocr)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting python-docx (from paddleocr)
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting fire>=0.3.0 (from paddleocr)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting albumentations==1.4.10 (from paddleocr)
  Downloading albumentations-1.4.10-py3-none-any.whl.metadata (38 

In [None]:
import os
import time
from paddleocr import PaddleOCR

# Create the PaddleOCR engine once: lang='french', angle classifier enabled
# (use_angle_cls=True), GPU disabled (use_gpu=False). Only the 'french'
# language setting is passed; no separate English configuration is requested.
print("Initializing PaddleOCR...")
ocr = PaddleOCR(use_angle_cls=True, lang='french', use_gpu=False)
print("PaddleOCR initialized.")

def extract_text_from_image(image_path):
    """OCR one image with the module-level PaddleOCR engine and return its text.

    ``ocr.ocr`` returns one entry per page/image. An entry is expected to be a
    list of detections whose text is read from ``detection[1][0]``, but it can
    be ``None`` when nothing was detected on that page (per the PaddleOCR 2.x
    result format -- confirm against the installed version). ``None`` entries
    are skipped so they no longer raise inside the comprehension and discard
    the text of every other page.

    Returns:
        The recognised strings joined by single spaces, or "" when nothing was
        recognised or the OCR call raised (the error is printed).
    """
    try:
        result = ocr.ocr(image_path, cls=True)
        if result is None or len(result) == 0:
            return ""

        page_texts = []
        for page in result:
            if page is None:
                # No text detected on this page: nothing to add.
                continue
            page_texts.append(' '.join([word_info[1][0] for word_info in page]))
        return ' '.join(page_texts)
    except Exception as e:
        # Best-effort: report the failure and score the file as empty output.
        print(f"Error in OCR processing: {str(e)}")
        return ""

def read_text_file(file_path):
    """Read a text file as UTF-8, retrying as Latin-1 if UTF-8 decoding fails.

    Errors other than ``UnicodeDecodeError`` on the first attempt propagate.
    Any exception during the Latin-1 retry is printed and "" is returned.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as utf8_file:
            return utf8_file.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: fall through to the Latin-1 attempt below.
        pass

    try:
        with open(file_path, 'r', encoding='latin-1') as latin1_file:
            return latin1_file.read()
    except Exception as e:
        print(f"Error reading file: {str(e)}")
        return ""

def calculate_metrics(extracted_text, reference_text):
    """Compute case-insensitive word-level precision, recall and F1.

    Both texts are lower-cased and split on whitespace. The number of correct
    words is the multiset overlap of the two word lists (each word credited
    min(count in extracted, count in reference) times), so the numerator is
    measured in the same unit -- word occurrences -- as the denominators.

    Args:
        extracted_text: Text produced by the OCR engine.
        reference_text: Ground-truth text.

    Returns:
        dict with keys 'precision', 'recall', 'f1_score', 'reference_count',
        'extracted_count' and 'correct_count'.
    """
    from collections import Counter  # function-scope import: not imported by this cell

    extracted_words = extracted_text.lower().split()
    reference_words = reference_text.lower().split()

    # Counter & Counter keeps the minimum count per word (multiset intersection).
    overlap = Counter(extracted_words) & Counter(reference_words)
    correct_words = sum(overlap.values())

    # Guard the divisions so empty texts yield 0.0 instead of raising.
    precision = correct_words / len(extracted_words) if extracted_words else 0.0
    recall = correct_words / len(reference_words) if reference_words else 0.0
    f1_score = calculate_f1(precision, recall)

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'reference_count': len(reference_words),
        'extracted_count': len(extracted_words),
        'correct_count': correct_words
    }

def calculate_f1(precision, recall):
    """Return the harmonic mean of precision and recall, or 0.0 when both are zero."""
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)

# Directories containing the images and text files
image_dir = r'/content/images_analyse'
text_dir = r'/content/ground_truth'

# Running totals; divided by num_files after the loop to get per-file averages
total_precision = 0.0
total_recall = 0.0
total_f1 = 0.0
num_files = 0
total_execution_time = 0.0

# Supported image extensions (matched case-insensitively below)
image_extensions = ('.jpg', '.jpeg', '.png')

# Store results for each file
results = []

# Store total word counts
total_reference_words = 0
total_extracted_words = 0
total_correct_words = 0

print("\nStarting OCR evaluation...")
print("="*50)

# os.listdir() returns names in arbitrary order; sort them so the per-file
# report (and best/worst tie-breaking) is the same on every run.
for image_filename in sorted(os.listdir(image_dir)):
    if image_filename.lower().endswith(image_extensions):
        # The ground-truth file shares the image's base name, with a .txt extension
        base_filename = os.path.splitext(image_filename)[0]
        image_path = os.path.join(image_dir, image_filename)
        text_file_path = os.path.join(text_dir, f"{base_filename}.txt")

        # Only images with a matching ground-truth file are evaluated
        if os.path.exists(text_file_path):
            try:
                print(f"\nProcessing {image_filename}...")
                start_time = time.time()

                extracted_text = extract_text_from_image(image_path)
                pre_existing_text = read_text_file(text_file_path)

                # Calculate metrics
                metrics = calculate_metrics(extracted_text, pre_existing_text)

                # Update totals
                total_precision += metrics['precision']
                total_recall += metrics['recall']
                total_f1 += metrics['f1_score']
                total_reference_words += metrics['reference_count']
                total_extracted_words += metrics['extracted_count']
                total_correct_words += metrics['correct_count']
                num_files += 1

                # Elapsed time covers OCR, reading the reference and scoring
                execution_time = time.time() - start_time
                total_execution_time += execution_time

                # Store results
                results.append({
                    'filename': image_filename,
                    'metrics': metrics,
                    'execution_time': execution_time
                })

                # Print individual file results
                print(f"Results for {image_filename}:")
                print(f"Reference words: {metrics['reference_count']}")
                print(f"Extracted words: {metrics['extracted_count']}")
                print(f"Correct words: {metrics['correct_count']}")
                print(f"Precision: {metrics['precision']:.2f}")
                print(f"Recall: {metrics['recall']:.2f}")
                print(f"F1 Score: {metrics['f1_score']:.2f}")
                print(f"Execution Time: {execution_time:.2f} seconds")

                # The ellipsis is appended only when the text exceeds 100 characters
                print("\nExtracted text sample (first 100 characters):")
                print(extracted_text[:100] + "..." if len(extracted_text) > 100 else extracted_text)

            except Exception as e:
                # A failing file is reported and skipped; it does not count towards the averages
                print(f"Error processing {image_filename}: {str(e)}")
        else:
            print(f"Text file not found for image: {image_filename}")

# Calculate and output the averages (unweighted mean over processed files)
if num_files > 0:
    average_precision = total_precision / num_files
    average_recall = total_recall / num_files
    average_f1 = total_f1 / num_files
    average_execution_time = sum(r['execution_time'] for r in results) / num_files

    print("\n" + "="*50)
    print("SUMMARY STATISTICS:")
    print("="*50)
    print(f"Number of files processed: {num_files}")

    print("\nTotal Word Counts:")
    print(f"Total Reference Words: {total_reference_words}")
    print(f"Total Extracted Words: {total_extracted_words}")
    print(f"Total Correct Words: {total_correct_words}")

    print(f"\nAverage Metrics:")
    print(f"Average Precision: {average_precision:.2f}")
    print(f"Average Recall: {average_recall:.2f}")
    print(f"Average F1 Score: {average_f1:.2f}")
    print(f"Average Execution Time: {average_execution_time:.2f} seconds per file")

    # Find best and worst performing files by F1 score
    best_f1 = max(results, key=lambda x: x['metrics']['f1_score'])
    worst_f1 = min(results, key=lambda x: x['metrics']['f1_score'])

    print("\nBest performing file:")
    print(f"Filename: {best_f1['filename']}")
    print(f"F1 Score: {best_f1['metrics']['f1_score']:.2f}")
    print(f"Reference words: {best_f1['metrics']['reference_count']}")
    print(f"Extracted words: {best_f1['metrics']['extracted_count']}")
    print(f"Correct words: {best_f1['metrics']['correct_count']}")

    print("\nWorst performing file:")
    print(f"Filename: {worst_f1['filename']}")
    print(f"F1 Score: {worst_f1['metrics']['f1_score']:.2f}")
    print(f"Reference words: {worst_f1['metrics']['reference_count']}")
    print(f"Extracted words: {worst_f1['metrics']['extracted_count']}")
    print(f"Correct words: {worst_f1['metrics']['correct_count']}")
else:
    print("No matching text files found for any images.")

Initializing PaddleOCR...
download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 3910/3910 [00:00<00:00, 9571.34it/s]


download https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_infer.tar to /root/.paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer/latin_PP-OCRv3_rec_infer.tar


100%|██████████| 9930/9930 [00:00<00:00, 18182.14it/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2138/2138 [00:00<00:00, 5574.27it/s]

[2024/10/31 05:58:04] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25,




PaddleOCR initialized.

Starting OCR evaluation...

Processing 8.jpeg...
[2024/10/31 05:58:06] ppocr DEBUG: dt_boxes num : 53, elapsed : 0.4596884250640869
[2024/10/31 05:58:06] ppocr DEBUG: cls num  : 53, elapsed : 0.18625402450561523
[2024/10/31 05:58:09] ppocr DEBUG: rec_res num  : 53, elapsed : 2.6016993522644043
Results for 8.jpeg:
Reference words: 119
Extracted words: 65
Correct words: 31
Precision: 0.48
Recall: 0.26
F1 Score: 0.34
Execution Time: 3.42 seconds

Extracted text sample (first 100 characters):
BIOCHIMIE SANGUINE Anaiyse Résultat Unite Va eference du: Sodium 137 mmol/l 135-145 133 26/10/24 Ali...

Processing 17.jpeg...
[2024/10/31 05:58:09] ppocr DEBUG: dt_boxes num : 50, elapsed : 0.19984173774719238
[2024/10/31 05:58:10] ppocr DEBUG: cls num  : 50, elapsed : 0.1641373634338379
[2024/10/31 05:58:13] ppocr DEBUG: rec_res num  : 50, elapsed : 2.9998340606689453
Results for 17.jpeg:
Reference words: 149
Extracted words: 103
Correct words: 62
Precision: 0.60
Recall: 0.42

KeyboardInterrupt: 

In [None]:
import os
import time
from paddleocr import PaddleOCR

# Create the PaddleOCR engine once: lang='french', angle classifier enabled
# (use_angle_cls=True), GPU disabled (use_gpu=False). Only the 'french'
# language setting is passed; no separate English configuration is requested.
print("Initializing PaddleOCR...")
ocr = PaddleOCR(use_angle_cls=True, lang='french', use_gpu=False)
print("PaddleOCR initialized.")

def extract_text_from_image(image_path):
    """OCR one image with the module-level PaddleOCR engine and return its text.

    ``ocr.ocr`` returns one entry per page/image. An entry is expected to be a
    list of detections whose text is read from ``detection[1][0]``, but it can
    be ``None`` when nothing was detected on that page (per the PaddleOCR 2.x
    result format -- confirm against the installed version). ``None`` entries
    are skipped so they no longer raise inside the comprehension and discard
    the text of every other page.

    Returns:
        The recognised strings joined by single spaces, or "" when nothing was
        recognised or the OCR call raised (the error is printed).
    """
    try:
        result = ocr.ocr(image_path, cls=True)
        if result is None or len(result) == 0:
            return ""

        page_texts = []
        for page in result:
            if page is None:
                # No text detected on this page: nothing to add.
                continue
            page_texts.append(' '.join([word_info[1][0] for word_info in page]))
        return ' '.join(page_texts)
    except Exception as e:
        # Best-effort: report the failure and score the file as empty output.
        print(f"Error in OCR processing: {str(e)}")
        return ""

def read_text_file(file_path):
    """Read a text file as UTF-8, retrying as Latin-1 if UTF-8 decoding fails.

    Errors other than ``UnicodeDecodeError`` on the first attempt propagate.
    Any exception during the Latin-1 retry is printed and "" is returned.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as utf8_file:
            return utf8_file.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: fall through to the Latin-1 attempt below.
        pass

    try:
        with open(file_path, 'r', encoding='latin-1') as latin1_file:
            return latin1_file.read()
    except Exception as e:
        print(f"Error reading file: {str(e)}")
        return ""

def calculate_metrics(extracted_text, reference_text):
    """Compute case-insensitive word-level precision, recall and F1.

    Both texts are lower-cased and split on whitespace. The number of correct
    words is the multiset overlap of the two word lists (each word credited
    min(count in extracted, count in reference) times), so the numerator is
    measured in the same unit -- word occurrences -- as the denominators.

    Args:
        extracted_text: Text produced by the OCR engine.
        reference_text: Ground-truth text.

    Returns:
        dict with keys 'precision', 'recall', 'f1_score', 'reference_count',
        'extracted_count' and 'correct_count'.
    """
    from collections import Counter  # function-scope import: not imported by this cell

    extracted_words = extracted_text.lower().split()
    reference_words = reference_text.lower().split()

    # Counter & Counter keeps the minimum count per word (multiset intersection).
    overlap = Counter(extracted_words) & Counter(reference_words)
    correct_words = sum(overlap.values())

    # Guard the divisions so empty texts yield 0.0 instead of raising.
    precision = correct_words / len(extracted_words) if extracted_words else 0.0
    recall = correct_words / len(reference_words) if reference_words else 0.0
    f1_score = calculate_f1(precision, recall)

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'reference_count': len(reference_words),
        'extracted_count': len(extracted_words),
        'correct_count': correct_words
    }

def calculate_f1(precision, recall):
    """Return the harmonic mean of precision and recall, or 0.0 when both are zero."""
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)

def print_text_comparison(filename, reference_text, extracted_text):
    """Print the ground-truth and OCR texts for one file, followed by their word counts.

    Word counts are the number of whitespace-separated tokens in each text.
    """
    banner = "="*50
    rule = "-"*30
    reference_count = len(reference_text.split())
    extracted_count = len(extracted_text.split())

    report_lines = (
        "\n" + banner,
        f"DETAILED TEXT COMPARISON FOR: {filename}",
        banner,
        "\nREFERENCE TEXT (Ground Truth):",
        rule,
        reference_text,
        "\nEXTRACTED TEXT (OCR Result):",
        rule,
        extracted_text,
        "\nWORD COUNT COMPARISON:",
        f"Reference text word count: {reference_count}",
        f"Extracted text word count: {extracted_count}",
    )
    # One print call per entry, exactly as many as the report has lines.
    for report_line in report_lines:
        print(report_line)

# Directories containing the images and text files
image_dir = r'/content/images_analyse'
text_dir = r'/content/ground_truth'

# Running totals; divided by num_files after the loop to get per-file averages
total_precision = 0.0
total_recall = 0.0
total_f1 = 0.0
num_files = 0
total_execution_time = 0.0

# Supported image extensions (matched case-insensitively below)
image_extensions = ('.jpg', '.jpeg', '.png')

# Store results for each file
results = []

# Store total word counts
total_reference_words = 0
total_extracted_words = 0
total_correct_words = 0

print("\nStarting OCR evaluation...")
print("="*50)

# os.listdir() returns names in arbitrary order; sort them so the per-file
# report (and best/worst tie-breaking) is the same on every run.
for image_filename in sorted(os.listdir(image_dir)):
    if image_filename.lower().endswith(image_extensions):
        # The ground-truth file shares the image's base name, with a .txt extension
        base_filename = os.path.splitext(image_filename)[0]
        image_path = os.path.join(image_dir, image_filename)
        text_file_path = os.path.join(text_dir, f"{base_filename}.txt")

        # Only images with a matching ground-truth file are evaluated
        if os.path.exists(text_file_path):
            try:
                print(f"\nProcessing {image_filename}...")
                start_time = time.time()

                extracted_text = extract_text_from_image(image_path)
                reference_text = read_text_file(text_file_path)

                # Print full text comparison
                print_text_comparison(image_filename, reference_text, extracted_text)

                # Calculate metrics
                metrics = calculate_metrics(extracted_text, reference_text)

                # Update totals
                total_precision += metrics['precision']
                total_recall += metrics['recall']
                total_f1 += metrics['f1_score']
                total_reference_words += metrics['reference_count']
                total_extracted_words += metrics['extracted_count']
                total_correct_words += metrics['correct_count']
                num_files += 1

                # Elapsed time covers OCR, reading the reference, printing and scoring
                execution_time = time.time() - start_time
                total_execution_time += execution_time

                # Store results
                results.append({
                    'filename': image_filename,
                    'metrics': metrics,
                    'execution_time': execution_time
                })

                # Print individual file metrics
                print(f"\nMETRICS FOR: {image_filename}")
                print(f"Reference words: {metrics['reference_count']}")
                print(f"Extracted words: {metrics['extracted_count']}")
                print(f"Correct words: {metrics['correct_count']}")
                print(f"Precision: {metrics['precision']:.2f}")
                print(f"Recall: {metrics['recall']:.2f}")
                print(f"F1 Score: {metrics['f1_score']:.2f}")
                print(f"Execution Time: {execution_time:.2f} seconds")

            except Exception as e:
                # A failing file is reported and skipped; it does not count towards the averages
                print(f"Error processing {image_filename}: {str(e)}")
        else:
            print(f"Text file not found for image: {image_filename}")

# Calculate and output the averages (unweighted mean over processed files)
if num_files > 0:
    average_precision = total_precision / num_files
    average_recall = total_recall / num_files
    average_f1 = total_f1 / num_files
    average_execution_time = sum(r['execution_time'] for r in results) / num_files

    print("\n" + "="*50)
    print("FINAL SUMMARY STATISTICS:")
    print("="*50)
    print(f"Number of files processed: {num_files}")

    print("\nTotal Word Counts:")
    print(f"Total Reference Words: {total_reference_words}")
    print(f"Total Extracted Words: {total_extracted_words}")
    print(f"Total Correct Words: {total_correct_words}")

    print(f"\nAverage Metrics:")
    print(f"Average Precision: {average_precision:.2f}")
    print(f"Average Recall: {average_recall:.2f}")
    print(f"Average F1 Score: {average_f1:.2f}")
    print(f"Average Execution Time: {average_execution_time:.2f} seconds per file")

    # Find best and worst performing files by F1 score
    best_f1 = max(results, key=lambda x: x['metrics']['f1_score'])
    worst_f1 = min(results, key=lambda x: x['metrics']['f1_score'])

    print("\nBest performing file:")
    print(f"Filename: {best_f1['filename']}")
    print(f"F1 Score: {best_f1['metrics']['f1_score']:.2f}")
    print(f"Reference words: {best_f1['metrics']['reference_count']}")
    print(f"Extracted words: {best_f1['metrics']['extracted_count']}")
    print(f"Correct words: {best_f1['metrics']['correct_count']}")

    print("\nWorst performing file:")
    print(f"Filename: {worst_f1['filename']}")
    print(f"F1 Score: {worst_f1['metrics']['f1_score']:.2f}")
    print(f"Reference words: {worst_f1['metrics']['reference_count']}")
    print(f"Extracted words: {worst_f1['metrics']['extracted_count']}")
    print(f"Correct words: {worst_f1['metrics']['correct_count']}")
else:
    print("No matching text files found for any images.")

Initializing PaddleOCR...
[2024/10/31 06:48:22] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_

**Prétraitement**

In [None]:
import os
import tempfile
import time
from collections import Counter

import cv2
import numpy as np
from paddleocr import PaddleOCR

def preprocess_image(image):
    """Clean up an image before OCR: grayscale, denoise, CLAHE, Otsu threshold, upscale, closing.

    Args:
        image: Image as a NumPy array. A 3-dimensional array is converted with
            ``cv2.COLOR_BGR2GRAY``; any other array is copied and used as-is.

    Returns:
        The single-channel image produced by the final morphological closing.

    Raises:
        ValueError: If ``image`` is None or contains no pixels. ``cv2.imread``
            returns None for a missing/unreadable file, so this turns an opaque
            ``AttributeError`` on ``.shape`` into an explicit message.
    """
    if image is None or image.size == 0:
        raise ValueError("preprocess_image received an empty image (missing or unreadable file?)")

    if len(image.shape) == 3:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        gray = image.copy()

    # Lighter noise reduction
    denoised = cv2.fastNlMeansDenoising(gray, h=10)

    # CLAHE with gentle parameters
    clahe = cv2.createCLAHE(clipLimit=1.5, tileGridSize=(4,4))
    enhanced = clahe.apply(denoised)

    # Global thresholding (threshold value chosen by Otsu's method)
    _, binary = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Conservative resizing: only upscale images shorter than min_height,
    # keeping the aspect ratio
    min_height = 1200
    if image.shape[0] < min_height:
        ratio = min_height / image.shape[0]
        new_width = int(image.shape[1] * ratio)
        resized = cv2.resize(binary, (new_width, min_height),
                           interpolation=cv2.INTER_LINEAR)
    else:
        resized = binary

    # Edge improvement: morphological closing with a 2x2 kernel
    kernel = np.ones((2,2),np.uint8)
    final = cv2.morphologyEx(resized, cv2.MORPH_CLOSE, kernel)

    return final

# Keyword arguments handed unchanged to the PaddleOCR constructor, kept in one
# mapping so the tuned values are easy to find and compare between runs.
paddle_ocr_settings = {
    'use_angle_cls': True,
    'lang': 'french',
    'det_db_thresh': 0.25,
    'det_db_box_thresh': 0.5,
    'det_limit_side_len': 1200,
    'rec_batch_num': 1,
    'use_gpu': False,
    'enable_mkldnn': True,
    'cls_thresh': 0.8,
    'rec_image_shape': '3, 48, 320',
}

print("Initializing PaddleOCR...")
ocr = PaddleOCR(**paddle_ocr_settings)
print("PaddleOCR initialized.")

def extract_text_from_image(image_path):
    """Run the preprocessing + OCR pipeline on one image and return its text.

    The image is read with ``cv2.imread``, passed through ``preprocess_image``,
    written to a temporary PNG and given to the module-level ``ocr`` engine.

    Args:
        image_path: Path of the image file to process.

    Returns:
        All recognised text fragments joined with single spaces, or "" when
        nothing was recognised or any step failed (the error is printed, not
        raised, so a batch run can continue).
    """
    temp_path = None
    try:
        # Read and preprocess the image
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports failure by returning None instead of raising
            raise ValueError(f"Could not read image: {image_path}")
        preprocessed = preprocess_image(image)

        # Save the preprocessed image to a unique temporary file so that
        # parallel runs or a read-only working directory cannot interfere
        fd, temp_path = tempfile.mkstemp(suffix=".png")
        os.close(fd)
        if not cv2.imwrite(temp_path, preprocessed):
            raise OSError(f"Could not write preprocessed image to {temp_path}")

        # OCR on the preprocessed image
        result = ocr.ocr(temp_path, cls=True)

        if not result:
            return ""

        extracted_text = []
        for page in result:
            # A page entry can be None/empty when no text was detected;
            # skip it instead of failing while iterating over it
            if not page:
                continue
            # Each entry carries its recognised text at index [1][0]
            extracted_text.append(' '.join(word_info[1][0] for word_info in page))
        return ' '.join(extracted_text)
    except Exception as e:
        print(f"Error in OCR processing: {str(e)}")
        return ""
    finally:
        # Remove the temporary file on success and on failure alike
        if temp_path is not None and os.path.exists(temp_path):
            os.remove(temp_path)

def read_text_file(file_path):
    """Return the contents of a text file, trying UTF-8 first and Latin-1 second.

    The first attempt uses ``utf-8-sig`` so that a leading byte-order mark is
    dropped instead of being glued to the first word (which would make that
    word never match during evaluation). Files without a BOM decode exactly as
    with plain ``utf-8``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded contents, or "" if the Latin-1 retry itself fails (the
        error is printed).

    Raises:
        OSError: If the file cannot be opened on the first attempt (for example
            it does not exist); only decoding errors trigger the fallback.
    """
    try:
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            return file.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: retry with Latin-1, which accepts any byte sequence
        try:
            with open(file_path, 'r', encoding='latin-1') as file:
                return file.read()
        except Exception as e:
            print(f"Error reading file: {str(e)}")
            return ""

def calculate_metrics(extracted_text, reference_text):
    """Compute case-insensitive bag-of-words precision, recall and F1.

    Both texts are lower-cased and split on whitespace. A word occurrence in the
    extracted text counts as correct when it can be paired with an occurrence of
    the same word in the reference text (multiset intersection). Numerator and
    denominators are therefore all token counts, so an exact match scores 1.0
    even when words repeat.

    Args:
        extracted_text: Text produced by OCR.
        reference_text: Ground-truth text.

    Returns:
        dict with keys 'precision', 'recall', 'f1_score', 'reference_count',
        'extracted_count' and 'correct_count'. Precision/recall are 0.0 when the
        corresponding text has no words.
    """
    extracted_words = extracted_text.lower().split()
    reference_words = reference_text.lower().split()

    # Counter & Counter keeps, per word, the smaller of the two occurrence
    # counts: each reference occurrence is matched at most once.
    matched_words = Counter(extracted_words) & Counter(reference_words)
    correct_words = sum(matched_words.values())

    precision = correct_words / len(extracted_words) if extracted_words else 0.0
    recall = correct_words / len(reference_words) if reference_words else 0.0
    f1_score = calculate_f1(precision, recall)

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'reference_count': len(reference_words),
        'extracted_count': len(extracted_words),
        'correct_count': correct_words
    }

def calculate_f1(precision, recall):
    """Return the harmonic mean of precision and recall (0.0 when their sum is zero)."""
    denominator = precision + recall
    return 2 * (precision * recall) / denominator if denominator != 0 else 0.0

def print_text_comparison(filename, reference_text, extracted_text):
    """Print the ground-truth text, the OCR text and their whitespace-split word counts.

    Args:
        filename: Name shown in the report banner.
        reference_text: Ground-truth text.
        extracted_text: Text returned by OCR.
    """
    banner_rule = "="*50
    section_rule = "-"*30

    # Banner
    print("\n" + banner_rule)
    print(f"DETAILED TEXT COMPARISON FOR: {filename}")
    print(banner_rule)

    # The two texts, each under its own heading
    sections = (
        ("\nREFERENCE TEXT (Ground Truth):", reference_text),
        ("\nEXTRACTED TEXT (OCR Result):", extracted_text),
    )
    for heading, body in sections:
        print(heading)
        print(section_rule)
        print(body)

    # Word counts
    print("\nWORD COUNT COMPARISON:")
    reference_word_total = len(reference_text.split())
    print(f"Reference text word count: {reference_word_total}")
    extracted_word_total = len(extracted_text.split())
    print(f"Extracted text word count: {extracted_word_total}")

# Evaluation driver: for every image in image_dir that has a same-named .txt
# file in text_dir, run OCR, compare with the ground truth, print per-file
# metrics, then print averages and the best/worst file by F1 score.

# Directories containing the images and text files
image_dir = r'/content/images_analyse'
text_dir = r'/content/ground_truth'

# Variables to track metrics
total_precision = 0.0
total_recall = 0.0
total_f1 = 0.0
num_files = 0
total_execution_time = 0.0

# List of supported image extensions
image_extensions = ('.jpg', '.jpeg', '.png')

# Store results for each file
results = []

# Store total word counts
total_reference_words = 0
total_extracted_words = 0
total_correct_words = 0

print("\nStarting OCR evaluation...")
print("="*50)

# Iterate over all files in the image directory.
# os.listdir returns names in arbitrary order; sorting makes the processing
# order (and the tie-break of best/worst file) reproducible between runs.
for image_filename in sorted(os.listdir(image_dir)):
    if image_filename.lower().endswith(image_extensions):
        base_filename = os.path.splitext(image_filename)[0]
        image_path = os.path.join(image_dir, image_filename)
        text_file_path = os.path.join(text_dir, f"{base_filename}.txt")

        if os.path.exists(text_file_path):
            try:
                print(f"\nProcessing {image_filename}...")
                start_time = time.time()

                # extract_text_from_image returns "" on failure, so a failed
                # OCR run is scored as an empty extraction rather than skipped
                extracted_text = extract_text_from_image(image_path)
                reference_text = read_text_file(text_file_path)

                # Print full text comparison
                print_text_comparison(image_filename, reference_text, extracted_text)

                # Calculate metrics
                metrics = calculate_metrics(extracted_text, reference_text)

                # Update totals
                total_precision += metrics['precision']
                total_recall += metrics['recall']
                total_f1 += metrics['f1_score']
                total_reference_words += metrics['reference_count']
                total_extracted_words += metrics['extracted_count']
                total_correct_words += metrics['correct_count']
                num_files += 1

                # Wall-clock time for OCR, file reading, comparison printing
                # and metric computation of this file
                execution_time = time.time() - start_time
                total_execution_time += execution_time

                # Store results
                results.append({
                    'filename': image_filename,
                    'metrics': metrics,
                    'execution_time': execution_time
                })

                # Print individual file metrics
                print(f"\nMETRICS FOR: {image_filename}")
                print(f"Reference words: {metrics['reference_count']}")
                print(f"Extracted words: {metrics['extracted_count']}")
                print(f"Correct words: {metrics['correct_count']}")
                print(f"Precision: {metrics['precision']:.2f}")
                print(f"Recall: {metrics['recall']:.2f}")
                print(f"F1 Score: {metrics['f1_score']:.2f}")
                print(f"Execution Time: {execution_time:.2f} seconds")

            except Exception as e:
                print(f"Error processing {image_filename}: {str(e)}")
        else:
            print(f"Text file not found for image: {image_filename}")

# Calculate and output the averages (per-file means, i.e. macro averages)
if num_files > 0:
    average_precision = total_precision / num_files
    average_recall = total_recall / num_files
    average_f1 = total_f1 / num_files
    average_execution_time = total_execution_time / num_files

    print("\n" + "="*50)
    print("FINAL SUMMARY STATISTICS:")
    print("="*50)
    print(f"Number of files processed: {num_files}")

    print("\nTotal Word Counts:")
    print(f"Total Reference Words: {total_reference_words}")
    print(f"Total Extracted Words: {total_extracted_words}")
    print(f"Total Correct Words: {total_correct_words}")

    print("\nAverage Metrics:")
    print(f"Average Precision: {average_precision:.2f}")
    print(f"Average Recall: {average_recall:.2f}")
    print(f"Average F1 Score: {average_f1:.2f}")
    print(f"Average Execution Time: {average_execution_time:.2f} seconds per file")

    # Find best and worst performing files (first one wins on ties)
    best_f1 = max(results, key=lambda x: x['metrics']['f1_score'])
    worst_f1 = min(results, key=lambda x: x['metrics']['f1_score'])

    print("\nBest performing file:")
    print(f"Filename: {best_f1['filename']}")
    print(f"F1 Score: {best_f1['metrics']['f1_score']:.2f}")
    print(f"Reference words: {best_f1['metrics']['reference_count']}")
    print(f"Extracted words: {best_f1['metrics']['extracted_count']}")
    print(f"Correct words: {best_f1['metrics']['correct_count']}")

    print("\nWorst performing file:")
    print(f"Filename: {worst_f1['filename']}")
    print(f"F1 Score: {worst_f1['metrics']['f1_score']:.2f}")
    print(f"Reference words: {worst_f1['metrics']['reference_count']}")
    print(f"Extracted words: {worst_f1['metrics']['extracted_count']}")
    print(f"Correct words: {worst_f1['metrics']['correct_count']}")
else:
    print("No matching text files found for any images.")

Initializing PaddleOCR...
[2024/10/31 06:23:12] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=1200, det_limit_type='max', det_box_type='quad', det_db_thresh=0.25, det_db_box_thresh=0.5, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batc