In [None]:
!pip uninstall spellchecker



^C
Collecting pyspellchecker
  Obtaining dependency information for pyspellchecker from https://files.pythonhosted.org/packages/99/8e/7c79443d302a80cfd59bc365938d51e36e7e9aa7ce8ab1d8a0ca0c8e6065/pyspellchecker-0.8.2-py3-none-any.whl.metadata
  Downloading pyspellchecker-0.8.2-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.2-py3-none-any.whl (7.1 MB)
   ---------------------------------------- 0.0/7.1 MB ? eta -:--:--
    --------------------------------------- 0.2/7.1 MB 3.1 MB/s eta 0:00:03
   - -------------------------------------- 0.2/7.1 MB 2.4 MB/s eta 0:00:03
   -- ------------------------------------- 0.4/7.1 MB 2.6 MB/s eta 0:00:03
   --- ------------------------------------ 0.6/7.1 MB 3.1 MB/s eta 0:00:03
   ------ --------------------------------- 1.2/7.1 MB 4.3 MB/s eta 0:00:02
   -------- ------------------------------- 1.5/7.1 MB 4.9 MB/s eta 0:00:02
   ---------- ----------------------------- 1.9/7.1 MB 5.4 MB/s eta 0:00:01
   ------------- -----------

In [None]:
!pip install pyspellchecker

In [None]:
import re
import math
from spellchecker import SpellChecker
from collections import Counter

def load_text_from_file(file_path):
    """Read a UTF-8 encoded text file and return its contents.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file contents as a string, with leading and trailing
        whitespace removed.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        raw_contents = handle.read()
    return raw_contents.strip()

def compute_text_entropy(text):
    """Compute the character-level Shannon entropy of ``text`` in bits.

    The text is lower-cased first, so upper- and lower-case forms of a
    letter count as the same symbol. Every character (including spaces
    and newlines) is treated as a symbol.

    Args:
        text: The string to analyse.

    Returns:
        The entropy as a non-negative float. Empty text and text made of a
        single repeated symbol both yield ``0.0``.
    """
    text = text.lower()
    total_chars = len(text)
    if total_chars == 0:
        # No symbols means no uncertainty; return a float for a consistent type.
        return 0.0

    frequency = Counter(text)
    # Sum p * log2(1/p) rather than negating sum(p * log2(p)): the negated form
    # yields -0.0 for single-symbol text, which then prints as "-0.0000".
    return sum(
        ((count / total_chars) * math.log2(total_chars / count)
         for count in frequency.values()),
        0.0,
    )

def evaluate_ocr_spellcheck(text):
    """Estimate OCR quality as the share of word tokens unknown to a French dictionary.

    Prints a one-line summary and returns the rate.

    Args:
        text: The OCR output to evaluate.

    Returns:
        Misspelled word tokens divided by total word tokens, as a float in
        ``[0, 1]``. Returns ``0.0`` when the text contains no words.
    """
    spell = SpellChecker(language="fr")  # Set language to French

    # [^\W\d_] matches any Unicode letter. An ASCII-only class ([a-zA-Z]) would
    # silently skip every word containing an accented letter (é, è, à, ç, œ, ...),
    # because the accented letter is a word character and blocks the \b boundary.
    words = re.findall(r"\b[^\W\d_]+\b", text.lower())

    # `unknown` returns the set of *distinct* unknown words, so count the tokens
    # that fall in that set to keep numerator and denominator in the same unit.
    unknown_words = spell.unknown(words)
    misspelled_count = sum(1 for word in words if word in unknown_words)

    error_rate = misspelled_count / max(1, len(words))  # Avoid division by zero
    print(f"[Spell Check - French] Error Rate: {error_rate:.2%} (Misspelled: {misspelled_count} / {len(words)})")
    return error_rate


def main(ocr_txt_path, reference_txt_path=None):
    """Evaluate one OCR output file and return its quality metrics.

    Args:
        ocr_txt_path: Path of the text file holding the OCR output.
        reference_txt_path: Accepted but not used by this function.

    Returns:
        A dict with the keys ``"entropy_score"`` and ``"spell_error_rate"``.
    """
    print("🔍 Evaluating OCR Quality...\n")

    extracted = load_text_from_file(ocr_txt_path)
    metrics = {}

    # Character-level entropy of the extracted text.
    metrics["entropy_score"] = compute_text_entropy(extracted)
    print(f"[Entropy] OCR Entropy Score: {metrics['entropy_score']:.4f}")

    # Spell-check error rate; the callee prints its own summary line.
    metrics["spell_error_rate"] = evaluate_ocr_spellcheck(extracted)

    print("\n✅ Evaluation Complete!")
    return metrics


# Evaluate the OCR output stored in extracted_text_2.txt (path is relative to the
# working directory); `results` receives the metrics dict returned by main().
ocr_txt_path = "extracted_text_2.txt"  

results = main(ocr_txt_path)


🔍 Evaluating OCR Quality...

[Entropy] OCR Entropy Score: 4.5732
[Spell Check - French] Error Rate: 2.70% (Misspelled: 1845 / 68272)

✅ Evaluation Complete!


In [None]:

# Evaluate the OCR output stored in extracted_text_12.txt.
# Note: this rebinds `ocr_txt_path` and `results` from the previous run.
ocr_txt_path = "extracted_text_12.txt"  

results = main(ocr_txt_path)


🔍 Evaluating OCR Quality...

[Entropy] OCR Entropy Score: 4.9034
[Spell Check - French] Error Rate: 8.86% (Misspelled: 7188 / 81113)

✅ Evaluation Complete!


In [None]:

# Evaluate the OCR output stored in extracted_text_12_BW.txt
# (presumably a black-and-white variant of extracted_text_12 -- confirm).
# Note: this rebinds `ocr_txt_path` and `results` from the previous run.
ocr_txt_path = "extracted_text_12_BW.txt"  

results = main(ocr_txt_path)


🔍 Evaluating OCR Quality...

[Entropy] OCR Entropy Score: 4.5612
[Spell Check - French] Error Rate: 9.18% (Misspelled: 7843 / 85436)

✅ Evaluation Complete!
