In [1]:
def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


In [2]:
from difflib import SequenceMatcher
from collections import Counter

def calculate_accuracy_and_errors(ocr_text, corrected_text):
    ocr_words = ocr_text.split()
    corrected_words = corrected_text.split()
    ocr_chars = list(ocr_text)
    corrected_chars = list(corrected_text)

    word_matches = sum(1 for o, c in zip(ocr_words, corrected_words) if o == c)
    word_accuracy = (word_matches / len(corrected_words)) * 100 if corrected_words else 0

    char_matches = sum(1 for o, c in zip(ocr_chars, corrected_chars) if o == c)
    char_accuracy = (char_matches / len(corrected_chars)) * 100 if corrected_chars else 0

    sm = SequenceMatcher(None, ocr_text, corrected_text)
    errors = []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == 'replace':
            errors.append(('Substitution', ocr_text[i1:i2], corrected_text[j1:j2]))
        elif tag == 'delete':
            errors.append(('Deletion', ocr_text[i1:i2], ''))
        elif tag == 'insert':
            errors.append(('Insertion', '', corrected_text[j1:j2]))
    
    error_counts = Counter(err[0] for err in errors)
    total_errors = sum(error_counts.values())

    error_percentages = {error: (count / total_errors) * 100 for error, count in error_counts.items()}
    
    return word_accuracy, char_accuracy, errors, error_percentages


In [3]:
ocr_file_name = 'page_53.png-053.txt'  
corrected_file_name = 'corrected.txt'

ocr_text = read_file(ocr_file_name)
corrected_text = read_file(corrected_file_name)

word_acc, char_acc, errors, error_percentages = calculate_accuracy_and_errors(ocr_text, corrected_text)

print(f"Word Accuracy: {word_acc:.2f}%")
print(f"Character Accuracy: {char_acc:.2f}%")
print("\nError Types and Examples:")
for error_type, ocr_segment, corrected_segment in errors[:10]:  
    print(f"{error_type}: OCR='{ocr_segment}' | Corrected='{corrected_segment}'")
print("\nError Percentages:")
for error_type, percentage in error_percentages.items():
    print(f"{error_type}: {percentage:.2f}%")


Word Accuracy: 1.82%
Character Accuracy: 5.31%

Error Types and Examples:
Substitution: OCR=' 

' | Corrected='591: गाबत्याक गोरूं ; भट्टाक तारूं. ं
592. गायक मारली म्हुण वासराक मारतात वे ?......
59'
Substitution: OCR='91.
592.
593' | Corrected='. गायक विवचों वेछु ; पाड्याक बायल करचो वे'
Substitution: OCR='

595.
596.
$' | Corrected=' गायचे वासरा लागी पढ़यार तण खाता ; सूष्या
 पेट्या लागी पढुयार यू खाता. ।
595. गायचो पाड्डो दाकै वाड़ो. भ
596. गायता गायता गांवकार जाता. ...... .. ... :
5'
Insertion: OCR='' | Corrected=' गांव करचा सावकाराक हागूंक ना भाट. . .'
Substitution: OCR='
599' | Corrected=' गांवचे भायर वचनातील्याक सुंका .दर इत्याक ? ...
599. गांवचों गांव धांवताना महेंतु धांवका. ... : '
Substitution: OCR='
' | Corrected=' गांवदो चोरू पुगी जायत ; परगांववों. सावकारू
। न्हूंय. व लि दी |'
Insertion: OCR='' | Corrected=' गांवचो रायु जाल्यारीय आवसुक चेरइ. . ... .'
Substitution: OCR='
' | Corrected=' गांव भरि देवांक पांय पढ़प्रारीय घोवू नातीलें .
वेरड जाता वे ? लक'
Insertion: OCR='' | Corre