#  OCR + NER Correction System


This notebook demonstrates an advanced Natural Language Processing (NLP) pipeline that combines Optical Character Recognition (OCR) and Named Entity Recognition (NER) for processing mortgage documents. The system employs several  NLP techniques and processes.

In [47]:
# Install required packages
#!pip install -r requirements.txt

## Import Libraries

In [48]:
# Essential imports only
import logging
import re
import requests
import time
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
from collections import defaultdict, Counter
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForMaskedLM, pipeline
import spacy
import pandas as pd
from symspellpy import SymSpell, Verbosity
from difflib import SequenceMatcher
import numpy as np
import os

# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None) 
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## Load the Excel file

In [49]:
"""
Load and display mortgage offer data from an Excel file.

This code loads mortgage offer data from an Excel file and displays information
about the dataset including its shape and column names. The data contains
OCR-processed text from mortgage documents.

The Excel file should contain the following columns:
- row_id: Unique identifier for each row
- doc_id: Document identifier
- ocr_para_id: Paragraph identifier within the document
- ocr_para_type: Type of paragraph (e.g., Title, Paragraph)
- ocr_para_text: Text content from OCR processing

Returns:
    pandas.DataFrame: A DataFrame containing the loaded mortgage offer data

Raises:
    Exception: If there is an error loading the Excel file
"""
try:
    # Load the Excel file
    df = pd.read_excel("LMS_Technical_Test_DATA_FOR_CANDIDATES.xlsx")
    
    # Display basic information about the dataset
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    
except Exception as e:
    print(f"Error loading Excel file: {e}")

# Display the first 5 rows
df.head(5)

Dataset shape: (164, 5)
Columns: ['row_id', 'doc_id', 'ocr_para_id', 'ocr_para_type', 'ocr_para_text']


Unnamed: 0,row_id,doc_id,ocr_para_id,ocr_para_type,ocr_para_text
0,1,1,1,Title,Your Mortgage Offer
1,2,1,2,Paragraph,Mr Mickey Mouse and Ms Minnie M0use
2,3,1,3,Paragraph,04/03/202S
3,4,1,4,Paragraph,B00123456789-998877665544
4,5,1,5,Paragraph,Thismortgage will be secured by a first legal charge over the property below:


## Explanatory Data Analysis

In [50]:
def analyze_text_errors(df, column='ocr_para_text'):
    """
    Analyze potential OCR errors and text quality issues in the data.
    
    This function performs exploratory data analysis on text data to identify:
    - Extra whitespace
    - Missing spaces between words
    - Digit/letter substitutions
    - Special character issues
    - Case inconsistencies
    
    Args:
        df (pandas.DataFrame): Input DataFrame
        column (str): Name of text column to analyze
        
    Returns:
        dict: Dictionary containing error statistics and examples
    """
    error_stats = {
        'extra_whitespace': 0,
        'missing_spaces': 0,
        'digit_letter_mix': 0,
        'special_chars': 0,
        'case_issues': 0,
        'examples': {
            'extra_whitespace': [],
            'missing_spaces': [], 
            'digit_letter_mix': [],
            'special_chars': [],
            'case_issues': []
        }
    }
    
    for idx, text in enumerate(df[column]):
        if not isinstance(text, str):
            continue
            
        # Check for multiple consecutive spaces
        if re.search(r'\s{2,}', text):
            error_stats['extra_whitespace'] += 1
            if len(error_stats['examples']['extra_whitespace']) < 3:
                error_stats['examples']['extra_whitespace'].append(text)
        
        # Check for missing spaces between words
        if re.search(r'[a-z][A-Z]', text):
            error_stats['missing_spaces'] += 1
            if len(error_stats['examples']['missing_spaces']) < 3:
                error_stats['examples']['missing_spaces'].append(text)
                
        # Check for digit/letter substitutions (e.g. 0 for O)
        if re.search(r'[A-Za-z][0-9]|[0-9][A-Za-z]', text):
            error_stats['digit_letter_mix'] += 1
            if len(error_stats['examples']['digit_letter_mix']) < 3:
                error_stats['examples']['digit_letter_mix'].append(text)
                
        # Check for unusual special characters
        if re.search(r'[^a-zA-Z0-9\s\.,\-\'\"£$%&():]', text):
            error_stats['special_chars'] += 1
            if len(error_stats['examples']['special_chars']) < 3:
                error_stats['examples']['special_chars'].append(text)
                
        # Check for inconsistent casing
        words = text.split()
        for word in words:
            if len(word) > 1 and not (word.islower() or word.isupper() or word.istitle()):
                error_stats['case_issues'] += 1
                if len(error_stats['examples']['case_issues']) < 3:
                    error_stats['examples']['case_issues'].append(word)
                break
                
    # Calculate percentages
    total_rows = len(df)
    error_stats['error_percentages'] = {
        'extra_whitespace': round(error_stats['extra_whitespace']/total_rows * 100, 2),
        'missing_spaces': round(error_stats['missing_spaces']/total_rows * 100, 2),
        'digit_letter_mix': round(error_stats['digit_letter_mix']/total_rows * 100, 2),
        'special_chars': round(error_stats['special_chars']/total_rows * 100, 2),
        'case_issues': round(error_stats['case_issues']/total_rows * 100, 2)
    }
    
    return error_stats

# Analyze text errors
error_analysis = analyze_text_errors(df)

In [51]:
# Print summary statistics
print("Text Error Analysis Summary:")
print(f"Total rows analyzed: {len(df)}\n")
print("Error Percentages:")
for error_type, percentage in error_analysis['error_percentages'].items():
    print(f"{error_type}: {percentage}%")

Text Error Analysis Summary:
Total rows analyzed: 164

Error Percentages:
extra_whitespace: 5.49%
missing_spaces: 2.44%
digit_letter_mix: 13.41%
special_chars: 8.54%
case_issues: 40.85%


In [52]:
print("\nExample OCR errors found:")
for error_type, examples in error_analysis['examples'].items():
    if examples:
        print(f"\n{error_type}:")
        for ex in examples:
            print(f"  - {ex}")

# Additional analysis of text lengths and patterns
df['text_length'] = df['ocr_para_text'].astype(str).apply(len)
df['word_count'] = df['ocr_para_text'].astype(str).apply(lambda x: len(x.split()))


Example OCR errors found:

extra_whitespace:
  - Your     Mortgage Offer
  - Thismortgage will    be secured by a first legal   charge over    the property below:
  - You should        save a copy of this          offer for          your records.

missing_spaces:
  - If you have an existing mortgage from HSBC securedon the property which will remain in place, any part or parts of that mortgage that are to remain unchanged will also be included within this offer. Please note, however, that this part or these parts will continue to be subject to the HSBC Mortgage Loan Terms and Conditions, and/or, the HSBC Mortgage Deed Conditions (or inScotland the Scottish Residential Security Conditions, or in NorthernIreland the Mortgage Deed Conditions Northern Ireland that applied at the time of the original offer. The Conditions will apply to any other part or parts (for example, anynew money we advance under this offer). This is explainedfurther in the Conditions.
  - Where you're purchasingthep

In [53]:
print("\nText Length Statistics:")
print(df['text_length'].describe())


Text Length Statistics:
count    164.000000
mean     117.945122
std      117.470699
min        4.000000
25%       28.750000
50%       83.000000
75%      165.250000
max      699.000000
Name: text_length, dtype: float64


In [54]:
print("\nWord Count Statistics:")
print(df['word_count'].describe())


Word Count Statistics:
count    164.000000
mean      20.024390
std       20.120786
min        1.000000
25%        5.000000
50%       13.000000
75%       29.000000
max      113.000000
Name: word_count, dtype: float64


In [55]:
# Analyze paragraph types
print("\nParagraph Type Distribution:")
print(df['ocr_para_type'].value_counts())


Paragraph Type Distribution:
ocr_para_type
Paragraph    132
Title         32
Name: count, dtype: int64


## Global Variables

In [56]:
nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words

# Dictionary path
dictionary_path = "frequency_dictionary_en_82_765.txt"

# Initialize SymSpell with a larger prefix length
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)


True

## Word Frequency


In [57]:
def analyze_word_frequency(df, column='ocr_para_text', min_word_length=2, min_frequency=5):
    """
    Analyze word frequency in a dataframe column.
    
    This function processes text data from a specified column in a DataFrame to analyze word frequencies.
    It filters out stopwords, short words, and creates both a frequency DataFrame and a list of frequent words.
    
    Args:
        df (pandas.DataFrame): Input DataFrame containing text data
        column (str, optional): Name of the column containing text data. Defaults to 'ocr_para_text'
        min_word_length (int, optional): Minimum length of words to include. Defaults to 2
        min_frequency (int, optional): Minimum frequency threshold for "frequent" words. Defaults to 3
        
    Returns:
        tuple: A tuple containing:
            - pandas.DataFrame: DataFrame with columns 'word' and 'count', sorted by frequency
            - list: List of words that appear at least min_frequency times
            
    """
    
    # Combine all text from the column
    all_text = ' '.join(df[column].astype(str))
    
    # Extract words, filter by length and remove stopwords
    words = re.findall(r'\b[a-zA-Z]+\b', all_text.lower())
    filtered_words = [word for word in words if len(word) >= min_word_length and word not in stopwords]
    
    # Count word frequencies
    word_counts = Counter(filtered_words)
    
    # Create DataFrame with word frequencies
    word_freq_df = pd.DataFrame(word_counts.items(), columns=['word', 'count'])
    word_freq_df = word_freq_df.sort_values('count', ascending=False).reset_index(drop=True)
    
    # Create list of frequent words (appearing at least min_frequency times)
    frequent_words = [word for word, count in word_counts.items() if count >= min_frequency]
    
    return word_freq_df, frequent_words

# Find most frequent words in the batch
word_freq_df, frequent_words = analyze_word_frequency(df, min_word_length=3, min_frequency=5)

# Display results
print(f"{len(word_freq_df)} unique words")
print(f"{len(frequent_words)} frequent words (appearing 5+ times)")
print("\nFrequent words:")
print(frequent_words)

551 unique words
58 frequent words (appearing 5+ times)

Frequent words:
['mortgage', 'offer', 'legal', 'charge', 'property', 'scotland', 'standard', 'security', 'hsbc', 'bank', 'plc', 'information', 'set', 'illustration', 'shown', 'loan', 'terms', 'conditions', 'agreement', 'contact', 'time', 'apply', 'repay', 'ofler', 'date', 'end', 'period', 'change', 'provided', 'repayment', 'interest', 'rate', 'fixed', 'variable', 'loans', 'effect', 'document', 'completion', 'lender', 'section', 'costs', 'right', 'early', 'details', 'transfer', 'rights', 'complaint', 'financial', 'service', 'fees', 'annual', 'aprc', 'payments', 'monthly', 'additional', 'obligations', 'maintain', 'insurance']


## Creating Domain Specific Dictionary

### Download Frequency Dictionary

In [58]:
def download_frequency_dictionary(dict_url="https://github.com/ELI-Data-Mining-Group/PELIC-spelling/raw/refs/heads/master/frequency_dictionary_en_82_765.txt"):
    """
    Downloads frequency dictionary and returns it as a dictionary object.
    
    Args:
        dict_url (str): URL of the frequency dictionary file
        
    Returns:
        dict: Dictionary mapping words to their frequencies
            
    """
    # Download dictionary file
    try:
        response = requests.get(dict_url)
        response.raise_for_status()
        dict_content = response.text
        
        # Save to root folder
        with open("frequency_dictionary_en_82_765.txt", "w", encoding="utf-8") as f:
            f.write(dict_content)
        
    except Exception as e:
        print(f"Error downloading dictionary: {e}")
        return {}
    
    # Parse dictionary into word:frequency pairs
    frequency_dict = {}
    for line in dict_content.split('\n'):
        if line.strip():
            word, freq = line.strip().split(' ')
            frequency_dict[word] = int(freq)
            
    return frequency_dict

# Download frequency dictionary
frequency_dict = download_frequency_dictionary()
print(f"\nDownloaded dictionary with {len(frequency_dict)} words")

# Words in frequent_words that are not in the dictionary
domain_words = [word for word in frequent_words if word not in frequency_dict]
print(f"\n{len(domain_words)} words not in dictionary:")
print(domain_words)



Downloaded dictionary with 82765 words

4 words not in dictionary:
['hsbc', 'plc', 'ofler', 'aprc']


### Word Frequency

In [59]:
def find_highest_count_word(frequent_words, frequency_dict_path):
    """
    Find the word from frequent_words list that has the highest count 
    in the frequency dictionary file.
    
    Args:
        frequent_words (list): List of words to check
        frequency_dict_path (str): Path to the frequency dictionary file
        
    Returns:
        tuple: (word, count) of the word with highest frequency
    """
    
    # Dictionary to store the counts of our frequent words
    word_counts = {}
    
    # Read the frequency dictionary file
    with open(frequency_dict_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Each line format: "word count"
            parts = line.strip().split()
            if len(parts) == 2:
                word = parts[0]
                count = int(parts[1])
                
                # Check if this word is in our frequent_words list
                if word in frequent_words:
                    word_counts[word] = count
    
    # Find the word with the highest count
    if word_counts:
        highest_word = max(word_counts, key=word_counts.get)
        highest_count = word_counts[highest_word]
        
        print(f"Word counts found for frequent_words:")
        # Sort by count (descending) for better readability
        for word, count in sorted(word_counts.items(), key=lambda x: x[1], reverse=True):
            print(f"  {word}: {count:,}")
        
        print(f"\nHighest count word: '{highest_word}' with {highest_count:,} occurrences")
        
        return highest_word, highest_count
    else:
        print("No words from frequent_words list found in frequency dictionary")
        return None, 0


# Find the highest count word
highest_word, highest_count = find_highest_count_word(frequent_words, 'frequency_dictionary_en_82_765.txt')

Word counts found for frequent_words:
  information: 932,594,387
  time: 908,705,570
  contact: 645,824,184
  service: 519,537,222
  date: 488,967,374
  rights: 352,051,342
  set: 313,469,591
  details: 280,827,841
  terms: 277,705,910
  right: 273,620,358
  section: 232,251,956
  security: 230,014,019
  end: 220,812,328
  change: 210,601,244
  rate: 207,634,179
  insurance: 193,271,293
  property: 191,783,393
  conditions: 168,957,006
  provided: 159,785,849
  financial: 148,330,257
  standard: 147,177,212
  legal: 142,048,771
  additional: 132,187,702
  offer: 126,228,968
  interest: 120,272,948
  agreement: 111,356,320
  period: 103,906,182
  costs: 101,090,970
  early: 100,013,194
  document: 98,703,772
  bank: 91,559,349
  loan: 87,785,549
  annual: 85,043,687
  loans: 80,821,333
  apply: 76,341,478
  effect: 75,594,829
  mortgage: 74,160,962
  shown: 68,755,779
  transfer: 61,011,470
  charge: 56,542,418
  fees: 54,965,019
  fixed: 49,215,402
  monthly: 46,198,876
  variable: 34,

### Increase Priority of Frequent words

In [60]:
def boost_frequent_words(frequent_words, frequency_dict_path, domain_words=None, boost_factor=1000):
    """
    Increase priority in frequent_words list by boost_factor and update the frequency dictionary file.
    Excludes any words that appear in domain_words.
    """
    # Convert to sets for faster lookups
    frequent_words_set = set(frequent_words)
    domain_words_set = set(domain_words) if domain_words else set()
    
    # Remove domain words from frequent words
    frequent_words_set = frequent_words_set - domain_words_set
    
    boosted_dict = {}
    boosted_count = 0
    
    print(f"Boosting {len(frequent_words_set)} frequent words by {boost_factor}x...")
    print(f"Excluded {len(domain_words_set)} domain words from boosting")
    
    # First read all entries into memory
    with open(frequency_dict_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split()
            if len(parts) == 2:
                word, count = parts[0], int(parts[1])
                
                # Boost if it's in frequent_words and not in domain_words
                if word in frequent_words_set:
                    count *= boost_factor
                    boosted_count += 1
                
                boosted_dict[word] = count
    
    # Write back to file with boosted frequencies
    with open(frequency_dict_path, 'w', encoding='utf-8') as file:
        for word, count in boosted_dict.items():
            file.write(f"{word} {count}\n")
            
    print(f"Updated frequency dictionary with {boosted_count} boosted words")
    return boosted_dict

# Boost the frequencies
boosted_dict = boost_frequent_words(frequent_words, 'frequency_dictionary_en_82_765.txt', domain_words=domain_words, boost_factor=1000)


Boosting 54 frequent words by 1000x...
Excluded 4 domain words from boosting
Updated frequency dictionary with 54 boosted words


### Prioritize Protected words in the Dictionary

In [61]:
def boost_protected_words(protected_words, frequency_dict_path):
    """
    Give all protected words the highest frequency and priority in the dictionary.
    """
    # First pass: find the highest frequency
    max_frequency = 0
    all_words = {}
    
    print("Finding highest frequency in dictionary...")
    with open(frequency_dict_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split()
            if len(parts) == 2:
                word, count = parts[0], int(parts[1])
                all_words[word] = count
                if count > max_frequency:
                    max_frequency = count
    
    print(f"Highest frequency found: {max_frequency:,}")
    
    # Boost protected words to highest frequency
    boosted_count = 0
    for word in protected_words:
        if word in all_words:
            old_freq = all_words[word]
            all_words[word] = max_frequency
            print(f"  Boosted: {word} from {old_freq:,} to {max_frequency:,}")
            boosted_count += 1
        else:
            # Add word if it doesn't exist
            all_words[word] = max_frequency
            print(f"  Added: {word} with {max_frequency:,}")
            boosted_count += 1
    
    # Write updated frequencies back to file
    print("Writing updated frequencies to dictionary file...")
    with open(frequency_dict_path, 'w', encoding='utf-8') as file:
        for word, count in all_words.items():
            file.write(f"{word} {count}\n")
            
    print(f"Successfully boosted {boosted_count} protected words and updated dictionary")
    return all_words

# Protected words 
protected_words = {"hsbc", "uk", "plc", "aprc","mr", "ms", "mrs", "overpayments","fca", "www", "po", "box"}

# Boost protected words to highest frequency
boosted_dict = boost_protected_words(protected_words, 'frequency_dictionary_en_82_765.txt')

Finding highest frequency in dictionary...
Highest frequency found: 932,594,387,000
  Added: fca with 932,594,387,000
  Boosted: box from 169,231,297 to 932,594,387,000
  Added: ms with 932,594,387,000
  Added: uk with 932,594,387,000
  Added: mr with 932,594,387,000
  Boosted: mrs from 12,206,596 to 932,594,387,000
  Added: po with 932,594,387,000
  Added: hsbc with 932,594,387,000
  Added: www with 932,594,387,000
  Added: aprc with 932,594,387,000
  Added: overpayments with 932,594,387,000
  Added: plc with 932,594,387,000
Writing updated frequencies to dictionary file...
Successfully boosted 12 protected words and updated dictionary


## Spacing Detection

In [62]:
def segment_text(text):
    """
    Segments text by correcting spacing issues while preserving special cases.
    
    This function uses  word segmentation using SymSpell, but skips processing for:
    - Numbered list items (e.g. "13. additional information")
    - Short words (< 6 characters)
    - Numbers and codes
    - Pure punctuation
    
    Args:
        text (str): Input text with potential spacing issues
        
    Returns:
        str: Text with corrected word spacing
    """
    # Skip lines like "13. additional information"
    if re.match(r'^\d+\.\s+\w+', text.strip()):
        return text

    segmented_words = []
    for word in text.split():
        # Conditions to skip segmentation
        is_short = len(word) < 6
        is_mostly_digits = sum(c.isdigit() for c in word) >= len(word) * 0.5
        is_code_like = bool(re.match(r'[a-zA-Z]*\d{4,}[a-zA-Z\d]*', word))
        is_pure_punct = re.fullmatch(r'[\W_]+', word)

        if is_short or is_mostly_digits or is_code_like or is_pure_punct:
            segmented_words.append(word)
        else:
            # Only apply segmentation to word-like tokens
            segmented = sym_spell.word_segmentation(word).corrected_string
            segmented_words.append(segmented)

    return ' '.join(segmented_words)

# Apply to cleaned date-corrected text
df['spacing_ocr_para_text'] = df['ocr_para_text'].apply(segment_text)

# Show selected columns
display_cols = ['row_id','ocr_para_text', 'spacing_ocr_para_text']
df_spaced= df[display_cols]

In [63]:
df_spaced.head()

Unnamed: 0,row_id,ocr_para_text,spacing_ocr_para_text
0,1,Your Mortgage Offer,Your Mortgage Offer
1,2,Mr Mickey Mouse and Ms Minnie M0use,Mr Mickey Mouse and Ms Minnie M0use
2,3,04/03/202S,04/03/202S
3,4,B00123456789-998877665544,B00123456789-998877665544
4,5,Thismortgage will be secured by a first legal charge over the property below:,This mortgage will be secured by a first legal charge over the property below


## Digit Detection

In [64]:
# Define digit-to-letter OCR substitutions
ocr_substitutions = {
    '0': 'o',
    '1': 'l',
    '3': 'e',
    '5': 's',
    '6': 'g',
    '8': 'b',
    '9': 'g'
}

# Function to extract all words with at least one digit
def find_digit_words(text):
    return re.findall(r'\b\w*\d\w*\b', str(text))

# Replace digits in a word with corresponding letters
def substitute_ocr_digits(word):
    return ''.join(ocr_substitutions.get(c, c) for c in word)

# Rule to decide whether to apply substitution
def should_substitute(word):
    word = word.strip()

    if len(word) < 3:
        return False

    if re.fullmatch(r'\d+', word):
        return False  # pure numbers

    if re.fullmatch(r'[$£€]?\d+([,.]\d{2})?', word):
        return False  # currency values

    if re.match(r'^[A-Z]{1,2}\d{1,2}[A-Z]?\d[A-Z]{2}$', word, re.IGNORECASE):
        return False  # one-part postal code like "EC1A1BB"

    if re.match(r'^0\d{3}[-\s]?\d{6,7}$', word):
        return False  # UK phone numbers

    if re.match(r'^\d{1,2}(am|pm|AM|PM)$', word):
        return False  # time formats

    digit_count = sum(c.isdigit() for c in word)
    letter_count = sum(c.isalpha() for c in word)
    if digit_count >= letter_count:
        return False  

    return True


In [65]:
def process_row(row):
    """
    For each row:
    - Identify postal codes across two-word pairs
    - Skip substitution for postal codes, time, phone, etc.
    - Apply substitution safely
    """
    text = str(row['spacing_ocr_para_text'])
    words = text.split()
    detected = row['detected_digit_words']

    # Find two-word UK postal codes
    postal_code_indices = set()
    for i in range(len(words) - 1):
        candidate = f"{words[i]} {words[i+1]}"
        if re.match(r'^[A-Z]{1,2}\d{1,2}[A-Z]?\s\d[A-Z]{2}$', candidate, re.IGNORECASE):
            postal_code_indices.update({i, i+1})

    substituted_words = []
    for i, word in enumerate(words):
        if word not in detected:
            substituted_words.append(word)
            continue

        # Skip known 2-word postal code parts
        if i in postal_code_indices:
            substituted_words.append(word)
            continue

        if should_substitute(word):
            corrected = substitute_ocr_digits(word)
            substituted_words.append(corrected)
            text = text.replace(word, corrected)
        else:
            substituted_words.append(word)

    return pd.Series([substituted_words, text])


# Find digit-containing words
df['detected_digit_words'] = df['spacing_ocr_para_text'].apply(find_digit_words)

# Apply digit-to-letter correction safely
df[['substituted_words', 'digit_cleaned_ocr_para_text']] = df.apply(process_row, axis=1)

### Date Format Correction

In [66]:
def fix_dates(text):
    """
    Corrects common OCR errors in dates by substituting misrecognized digits.
    
    Args:
        text (str): Input text containing dates with potential OCR errors
        
    Returns:
        str: Text with OCR errors in dates corrected
    """
    # OCR digit substitutions (lowercase for lowercase OCR output)
    substitutions = {'O': '0', 'l': '1', 'i': '1', 'Z': '2', 'S': '5', 'B': '8'}

    # Date pattern: match digit-looking parts with possible OCR substitutions
    date_patterns = [
        r'\b\d{2}[-/]\d{2}[-/][\da-z]{4}\b',                   # 04/03/202s 
        r'\b[\da-z]{4}[-/]\d{2}[-/]\d{2}\b',                   # 202s/03/01
        r'\b\d{1,2}[-/ ]?[a-z]{3,9}[-/ ]?[\da-z]{2,4}\b',      # 01-mar-202s
        r'\b[a-z]{3,9}[-/ ]?\d{1,2}[-/ ]?[\da-z]{2,4}\b'       # march 1 202s
    ]
    combined_pattern = '|'.join(date_patterns)

    def substitute_date(match):
        return ''.join(substitutions.get(c, c) for c in match.group(0))

    return re.sub(combined_pattern, substitute_date, text, flags=re.IGNORECASE)

# Convert to lowercase before applying date fixes
#df['digit_cleaned_ocr_para_text'] = df['digit_cleaned_ocr_para_text'].str.lower()
df['digit_cleaned_ocr_para_text'] = df['digit_cleaned_ocr_para_text'].apply(fix_dates)


## Misspelling Detection

### Dictionary Based Spelling Check

In [67]:
def check_misspellings_symspell(text):
    """
    Detect misspelled words in OCR text using pre-loaded SymSpell dictionary.
    
    Args:
        text (str): Input OCR text.
    
    Returns:
        list: List of misspelled words (not in dictionary or corrected by SymSpell).
    """
    # Convert text to lowercase before processing
    text = str(text).lower()
    
    # Extract words including contractions and possessives
    words = re.findall(r"\b[a-zA-Z]+(?:'[a-z]+)?\b", text)

    misspellings = []
    for word in words:
        # Skip checking contractions that are in our known list
        if word in ["can't", "won't", "don't", "couldn't", "shouldn't", 
                   "wouldn't", "needn't", "mustn't", "she'll", "we'll",
                   "he'll", "they'll", "i'll", "i'm"]:
            continue
            
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
        if suggestions and suggestions[0].term != word:
            misspellings.append(word)
    return misspellings

# Convert digit_cleaned_ocr_para_text to lowercase and detect misspelled words in each row
df['misspelled_words'] = df['digit_cleaned_ocr_para_text'].str.lower().apply(check_misspellings_symspell)

# View rows with misspellings
misspelled_df = df[df['misspelled_words'].map(len) > 0]


### NER Based Spelling Detection

In [68]:
def find_unknown_words_with_ner(text, frequency_dict_path):
    """
    Use NER to identify unknown words and classify them as entities or truly unknown.
    """
    
    # Load models
    ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
    
    # Load frequency dictionary
    known_words = set()
    with open(frequency_dict_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split()
            if len(parts) == 2:
                known_words.add(parts[0].lower())
    
    # Extract all words from text
    words = text.lower().split()
    clean_words = [word.strip('.,!?;:"()-[]{}') for word in words]
    
    # Find unknown words, excluding special patterns
    unknown_words = []
    for word in clean_words:
        if not word or word in known_words:
            continue
            
        # Skip words matching special patterns
        if len(word) < 3:
            continue
        if re.fullmatch(r'\d+', word):
            continue
        # Skip currency amounts with commas and decimals
        if re.fullmatch(r'[$£€]?\d{1,3}(,\d{3})*([.]\d{2})?', word):
            continue
        # Skip phone numbers with + prefix
        if re.match(r'^\+\d{2,}$', word):
            continue
        # Skip dates in format dd/mm/yyyy
        if re.match(r'^\d{2}/\d{2}/\d{4}$', word):
            continue
        # Skip formatted numbers like 1452,000.00 and 1999.00
        if re.match(r'^\d+([,]\d{3})*([.]\d{2})?$', word):
            continue
        if re.match(r'^[A-Z]{1,2}\d{1,2}[A-Z]?\d[A-Z]{2}$', word, re.IGNORECASE):
            continue
        if re.match(r'^0\d{3}[-\s]?\d{6,7}$', word):
            continue
        # Skip Time format like 9am 
        if re.match(r'^\d{1,2}(am|pm|AM|PM)$', word):
            continue
        # Skip reference numbers like b00123456789-998877665544
        if re.match(r'^[a-z]\d{11,}[-]\d+$', word, re.IGNORECASE):
            continue
        # Skip short alphanumeric codes like 7rv, eh3
        if re.match(r'^(\d[a-z]{2}|[a-z]{2}\d)$', word, re.IGNORECASE):
            continue
            
        unknown_words.append(word)
    
    if not unknown_words:
        print("All words found in dictionary!")
        return [], []
    
    print(f"Found {len(unknown_words)} unknown words")
    
    # Use NER to classify unknown words
    ner_results = ner_pipeline(text)
    
    # Extract entities
    entities = set()
    for entity in ner_results:
        entity_text = entity['word'].replace('##', '').lower()
        entities.add(entity_text)
    
    # Also use spaCy NER
    doc = nlp(text)
    for ent in doc.ents:
        entities.add(ent.text.lower())
    
    # Classify unknown words
    likely_entities = [word for word in unknown_words if word in entities]
    truly_unknown = [word for word in unknown_words if word not in entities]
    
    print(f"\nClassification:")
    print(f"  Likely entities (valid): {len(likely_entities)}")
    print(f"  Truly unknown (may need correction): {len(truly_unknown)}")
    
    if likely_entities:
        print(f"\nLikely entities: {likely_entities}")
    if truly_unknown:
        print(f"Truly unknown: {truly_unknown}")
    
    return likely_entities, truly_unknown


In [69]:
def check_cleaned_text_for_unknown_words(df, frequency_dict_path):
    """
    Check cleaned_text column for unknown words using NER.
    """
    # Combine all text from cleaned_text column
    all_text = ' '.join(df['digit_cleaned_ocr_para_text'].astype(str))
    
    print("Analyzing cleaned_text column for unknown words:")
    print(f"Total text length: {len(all_text)} characters")
    
    entities, unknown = find_unknown_words_with_ner(all_text, frequency_dict_path)
    
    return entities, unknown

# Run NER on cleaned_text column
entities, unknown_words = check_cleaned_text_for_unknown_words(df, dictionary_path)

# Add entities to protected words
if entities:
    print(f"\nPotential protected_words: {entities}")
    
# Focus spell checking on truly unknown words
if unknown_words:
    print(f"\nMisspelt words: {unknown_words}")

Analyzing cleaned_text column for unknown words:
Total text length: 19247 characters


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Found 46 unknown words

Classification:
  Likely entities (valid): 11
  Truly unknown (may need correction): 35

Likely entities: ['yeu', 'hsbe', 'hsbe', 'hsbe', '6.7%', '4.16%', '6.99%', '5.75%', '8.6%', '8.24%', '10%']
Truly unknown: ['eh99', 'bya', "we're", 'basls', 'ofler', 'ofler', 'cond1t1on', 'cond1t1on', 'ofler', 'isa', 'todo', 'ca11', "fca's", "we're", 'hclp', "you'd", 'te1ephone', 'te1ephone', 'wa1ker', 'yoars', 'w1th', 'unt11', 'eh99', 'apre', 'unt11', 'w111', 'andnumber', 'apre', 'unt11', 'yor', 'cemplaints', 'te1ephone', 'non-compliance', 'ofler', 'goed']

Potential protected_words: ['yeu', 'hsbe', 'hsbe', 'hsbe', '6.7%', '4.16%', '6.99%', '5.75%', '8.6%', '8.24%', '10%']

Misspelt words: ['eh99', 'bya', "we're", 'basls', 'ofler', 'ofler', 'cond1t1on', 'cond1t1on', 'ofler', 'isa', 'todo', 'ca11', "fca's", "we're", 'hclp', "you'd", 'te1ephone', 'te1ephone', 'wa1ker', 'yoars', 'w1th', 'unt11', 'eh99', 'apre', 'unt11', 'w111', 'andnumber', 'apre', 'unt11', 'yor', 'cemplaints'

In [70]:
def clean_ocr_digits_in_unknown(text, unknown_words):
    """Replace common OCR digit substitutions with their letter equivalents only for unknown words"""
    cleaned_text = text
    for word in unknown_words:
        cleaned_word = word
        for digit, letter in ocr_substitutions.items():
            cleaned_word = cleaned_word.replace(digit, letter)
        if cleaned_word != word:
            cleaned_text = cleaned_text.replace(word, cleaned_word)
    return cleaned_text

ocr_substitutions = {
    '0': 'o',  
    '1': 'l',  
    '3': 'e',  
    '5': 'S',  
    '6': 'g',  
    '8': 'b',  
    '9': 'g'   
}

# Clean OCR digits only in unknown words
df['digit_cleaned_ocr_para_text'] = df['digit_cleaned_ocr_para_text'].apply(
    lambda x: clean_ocr_digits_in_unknown(x, unknown_words)
)

## Spelling Correction

 The correction process is context-aware, preserving special terms like numbers, codes and protected words while fixing common OCR spelling errors.


In [71]:
"""Set up SymSpell spell checker with configuration parameters"""
# Load dictionary
if os.path.exists(dictionary_path):
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
else:
    raise FileNotFoundError("Dictionary file not found")

def should_skip_correction(word):
    """
    Check if word should be excluded from spelling correction.
    """
    word = word.strip()

    if len(word) < 3:
        return True

    if re.fullmatch(r'\d+', word):
        return True  # pure numbers

    if re.fullmatch(r'[$£€]?\d+([,.]\d{2})?', word):
        return True  # currency values
        
    if re.match(r'^\d+(\.\d+)?%$', word):
        return True  # percentage values like 6.7%

    if re.match(r'^[A-Z]{1,2}\d{1,2}[A-Z]?\d[A-Z]{2}$', word, re.IGNORECASE):
        return True  # one-part postal code like "EC1A1BB"

    if re.match(r'^0\d{3}[-\s]?\d{6,7}$', word):
        return True  # UK phone numbers

    if re.match(r'^\d{1,2}(am|pm|AM|PM)$', word):
        return True  # time formats

    return False

def correct_with_symspell(word: str) -> str:
    """
    Corrects spelling of a single word using SymSpell.
    
    Args:
        word (str): Word to be spell-checked
        
    Returns:
        str: Spell-corrected word or original word if no correction found
    """
    if should_skip_correction(word):
        return word

    suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
    if suggestions:
        return suggestions[0].term
    return word

def correct_misspelled_words(text: str, misspelled_words: list) -> str:
    """
    Corrects only the misspelled words in the text.
    
    Args:
        text (str): Original text
        misspelled_words (list): List of words to correct
        
    Returns:
        str: Text with misspelled words corrected
    """
    if not misspelled_words:
        return text
        
    words = text.split()
    corrected_words = []
    
    for word in words:
        word_lower = word.lower()
        if word_lower in misspelled_words:
            corrected = correct_with_symspell(word_lower)
            corrected_words.append(corrected)
        else:
            corrected_words.append(word)
            
    return ' '.join(corrected_words)

# Apply correction only to misspelled words
df["cleaned_text"] = df.apply(
    lambda row: correct_misspelled_words(
        row["digit_cleaned_ocr_para_text"], 
        row["misspelled_words"]
    ),
    axis=1
)

display_cols = ['row_id', 'ocr_para_text','cleaned_text']
df_cleaned = df[display_cols]

In [72]:
df_cleaned


Unnamed: 0,row_id,ocr_para_text,cleaned_text
0,1,Your Mortgage Offer,Your Mortgage Offer
1,2,Mr Mickey Mouse and Ms Minnie M0use,Mr Mickey Mouse and Ms Minnie Mouse
2,3,04/03/202S,04/03/2025
3,4,B00123456789-998877665544,B00123456789-998877665544
4,5,Thismortgage will be secured by a first legal charge over the property below:,This mortgage will be secured by a first legal charge over the property below
5,6,10 Some Street Edinburgh Midlothian 5cotland EH99 7RV,10 Some Street Edinburgh Midlothian scotland EH99 7RV
6,7,"If theproperty is in Scotland, the mortgage will be secured bya first ranking Standard Security.",If the property is in Scotland the mortgage will be secured by first ranking Standard Security
7,8,About this offer,About this offer
8,9,Thanks for choosing HSBC UK Bank plc for your mortgage. We're pleased to make this binding offer of a mortgage. This offer incorporates the information set out in the Mortgage Illustration shown in later pages and the enclosed HSBC Mortgage Loan Terms and Conditions 2023 v1 Edition (the 'Conditions'). These documents form part of your legal agreement with us (the 'Agreement').,Thanks for choosing HSBC UK Bank plc for your mortgage were pleased to make this binding offer of a mortgage This offer incorporates the information set out in the Mortgage Illustration shown in later pages and the enclosed HSBC Mortgage Loan Terms and Conditions 2023 v1 Edition (the conditions ms These documents form part of your legal agreement with us (the agreement ms
9,10,"You shouldreadthis offer, and the documents enclosed with it, carefully before accepting it. If yeu do not understand any part of it, please contact us.","You should read this offer and the documents enclosed with it, carefully before accepting it. If you do not understand any part of it, please contact us."


## Confidence Score

In [73]:
def calculate_confidence_score(original_text: str, cleaned_text: str) -> float:
    """
    Calculate confidence score between original and cleaned text using multiple metrics
    
    Args:
        original_text (str): Original OCR text
        cleaned_text (str): Cleaned/corrected text
        
    Returns:
        float: Confidence score between 0 and 1
    """
    # Sequence matcher ratio (0-1 score)
    sequence_score = SequenceMatcher(None, original_text, cleaned_text).ratio()
    
    # Word length similarity ratio
    orig_words = len(original_text.split())
    cleaned_words = len(cleaned_text.split())
    length_ratio = min(orig_words, cleaned_words) / max(orig_words, cleaned_words)
    
    # Character count similarity ratio
    orig_chars = len(original_text)
    cleaned_chars = len(cleaned_text)
    char_ratio = min(orig_chars, cleaned_chars) / max(orig_chars, cleaned_chars)
    
    # Combine scores with weights
    confidence = (0.5 * sequence_score) + (0.25 * length_ratio) + (0.25 * char_ratio)
    
    return round(confidence, 3)

# Calculate confidence scores
df['confidence_score'] = df.apply(
    lambda row: calculate_confidence_score(
        row['ocr_para_text'],
        row['cleaned_text']
    ),
    axis=1
)

# Display results with confidence scores
display_cols = ['row_id', 'ocr_para_text', 'cleaned_text', 'confidence_score']
df_confidence = df[display_cols]
df_confidence.head()


Unnamed: 0,row_id,ocr_para_text,cleaned_text,confidence_score
0,1,Your Mortgage Offer,Your Mortgage Offer,0.909
1,2,Mr Mickey Mouse and Ms Minnie M0use,Mr Mickey Mouse and Ms Minnie Mouse,0.986
2,3,04/03/202S,04/03/2025,0.95
3,4,B00123456789-998877665544,B00123456789-998877665544,1.0
4,5,Thismortgage will be secured by a first legal charge over the property below:,This mortgage will be secured by a first legal charge over the property below,0.928


In [74]:
# Calculate overall validation metrics
total_docs = len(df)
high_confidence_docs = len(df[df['confidence_score'] >= 0.9])
medium_confidence_docs = len(df[df['confidence_score'].between(0.7, 0.9)])
low_confidence_docs = len(df[df['confidence_score'] < 0.7])

# Calculate percentages
high_conf_pct = (high_confidence_docs / total_docs) * 100
med_conf_pct = (medium_confidence_docs / total_docs) * 100 
low_conf_pct = (low_confidence_docs / total_docs) * 100

# Calculate average confidence score
avg_confidence = df['confidence_score'].mean()

print("Validation Metrics Summary:")
print(f"Total Documents Processed: {total_docs}")
print(f"Average Confidence Score: {avg_confidence:.2%}")
print("\nConfidence Level Distribution:")
print(f"High Confidence (>= 0.9): {high_confidence_docs} docs ({high_conf_pct:.1f}%)")
print(f"Medium Confidence (0.7-0.9): {medium_confidence_docs} docs ({med_conf_pct:.1f}%)")
print(f"Low Confidence (< 0.7): {low_confidence_docs} docs ({low_conf_pct:.1f}%)")

Validation Metrics Summary:
Total Documents Processed: 164
Average Confidence Score: 95.53%

Confidence Level Distribution:
High Confidence (>= 0.9): 147 docs (89.6%)
Medium Confidence (0.7-0.9): 10 docs (6.1%)
Low Confidence (< 0.7): 7 docs (4.3%)
