In [1]:
import os
import glob
import sys
import math
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from pagexml.parser import parse_pagexml_file
from pathlib import Path
from termcolor import colored 
import re
import numpy as np 
import json
import html
import csv
from collections import defaultdict

## XML extraction
Just give the folder that contains all xml files and we get all texts back in dictionary format

In [None]:

def extract_texts_from_pagexml(input_dir):
    
    input_dir = Path(input_dir)

    xml_files = list(input_dir.glob("*.xml"))
    print(f"Found {len(xml_files)} XML files.")

    extracted_texts = {}

    for pagexml_file in xml_files:
        try:
            print(f"Processing: {pagexml_file}")
            filename = pagexml_file.stem  

            page_doc = parse_pagexml_file(pagexml_file)  
            text = " ".join([line.text for i in page_doc.text_regions for line in i.lines if line.text is not None ])
            extracted_texts[filename] = text

        except Exception as e:
            print(f"Failed: {pagexml_file}: {e}")

    return extracted_texts
text = extract_texts_from_pagexml("../data/d2_0001-0100_without_marginalia")
print(f"Extracted {text} texts from PageXML files.")

Found 24 XML files.
Processing: ..\..\..\new_data\173736378X_00000051.xml
Processing: ..\..\..\new_data\173736378X_00000100.xml
Processing: ..\..\..\new_data\173736378X_00000112.xml
Processing: ..\..\..\new_data\173736378X_00000120.xml
Processing: ..\..\..\new_data\173736378X_00000128.xml
Processing: ..\..\..\new_data\173738812X_00000025.xml
Processing: ..\..\..\new_data\173738812X_00000096.xml
Processing: ..\..\..\new_data\173738812X_00000109.xml
Processing: ..\..\..\new_data\173738812X_00000153.xml
Processing: ..\..\..\new_data\173738812X_00000157.xml
Processing: ..\..\..\new_data\173739250X_00000062.xml
Processing: ..\..\..\new_data\173739250X_00000128.xml
Processing: ..\..\..\new_data\173739250X_00000133.xml
Processing: ..\..\..\new_data\173739250X_00000253.xml
Processing: ..\..\..\new_data\173739250X_00000265.xml
Processing: ..\..\..\new_data\173857282X_00000027.xml
Processing: ..\..\..\new_data\173857282X_00000083.xml
Processing: ..\..\..\new_data\173857282X_00000179.xml
Processi

In [None]:
MODEL_ID = "meta-llama/Llama-3.2-1B"
# these are tresholds again I defined based on my observation
CRITICAL_LL_THRESHOLD = -10.0 # highly problematic
LOW_LL_THRESHOLD = -6.0       
VISUALIZATION_THRESHOLD_RED = -5.5
VISUALIZATION_THRESHOLD_YELLOW = -3.5
MODEL_MAX_LENGTH = 1024

CRITICAL_LL_THRESHOLD_WORD = -10.0   # word-level colouring
VIS_THRESHOLD_RED_WORD = -5.5
VIS_THRESHOLD_YELLOW_WORD = -3.5

OCR_ERROR_THRESHOLD = -4.5        

 ## Load pretrained model 


In [4]:
print(f"Loading model and tokenizer: {MODEL_ID}...")
try:
   tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
   model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

   model.eval()
        
except Exception as e:
    print(f"Error loading model/tokenizer: {e}")
    sys.exit(1)


Loading model and tokenizer: meta-llama/Llama-3.2-1B...


In [None]:
# just to test if the model knows these characters already
print("Tokenizer Test:")

test_chars = ["ü", "ö", "ä", "ß", "Ġ","ſ", "ʒ"]

for char in test_chars:
    tokens = tokenizer.tokenize(char)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    print(f"Char: '{char}' -> Tokens: {tokens} -> IDs: {token_ids} -> Decoded: {tokenizer.decode(token_ids)}")

print(f"Tokenizing 'über': {tokenizer.tokenize('über')}")
print(f"Tokenizing 'schön': {tokenizer.tokenize('schön')}")
print(f"Tokenizing 'spät': {tokenizer.tokenize('spät')}")
print(f"Tokenizing 'Fuß': {tokenizer.tokenize('Fuß')}")
print(f"Tokenizing 'somit': {tokenizer.tokenize('somit')}")
print(f"Tokenizing 'zur': {tokenizer.tokenize('zur')}")

Tokenizer Test:
Char: 'ü' -> Tokens: ['Ã¼'] -> IDs: [2448] -> Decoded: ü
Char: 'ö' -> Tokens: ['Ã¶'] -> IDs: [3029] -> Decoded: ö
Char: 'ä' -> Tokens: ['Ã¤'] -> IDs: [2357] -> Decoded: ä
Char: 'ß' -> Tokens: ['ÃŁ'] -> IDs: [8156] -> Decoded: ß
Char: 'Ġ' -> Tokens: ['Äł'] -> IDs: [119741] -> Decoded: Ġ
Char: 'ſ' -> Tokens: ['Å', '¿'] -> IDs: [129, 123] -> Decoded: ſ
Char: 'ʒ' -> Tokens: ['Ê', 'Ĵ'] -> IDs: [134, 240] -> Decoded: ʒ
Tokenizing 'über': ['Ã¼ber']
Tokenizing 'schön': ['sch', 'Ã¶n']
Tokenizing 'spät': ['sp', 'Ã¤t']
Tokenizing 'Fuß': ['Fu', 'ÃŁ']
Tokenizing 'somit': ['som', 'it']
Tokenizing 'zur': ['z', 'ur']


## Perplexity and Log-Likelihood Calculation

This function computes either the perplexity or the mean log-likelihood of a given text,
depending on the parameters passed. Since both metrics use similar computations 
(based on the model's loss),so to avoid  code duplication.

- To get the perplexity score, set `return_perplexity=True`.
- To get the mean log-likelihood and token count, set `return_tokens=True`.

In [6]:
def evaluate_text_likelihood(text: str, tokenizer, model, return_tokens=False, return_perplexity=False):
    
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=MODEL_MAX_LENGTH
    )
    input_ids = inputs.input_ids
    num_tokens = input_ids.shape[1]

    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss

    if return_perplexity:
        try:
            return math.exp(loss)
        except OverflowError:
            return float('inf')
    elif return_tokens:
        return -loss, num_tokens
    else:
        return -loss

## Token-Level Log-Likelihood Functions

Here after I get the logits (scores for each token in the vocabulary), I calculated the log probabilities and mapped them to the actual tokens in the text.


In [7]:
def get_token_log_probs_with_tokens(text, tokenizer, model):
    
    if not text.strip(): return []

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=MODEL_MAX_LENGTH, return_offsets_mapping=True)
    input_ids = inputs.input_ids    

    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        # Raw predictions before softmax, so these are not probabilities yet
        logits = outputs.logits  

        input_token_ids = inputs.input_ids[0]  # [0] to get the sequence tensor without the batch dimension
        # If text = "The cat sat"
        # token_ids = [101, 2001, 4937, 2938, 102]  -> [CLS] The cat sat [SEP]

        # log_softmax to convert logits into log-probabilities over the vocabulary
        # We exclude the last position, as it has no "next token" to predict
        # After we got the predictions, we need to select the log-prob for the correct next token
        # at each position using gather
        log_probs_all_vocab = torch.nn.functional.log_softmax(logits[0, :-1, :], dim=-1)

        # Shift the input tokens to get the correct "target" tokens (the actual next tokens)
        # If input = [The, cat, sat, .], then target = [cat, sat, .]
        target_ids = input_token_ids[1:]  # These are the tokens we're checking likelihood for

        # For each prediction step, we select the log-probability the model assigned to the actual next token
        # gather selects the log-prob at the correct index in vocab for each timestep
        selected_log_probs = log_probs_all_vocab.gather(1, target_ids.unsqueeze(-1)).squeeze(-1).tolist()

    # Get all tokens in form (e.g., ["The", "Ġcat", "Ġsat", "."])
    all_tokens_str = tokenizer.convert_ids_to_tokens(input_token_ids)
    offset_mapping = inputs.offset_mapping[0].tolist()  # getting the offset mapping for the turning into original text

    # Pair each token with its corresponding log-probability
    # The log-prob at index j corresponds to token[j+1], because prediction for token[1] comes from input token[0]
    token_data = []
    for i, logp in enumerate(selected_log_probs):
        token_index_in_text = i + 1  # Skip the first token since it's not predicted
        token_str = all_tokens_str[token_index_in_text]
        token_data.append((token_str, logp, token_index_in_text))

    return token_data, all_tokens_str, offset_mapping

## Word-Level Log-Likelihood 


In [None]:
def get_word_log_probs(text: str,
                       tokenizer,
                       model,
                       *,
                       return_token_level: bool = False,
                       return_offsets: bool = False,
                       model_max_len: int = 1024):

    inputs = tokenizer(text,
                    return_tensors="pt",
                    truncation=True,
                    max_length=model_max_len,
                    return_offsets_mapping=True)

    with torch.no_grad():
        logits = model(inputs.input_ids).logits           
    logp_all = torch.nn.functional.log_softmax(logits, dim=-1)

    # shift so position i predicts token i (same operation)
    target_ids = inputs.input_ids[:, 1:]               
    context_logp = logp_all[:, :-1, :]               
    logp_sel = context_logp.gather(2, target_ids.unsqueeze(-1)).squeeze() 

    # I need to keep track of the offsets to map tokens back to original text
    # and to group them into words
    offsets = inputs.offset_mapping[0].tolist()
    tok_recs = []
    for idx, ((start, end), tid, lp) in enumerate(
            zip(offsets[1:], target_ids[0], logp_sel)):
        tok_recs.append({
            "idx":  idx+1,                               
            "tok":  tokenizer.convert_ids_to_tokens(int(tid)),
            "s":    int(start),
            "e":    int(end),
            "lp":   float(lp)
        })
    # Group tokens into words, taking into account the special "Ġ" marker as space
    # and punctuations are kept separate 
    words, cur = [], []
    for t in tok_recs:
        start_new = False

        # token string starts with the "space" marker "Ġ"
        if t["tok"].startswith("Ġ"):
            start_new = True

        # token is standalone punctuation
        if t["tok"] in {".", ",", ";", ":", "!", "?", "(", ")", "[", "]",
                        "{", "}", "\"", "'", "«", "»", "„", "“"}:
            start_new = True
        # new word starts, words list of list
        if start_new and cur:
            words.append(cur)
            cur = [t]
        else:
            cur.append(t)

    if cur:
        words.append(cur)

    # here assemble word list
    word_log_list = []
    for w in words:
        # start and end indices in the normalized text
        s_norm, e_norm = w[0]["s"], w[-1]["e"]-1
        w_text  = text[s_norm:e_norm+1]                
        w_lp    = float(np.mean([t["lp"] for t in w])) # word average log-probability
        word_log_list.append((w_text, w_lp, (s_norm, e_norm)))

    outs = [word_log_list]

    if return_token_level:
        outs.append([(t["tok"], t["lp"], t["idx"]) for t in tok_recs])

    if return_offsets:
        outs.extend([
            [(t["s"], t["e"]) for t in tok_recs],
            [t["tok"] for t in tok_recs]
        ])

    return tuple(outs) if len(outs) > 1 else outs[0]

# Problematic Tokens

Here I wanted to show the problematic tokens (under the low log-likelihood threshold) and not only the problematic token but also the context so I used context window to show surroindig words 
    

In [None]:
def analyze_problematic_tokens(text_tokens_with_logprobs, all_tokens_str, context_window=3):
    
    problematic_tokens_details = []
    if not text_tokens_with_logprobs:
        print(" No token probabilities to analyze.")
        return problematic_tokens_details

    print(f"\n Problematic Tokens (LogProb < {LOW_LL_THRESHOLD:.1f}) with Context:")
    found_any = False
    for token_str, logp, token_idx in text_tokens_with_logprobs:
        if logp < LOW_LL_THRESHOLD:
            found_any = True
            # to make it easier to follow the context I used context window of 3 tokens before and after the token
            start_ctx = max(0, token_idx - context_window)
            end_ctx = min(len(all_tokens_str), token_idx + context_window + 1)

            context_display_list = []
            for i in range(start_ctx, end_ctx):
                display_tok = all_tokens_str[i].replace("Ġ", "")
                if i == token_idx:
                    context_display_list.append(colored(f"-->{display_tok}<--", "red", attrs=["bold"]))
                else:
                    context_display_list.append(display_tok)
            context_str_formatted = " ".join(context_display_list)

            clean_token_str = token_str.replace("Ġ", "")
            print(f"  - Token: '{colored(clean_token_str, 'magenta')}' (LogProb: {colored(f'{logp:.2f}', 'cyan')}) | Context: ... {context_str_formatted} ...")
            problematic_tokens_details.append((clean_token_str, logp, " ".join(all_tokens_str[start_ctx:end_ctx]).replace("Ġ","")))

    if not found_any:
        print(f"  No tokens found with LogProb < {LOW_LL_THRESHOLD:.1f}.")
    return problematic_tokens_details


# Visualization to display token likelihoods

I did some preprocessing like removing "Ġ" which is used by tokenizers to indicate spaces and also some skipping of empty tokens etc.


In [None]:
def visualize_token_likelihoods(all_tokens_str, text_tokens_with_logprobs):
    if not all_tokens_str:
        print("\n  No tokens for visualization.")
        return

    print("\n Visualized Token Likelihoods (Colors: Green > Yellow > Red [problematic]):")

    # Create a dictionary for quick log_prob lookup by token index
    log_probs_map = {idx: lp for _, lp, idx in text_tokens_with_logprobs}

    output_line = []
    for i, token_str in enumerate(all_tokens_str):
        clean_token = token_str.replace("Ġ", "") # "Ġ" is used by tokenizers for space
        if not clean_token.strip() and clean_token != " ": 
             if clean_token == " " and i > 0 and not output_line[-1].endswith(" "): 
                output_line.append(" ")
             continue

        logp = log_probs_map.get(i, None) 

        if logp is None and i == 0 :
            color = "white"
        elif logp is None: 
            color = "white"
        elif logp < VISUALIZATION_THRESHOLD_RED:
            color = "red"
        elif logp < VISUALIZATION_THRESHOLD_YELLOW:
            color = "yellow"
        else:
            color = "green"

        try:
            output_line.append(colored(clean_token, color))
        except ImportError: 
            output_line.append(clean_token)

    # This part is a bit hard because with subword tokenizers a simple join is may not be sufficent
    final_display = ""
    for i, item in enumerate(output_line):
        final_display += item
        if i < len(all_tokens_str) -1 and (all_tokens_str[i+1].startswith("Ġ") or not item.endswith(" ")):
             if not (all_tokens_str[i].endswith(("'", "-",'"')) or (i < len(all_tokens_str) -1 and all_tokens_str[i+1].startswith((".",",",";","!","?")))): 
                final_display += " "
        elif i < len(output_line) -1 and not item.endswith(" "): 
             final_display += " "


    print(final_display.strip())

# Historical character map 
Some models dont not know historical characters like "ſ" (long s) or "ß" (sharp s) so I need to map them to their modern equivalents so that we can get more accurate results


In [None]:
# "LSX-UniWue/LLaMmlein_1B" old german letters are not recognizable according to my observation for example, 
# Char: 'ſ' -> Tokens: ['Å', '¿'] -> IDs: [130, 126] -> Decoded: ſ 
# so they are not in the tokenizer vocabulary so I needed to normalize them
# and this mapping can be useful for other models thats why I wanted to add it
with open("historical_char_map.json", encoding="utf-8") as f:
    HISTORICAL_CHAR_MAP = json.load(f)

def normalize_text_and_get_char_mapping(original_text, char_map_dict):

    normalized_text_parts = []
    norm_char_idx_to_orig_char_idx = []  # Map: index in normalized_text -> index in original_text

    for orig_idx, orig_char in enumerate(original_text):
        replacement = char_map_dict.get(orig_char, orig_char) 
        
        normalized_text_parts.append(replacement)
        for _ in replacement: # For each character in the replacement string
            norm_char_idx_to_orig_char_idx.append(orig_idx)

    final_normalized_text = "".join(normalized_text_parts)
    return final_normalized_text, norm_char_idx_to_orig_char_idx

# HTML presenation

I thought saving as a html format can be useful for our gui

In [None]:
def generate_html(original_text, token_offsets, token_log_probs):
    html_out = ""
    last_pos = 0
    for (start, end), (_, logp, _) in zip(token_offsets, token_log_probs):
        if start is None or end is None:
            continue

        color = "green"
        if logp < CRITICAL_LL_THRESHOLD:
            color = "darkred"
        elif logp < VISUALIZATION_THRESHOLD_RED:
            color = "red"
        elif logp < VISUALIZATION_THRESHOLD_YELLOW:
            color = "orange"

        html_out += html.escape(original_text[last_pos:start])
        token_html = html.escape(original_text[start:end+1])
        html_out += f'<span style="color:{color}; font-weight:bold;" title="logp={logp:.2f}">{token_html}</span>'
        last_pos = end + 1

    html_out += html.escape(original_text[last_pos:])
    return html_out

In [13]:
def generate_html_words(original_text, word_log_list, norm2orig):
    out, last = [], 0
    for word, lp, (s_norm, e_norm) in word_log_list:
        s_orig, e_orig = norm2orig[s_norm], norm2orig[e_norm]
        out.append(html.escape(original_text[last:s_orig]))

        if lp < CRITICAL_LL_THRESHOLD_WORD:
            colour = "darkred"
        elif lp < VIS_THRESHOLD_RED_WORD:
            colour = "red"
        elif lp < VIS_THRESHOLD_YELLOW_WORD:
            colour = "orange"
        else:
            colour = "lightgreen"

        span = html.escape(original_text[s_orig:e_orig+1])
        out.append(f'<span style="color:{colour};font-weight:bold;" '
                   f'title="logp={lp:.2f}">{span}</span>')
        last = e_orig + 1
    out.append(html.escape(original_text[last:]))
    return "".join(out)


In [None]:
def get_page_readability_analytics(token_log_probs_list, perplexity, mean_overall_ll, num_total_tokens):

    analytics = {
        "Perplexity": perplexity,
        "MeanLogLikelihood_Overall": mean_overall_ll,
        "TotalTokens": num_total_tokens,
        "LowLikelihoodTokenCount": 0,
        "CriticalLikelihoodTokenCount": 0,
        "PercentageLowLikelihood": 0.0,
        "PercentageCriticalLikelihood": 0.0,
        "ReadabilityCategory": "N/A",
        "LogLikelihood_Min": None,
        "LogLikelihood_Median": None,
        "LogLikelihood_Max": None,
    }

    log_probs_values = [lp for _, lp, _ in token_log_probs_list]
    if not log_probs_values: # if only one token in text, token_log_probs_list is empty
        if num_total_tokens > 0: # If there was text but no subsequent tokens for log_probs
             analytics["ReadabilityCategory"] = "Very Short / Unanalyzable"
        return analytics


    analytics["LowLikelihoodTokenCount"] = sum(1 for lp in log_probs_values if lp < LOW_LL_THRESHOLD)
    analytics["CriticalLikelihoodTokenCount"] = sum(1 for lp in log_probs_values if lp < CRITICAL_LL_THRESHOLD)

    # num_total_tokens is from the input, token_log_probs_list is for tokens AFTER the first.
    analyzable_tokens_count = len(log_probs_values)
    if analyzable_tokens_count > 0:
        analytics["PercentageLowLikelihood"] = (analytics["LowLikelihoodTokenCount"] / analyzable_tokens_count) * 100
        analytics["PercentageCriticalLikelihood"] = (analytics["CriticalLikelihoodTokenCount"] / analyzable_tokens_count) * 100

    analytics["LogLikelihood_Min"] = np.min(log_probs_values)
    analytics["LogLikelihood_Median"] = np.median(log_probs_values)
    analytics["LogLikelihood_Max"] = np.max(log_probs_values)

    perc_low = analytics["PercentageLowLikelihood"]

    if perplexity < 150 and mean_overall_ll > -4.5 and perc_low < 3.0: 
        analytics["ReadabilityCategory"] = "Good"
    elif perplexity < 400 and mean_overall_ll > -6.0 and perc_low < 7.0:
        analytics["ReadabilityCategory"] = "Fair"
    elif perplexity < 1000 and mean_overall_ll > -8.0 and perc_low < 15.0:
        analytics["ReadabilityCategory"] = "Poor"
    else:
        analytics["ReadabilityCategory"] = "Very Poor / Problematic"

    if num_total_tokens < 10 : 
        if analytics["ReadabilityCategory"] in ["Good", "Fair"] and perc_low > 10: 
            analytics["ReadabilityCategory"] = "Poor (Short with errors)"
        elif num_total_tokens < 3:
             analytics["ReadabilityCategory"] = "Very Short / Unanalyzable"


    print("\n Page Readability Analytics:")
    print(f"  - Overall Perplexity: {analytics['Perplexity']:.2f}")
    print(f"  - Overall Mean Log-Likelihood: {analytics['MeanLogLikelihood_Overall']:.4f}")
    print(f"  - Total Tokens Processed: {analytics['TotalTokens']}")
    print(f"  - Readability Category: {colored(str(analytics['ReadabilityCategory']), 'blue', attrs=['bold'])}")
    print(f"  - Low Likelihood Tokens (< {LOW_LL_THRESHOLD:.1f}): {analytics['LowLikelihoodTokenCount']} ({analytics['PercentageLowLikelihood']:.2f}%)")
    print(f"  - Critical Likelihood Tokens (< {CRITICAL_LL_THRESHOLD:.1f}): {analytics['CriticalLikelihoodTokenCount']} ({analytics['PercentageCriticalLikelihood']:.2f}%)")
    if analytics["LogLikelihood_Min"] is not None:
        print(f"  - Token Log-Likelihood Stats: Min={analytics['LogLikelihood_Min']:.2f}, Median={analytics['LogLikelihood_Median']:.2f}, Max={analytics['LogLikelihood_Max']:.2f}")

    return analytics

## ERROR Collection 



In [None]:
def collect_ocr_errors(word_log_list,
                       norm2orig_map,
                       original_text: str,
                       *,
                       lp_threshold: float = -8.0,
                       context_window: int = 20,
                       outfile: str | None = None):

    rows = []
    for word, lp, (s_norm, e_norm) in word_log_list:
        if lp >= lp_threshold:
            continue

        s_orig = norm2orig_map[s_norm]
        e_orig = norm2orig_map[e_norm]

        ctx_start = max(0, s_orig - context_window)
        ctx_end   = min(len(original_text), e_orig + 1 + context_window)
        # Additional context around the word
        context   = original_text[ctx_start:s_orig] + \
                    " " + original_text[s_orig:e_orig+1] + " " + \
                    original_text[e_orig+1:ctx_end]

        rows.append({
            "word"            : original_text[s_orig:e_orig+1],
            "logP_mean"       : lp,
            "context"         : context,
            "start_char_orig" : s_orig,
            "end_char_orig"   : e_orig
        })

    #CSV
    if outfile is not None:
        fieldnames = ["word", "logP_mean", "context",
                      "start_char_orig", "end_char_orig"]
        with open(outfile, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            for r in rows:
                writer.writerow(r)

    return rows


# Main loop

Here I merged everything 

In [None]:
input_dir = "../data/d2_0001-0100_without_marginalia"

try:
    if not Path(input_dir).exists():
        raise FileNotFoundError("directory not found")
    texts_xml = extract_texts_from_pagexml(input_dir) 

except (FileNotFoundError, NameError, ValueError) as e:
    print(f"Error loading texts: {e}")

results_data = []
results_html = [] 
print(texts_xml)
for fname, original_page_text in texts_xml.items():
    

    try:
        # Normalize text for the model and get character mapping, I needed becasue later on I need to map the tokens back to the original text
        normalized_page_text, norm_char_to_orig_char_map = normalize_text_and_get_char_mapping(original_page_text, HISTORICAL_CHAR_MAP)
        print(norm_char_to_orig_char_map)
        # getting the likelihood of the normalized text
        ppl = evaluate_text_likelihood(normalized_page_text, tokenizer, model, return_perplexity=True)
        # getting the mean log-likelihood and number of tokens for the normalized text
        mean_ll_overall, num_tokens_overall = evaluate_text_likelihood(normalized_page_text, tokenizer, model, return_tokens=True)

        # Word level
        (word_log_probs,
         tok_log_probs,
         norm_char_offsets,
         all_tokens_norm) = get_word_log_probs(
                                normalized_page_text, tokenizer, model,
                                return_token_level=True, return_offsets=True)
        # Analyze and print problematic tokens, ,it uses normalized tokens for analysis
        problematic_tokens_summary = analyze_problematic_tokens(tok_log_probs, all_tokens_norm, context_window=3)

        html_text = generate_html_words(original_page_text, word_log_probs, norm_char_to_orig_char_map)

        page_analytics = get_page_readability_analytics(tok_log_probs, ppl, mean_ll_overall, num_tokens_overall)
        

        ocr_errors = collect_ocr_errors(
                word_log_probs,
                norm2orig_map=norm_char_to_orig_char_map,
                original_text=original_page_text,
                lp_threshold=-5.5,
                outfile=f"{fname}_ocr_errors.csv")  

        print(f" {len(ocr_errors)} suspect words written to {fname}_ocr_errors.csv")

        results_data.append({
            "Filename": fname,
            **page_analytics,
            "ProblematicTokensExamples": problematic_tokens_summary[:3]
        })
        results_html.append({"Filename": fname, "ColoredTextHTML": html_text})

    except Exception as e:
        print(f"  ERROR processing {fname}: {e}")
        import traceback
        traceback.print_exc()
        error_analytics = {k:None for k in get_page_readability_analytics([], float('nan'),float('nan'),0) if k not in ["Perplexity", "MeanLogLikelihood_Overall", "TotalTokens"]}


Found 24 XML files.
Processing: ..\..\..\new_data\173736378X_00000051.xml
Processing: ..\..\..\new_data\173736378X_00000100.xml
Processing: ..\..\..\new_data\173736378X_00000112.xml
Processing: ..\..\..\new_data\173736378X_00000120.xml
Processing: ..\..\..\new_data\173736378X_00000128.xml
Processing: ..\..\..\new_data\173738812X_00000025.xml
Processing: ..\..\..\new_data\173738812X_00000096.xml
Processing: ..\..\..\new_data\173738812X_00000109.xml
Processing: ..\..\..\new_data\173738812X_00000153.xml
Processing: ..\..\..\new_data\173738812X_00000157.xml
Processing: ..\..\..\new_data\173739250X_00000062.xml
Processing: ..\..\..\new_data\173739250X_00000128.xml
Processing: ..\..\..\new_data\173739250X_00000133.xml
Processing: ..\..\..\new_data\173739250X_00000253.xml
Processing: ..\..\..\new_data\173739250X_00000265.xml
Processing: ..\..\..\new_data\173857282X_00000027.xml
Processing: ..\..\..\new_data\173857282X_00000083.xml
Processing: ..\..\..\new_data\173857282X_00000179.xml
Processi

In [17]:
print(results_data)

[{'Filename': '173736378X_00000051', 'Perplexity': 23.57652744808353, 'MeanLogLikelihood_Overall': tensor(-3.1603), 'TotalTokens': 496, 'LowLikelihoodTokenCount': 81, 'CriticalLikelihoodTokenCount': 15, 'PercentageLowLikelihood': 16.363636363636363, 'PercentageCriticalLikelihood': 3.0303030303030303, 'ReadabilityCategory': 'Very Poor / Problematic', 'LogLikelihood_Min': np.float64(-16.389102935791016), 'LogLikelihood_Median': np.float64(-2.409608840942383), 'LogLikelihood_Max': np.float64(-0.0009196343016810715), 'ProblematicTokensExamples': [('die', -14.101268768310547, 'die Stadt ange legt wurde'), ('Stadt', -8.015586853027344, 'die Stadt ange legt wurde .'), ('ange', -8.62804889678955, 'die Stadt ange legt wurde . We')]}, {'Filename': '173736378X_00000100', 'Perplexity': 17.94972709356424, 'MeanLogLikelihood_Overall': tensor(-2.8876), 'TotalTokens': 711, 'LowLikelihoodTokenCount': 115, 'CriticalLikelihoodTokenCount': 18, 'PercentageLowLikelihood': 16.19718309859155, 'PercentageCriti

I wanted to save all the information that we get but result is tensor so I needed to convert it to suitable format 

In [18]:
def clean_for_json(obj):
    if isinstance(obj, torch.Tensor):
        return obj.item()
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, (list, tuple)):
        return [clean_for_json(x) for x in obj]
    if isinstance(obj, dict):
        return {k: clean_for_json(v) for k, v in obj.items()}
    return obj

#the cleaning 
cleaned_results = clean_for_json(results_data)
# Then write to file
with open("ocr_colored_results.json", "w", encoding="utf-8") as f:
    json.dump(results_html, f, indent=2, ensure_ascii=False)

with open("ocr_readability.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_results, f, indent=2, ensure_ascii=False)


print("Colored token results saved to ocr_colored_results.json")


Colored token results saved to ocr_colored_results.json


In [19]:
from IPython.display import display, HTML
with open("ocr_colored_results.json", encoding="utf-8") as f:
    data = json.load(f)
for entry in data:
    display(HTML(f"<h3>{entry['Filename']}</h3>" + entry["ColoredTextHTML"]))
