In [None]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload modules automatically before each cell is executed.
%autoreload 2

In [None]:
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer
from src.llms_ocr.errors_calculator_chunks import ErrorsCalculator

#### Load the model

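The model below can be replaced with any other causal language model that `AutoModelForCausalLM` can load; use the matching tokenizer.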

In [None]:
# Checkpoint identifier shared by the tokenizer and the model, so the two
# always come from the same pretrained checkpoint.
checkpoint_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Load the tokenizer and the causal language model for that checkpoint.
lama_tokenizer = AutoTokenizer.from_pretrained(checkpoint_id)
lama_model = AutoModelForCausalLM.from_pretrained(checkpoint_id)

#### Load the data

Set the input directory containing the XML files and the output directory where the results are stored as JSON and/or CSV files.
Each output file contains the list of words for a given page, with the following information for each word:
`word,line_id,paragraph_id,word_position,perplexity,is_error,page_id`

In [None]:
# data_dir: directory that is scanned for the input XML files.
# out_dir:  directory that receives the JSON / CSV result files.
data_dir, out_dir = (
    Path("../../data/d2_0001-0100_without_marginalia"),
    Path("../../data/json_csv"),
)

#### Run analysis

Iterate over all XML files, process each page, and save the results as JSON and/or CSV files. Pages whose JSON and CSV results both already exist are skipped.

In [None]:
# Make sure the output directory exists before any result file is written.
out_dir.mkdir(parents=True, exist_ok=True)

# Path.glob yields entries in arbitrary order; sorting gives a reproducible
# processing order (and log output) between runs.
for xml_file in sorted(data_dir.glob("*.xml")):
    page_base = xml_file.stem
    json_file = out_dir / f"{page_base}.json"
    csv_file = out_dir / f"{page_base}.csv"
    # Resume support: a page is skipped only when BOTH result files exist,
    # so a partially written page is processed again.
    if json_file.exists() and csv_file.exists():
        print(f"Skipping {xml_file.name} because results already exist.")
        continue
    # Initialize the Score Calculator
    # chunk_size and overlap_size can be adjusted
    # NOTE(review): a new calculator is built for every page, as before;
    # whether it keeps per-page state is not visible here, so it is not hoisted.
    calculator = ErrorsCalculator(
        model=lama_model, tokenizer=lama_tokenizer, chunk_size=20, overlap_size=10
    )
    try:
        json_data, frame = calculator.process_page(xml_file)
        # save the results as json
        calculator.save_json(
            data=json_data,
            out_dir=out_dir,
            file_name=json_file.name,
        )
        # save the results as csv
        calculator.save_dataframe(
            df=frame,
            out_dir=out_dir,
            file_name=csv_file.name,
        )
    except Exception as e:
        # Best effort: report the failing page and move on to the next one.
        # The exception type is included because str(e) alone can be
        # uninformative (e.g. a KeyError prints only the missing key).
        print(f"Error processing {xml_file}: {type(e).__name__}: {e}")