# per-line transcription with LLM & OCR

In [1]:
from openai import OpenAI
from anthropic import Anthropic
import os
import re
import pandas as pd
from dotenv import load_dotenv
import requests
import base64
import subprocess
from IPython.display import display, Image
from PIL import Image as PILImage

In [2]:
import cv2
import easyocr
import matplotlib.pyplot as plt
import pytesseract
import keras_ocr

In [3]:
path = os.path.dirname(os.getcwd()) # Parent directory
image_folder = path+'/data/lines'

In [34]:
# OpenAI
load_dotenv() #get the environment 
openai_API_KEY = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=openai_API_KEY)

In [35]:
anthropic_API_KEY = os.getenv("ANTHROPIC_API_KEY")
anthropic_client = Anthropic(api_key=anthropic_API_KEY)
MODEL_NAME = "claude-3-5-sonnet-20240620"

## Read and encode the images

In [6]:
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

In [7]:
images = []
for image in os.listdir(image_folder):
    if image.endswith('.jpg'):
        images.append(image)

rows = []
for image in images:
    name = image.split('.')[0]
    name_split = name.split('_')[0]
    file_name = name_split.split('example')[1]
    line_name = name.split('_')[1]
    encoded_value = encode_image(image_folder+'/'+image)
    rows.append({'file': file_name, 'line': line_name, 'encoded': encoded_value})

images_encoded = pd.DataFrame(rows)

In [8]:
images_encoded['file'] = images_encoded['file'].astype('int')
images_encoded['line'] = images_encoded['line'].astype('int')
images_encoded = images_encoded.sort_values(by=['file', 'line']).reset_index(drop=True)
images_encoded['id'] = images_encoded['file'].astype(str) + '_' + images_encoded['line'].astype(str)
images_encoded.head(30)

Unnamed: 0,file,line,encoded,id
0,1,0,/9j/4QczRXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_0
1,1,1,/9j/4QZBRXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_1
2,1,2,/9j/4QX5RXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_2
3,1,3,/9j/4QYvRXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_3
4,1,4,/9j/4QZLRXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_4
5,1,5,/9j/4QVaRXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_5
6,1,6,/9j/4QVqRXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_6
7,1,7,/9j/4QX5RXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_7
8,1,8,/9j/4QWRRXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_8
9,1,9,/9j/4QZyRXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_9


# LLMs

## General API Calls

In [86]:
def callOpenAI(prompt, max_tokens=800, base64_image=None):
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_API_KEY}"
    } 
    model_vision = "gpt-4o"
    payload = {
        "model": model_vision, 
        "messages": [
        {
            "role": "system",
            "content": "You are a helpful assistant who can read old handwriting with a background in history, and you are going to recreate a scanned déclaration de succession from Belgium in a txt format."
            
        },
        {
            "role": "user",
            "content": [
            {
                "type": "text",
                "text": prompt
            },
            {
                "type": "image_url",
                "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image}"
                }
            }
            ]
        }
        ],
        "max_tokens": max_tokens,
        "temperature": 0
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    try:
        return response.json()["choices"][0]["message"]["content"]
    except:
        print(response.json()["error"]["message"])

In [87]:
def callAnthropic(prompt, max_tokens=5000, base64_image=None):
    response = anthropic_client.messages.create(
        model=MODEL_NAME,
        max_tokens=max_tokens,
        system = "You are a helpful assistant who can read old handwriting with a background in history, and you are going to recreate a scanned déclaration de succession from Belgium in a txt format.",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "image", 
                        "source": {
                            "type": "base64", 
                            "media_type": "image/jpeg", 
                            "data": base64_image}},
                    {
                        "type": "text",
                        "text": prompt,
                    }
                ],
            }
        ],
        temperature=0,
    )
    return response.to_dict()["content"][0]["text"]

In [88]:
def callPostProcessing(max_tokens=800, prompt_parameter = None):
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_API_KEY}"
    } 
    payload = {
        "model": "gpt-4o",
        "messages": [
        {
            "role": "user",
            "content": [
            {
                "type": "text",
                "text": f"""This is an output from you. Clean it such that we have no separators and no comment from you: {prompt_parameter}
                """
            }
            ]
        }
        ],
        "max_tokens": max_tokens,
        "temperature": 0
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    try:
        return response.json()["choices"][0]["message"]["content"]
    except:
        print(response.json()["error"]["message"])


In [89]:
# use this when OpenAI credits are exhausted
def callPostProcessing_anthropic(max_tokens=5000, prompt_parameter = None):
    response = anthropic_client.messages.create(
        model=MODEL_NAME,
        max_tokens=max_tokens,
        system = "You are a helpful assistant who can read old handwriting with a background in history, and you are going to recreate a scanned déclaration de succession from Belgium in a txt format.",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""This is an output from you. Clean it such that we have no separators and no comment from you: {prompt_parameter}
                """
                    }
                ],
            }
        ],
        temperature=0,
    )
    return response.to_dict()["content"][0]["text"]

### Zero-shot

In [85]:
prompt = """
    Recognize the text from the image:
    ```plaintext
"""


prompt_complex = """
    Context:
        It's an old Belgian document. And you're getting one row of a table from it. It's written in French language and the names of the people are domiciles are Belgian.

    Structure:
        The table is structured with the two-level headers as follows:
        [("N' d'ordre", " "),
                ("Date du dépot des déclarations", " "),
                ("Désignation des personnes décédées ou absentes.:", "Nom."),
                ("Désignation des personnes décédées ou absentes.:", "Prénoms"),
                ("Désignation des personnes décédées ou absentes.:", "Domiciles"), 
                ("Date du décès ou du judgement d'envoi en possession, en cas d'absence.", " "),
                ("Noms, Prénoms et demeures des parties déclarantes.", " "),
                ("Droits de succession en ligne collatérale et de mutation en ligne directe.", "Actif. (2)"),
                ("Droits de succession en ligne collatérale et de mutation en ligne directe.", "Passif. (2)"),
                ("Droits de succession en ligne collatérale et de mutation en ligne directe.", "Restant NET. (2)"),
                ("Droit de mutation par décès", "Valeur des immeubles. (2)"), 
                ("Numéros des déclarations", "Primitives."),
                ("Numéros des déclarations", "Supplémentaires."), 
                ("Date", "de l'expiration du délai de rectification."),
                ("Date", "de l'exigibilité des droits."),
                ("Numéros de la consignation des droits au sommier n' 28", " "),
                ("Recette des droits et amendes.", "Date"),
                ("Recette des droits et amendes.", "N^03"),
                ("Cautionnements. ", "Numéros de la consignation au sommier n'30"),
                ("Observations (les déclarations qui figurent à l'état n'413 doivent être émargées en conséquence, dans la présente colonne.)", " ")] 

        Some image (hence, some rows) may start with "Arrêté le \d{2} \w+ \d{4}( \w+)? servais" or contain notes.

    Task:
        Recognize the text from the image. Pay attention to reading each word and number correctly. Return the text as you read it and you must read the text from the image since the image contains texts.
    ```plaintext 
"""

  prompt_complex = """


In [70]:
header_ids = ['1_0', '2_0', '3_0', '4_0', '5_0', '6_0', '7_0', '8_0', '9_0', '10_0',
              '11_0', '12_0', '13_0', '14_0', '15_0', '16_0', '17_0', '18_0', '19_0', '20_0']
typo_ids = ['4_1', '4_7', '8_2', '8_5', '8_10', '13_9', '16_2']

In [114]:
unable_ids

['18_0', '19_0']

In [115]:
import time
import json

# Load previous progress if available
try:
    with open('claude_complex_output_progress.json', 'r') as file:
        claude_complex_output = json.load(file)
except FileNotFoundError:
    claude_complex_output = {}

# Loop through each unique id
# for id in images_encoded['id'].unique():
# for id in header_ids+typo_ids:
for id in unable_ids:
    # Check if this ID is already processed (Skip this step if you want to re-process for unable_ids) ----------------
    # if id in claude_complex_output:
    #     print(f"Skipping {id}, already processed.")
    #     continue
    
    start_time = time.time()
    try:
        print(f'------- Start processing file {id} -------')
        
        # Call OpenAI/Anthropic and post-processing functions
        prompt_complex += "Even if it is hard to read the texts from the image, return as much as you can. You must read something. Do not return an apologetic message."
        # output = callOpenAI(prompt=prompt_complex, max_tokens=800, base64_image=images_encoded[(images_encoded['id'] == id)].encoded.values[0])
        output = callAnthropic(prompt=prompt_complex, max_tokens=800, base64_image=images_encoded[(images_encoded['id'] == id)].encoded.values[0])
        output_cleaned = callPostProcessing(prompt_parameter=output)
        
        # Save the output
        claude_complex_output[id] = output_cleaned
        
        # Save progress after each file
        with open('claude_complex_output_progress.json', 'w') as file:
            json.dump(claude_complex_output, file)
        
        print(f'------- Finished processing file {id} in {time.time() - start_time} seconds -------')

    except Exception as e:
        print(f"Error processing file {id}: {str(e)}")
        # Save the progress and exit the loop in case of an error
        with open('claude_complex_output_progress.json', 'w') as file:
            json.dump(claude_complex_output, file)
        break  # Exit the loop on error

# Once all IDs are processed, save the final result
with open('claude_complex_output_final.json', 'w') as file:
    json.dump(claude_complex_output, file)

print("Processing complete.")


------- Start processing file 18_0 -------
------- Finished processing file 18_0 in 5.452270030975342 seconds -------
------- Start processing file 19_0 -------
------- Finished processing file 19_0 in 5.630033016204834 seconds -------
Processing complete.


### Few-shots

In [118]:
df = pd.read_csv(path+'/data/transcriptions_perline_cleaned.csv', encoding='utf-8')
df.replace({u'\xa0': ' '}, regex=True, inplace=True)

In [119]:
example1 = images_encoded[images_encoded['id'] == '1_1'].encoded.values[0]
example2 = images_encoded[images_encoded['id'] == '1_3'].encoded.values[0]

In [120]:
images_encoded_oneshot = images_encoded[~images_encoded['id'].isin(['1_1'])]
images_encoded_twoshot = images_encoded[~images_encoded['id'].isin(['1_1', '1_3'])]

In [121]:
example1_text = df[df['id'] == '1_1'].text.values[0]
example2_text = df[df['id'] == '1_3'].text.values[0]

In [122]:
example_texts =  [example1_text,example2_text]

In [123]:
example_texts

['Arrêté le vingt huit octobre 1919 servais',
 '398 trente octobre Herrent Alphones gh Ophain 16 9b 1918 Herrent Maris & autres 2280 1045 1235 11 Db 1919 15 7bre 1919  7 avril 1920 303']

In [124]:
images_encoded_extexts = images_encoded[~images_encoded['id'].isin(['1_1', '1_3'])]

In [125]:
# prompt_example =  """
#     Recognize the texts from the image like the examples.
#     ```plaintext
#     """

In [126]:
# example1_text or exmple_texts
prompt_example_text = f"""
                        The ```plaintext block is the example transcription of the example image you saw:

                        Transcription:
                        ```plaintext
                        {example_texts}
                        ```
                        Compare what you read initially and the solution key in ```plaintext block. Recreate the content of the table in this image. Only that, no other information from you.

                        """


In [127]:
def callOpenAI_example(prompt, NExample=1, base64_image=None, max_tokens=5000):
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_API_KEY}"
    } 
    model_vision = "gpt-4o"

    if NExample == 1:
        payload = {
            "model": model_vision, 
            "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant who can read old handwriting with a background in history, and you are going to recreate a scanned déclaration de succession from Belgium in a txt format."
                
            },
            {
                "role": "user",
                "content": [
                {
                    "type": "image_url",
                    "image_url": {
                    "url": f"data:image/jpeg;base64,{example1}"
                    }
                },
                {
                    "type": "text",
                    "text": example1_text
                },
                {
                    "type": "text",
                    "text": prompt
                },
                {
                    "type": "image_url",
                    "image_url": {
                    "url": f"data:image/jpeg;base64,{base64_image}"
                    }
                }
                ]
            }
            ],
            "max_tokens": max_tokens,
            "temperature": 0
        }
    if NExample == 2:
               payload = {
            "model": model_vision, 
            "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant who can read old handwriting with a background in history, and you are going to recreate a scanned déclaration de succession from Belgium in a txt format."
                
            },
            {
                "role": "user",
                "content": [
                {
                    "type": "image_url",
                    "image_url": {
                    "url": f"data:image/jpeg;base64,{example1}"
                    }
                },
                {
                    "type": "text",
                    "text": example1_text
                },
                {
                    "type": "image_url",
                    "image_url": {
                    "url": f"data:image/jpeg;base64,{example2}"
                    }
                },
                {
                    "type": "text",
                    "text": example2_text
                },
                {
                    "type": "text",
                    "text": prompt
                },
                {
                    "type": "image_url",
                    "image_url": {
                    "url": f"data:image/jpeg;base64,{base64_image}"
                    }
                }
                ]
            }
            ],
            "max_tokens": max_tokens,
            "temperature": 0
        }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    try:
        return response.json()["choices"][0]["message"]["content"]
    except:
        print(response.json()["error"]["message"])

In [128]:
def callAnthropic_example(prompt, NExample=1, base64_image=None, max_tokens=5000):
    if NExample == 1:
        response = anthropic_client.messages.create(
            model=MODEL_NAME,
            max_tokens=max_tokens,
            system = "You are a helpful assistant who can read old handwriting with a background in history, and you are going to recreate a scanned déclaration de succession from Belgium in a txt format.",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "image", 
                            "source": {
                                "type": "base64", 
                                "media_type": "image/jpeg", 
                                "data": example1}},
                        {
                            "type": "text",
                            "text": example1_text,
                        },
                        {
                            "type": "text",
                            "text": prompt,
                        },
                        {"type": "image", 
                            "source": {
                                "type": "base64", 
                                "media_type": "image/jpeg", 
                                "data": base64_image}}
                    ],
                }
            ],
            temperature=0,
        )
        
    if NExample == 2:
        response = anthropic_client.messages.create(
            model=MODEL_NAME,
            max_tokens=max_tokens,
            system = "You are a helpful assistant who can read old handwriting with a background in history, and you are going to recreate a scanned déclaration de succession from Belgium in a txt format.",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "image", 
                            "source": {
                                "type": "base64", 
                                "media_type": "image/jpeg", 
                                "data": example1}},
                        {
                            "type": "text",
                            "text": example1_text,
                        },
                        {"type": "image", 
                            "source": {
                                "type": "base64", 
                                "media_type": "image/jpeg", 
                                "data": example2}},
                        {
                            "type": "text",
                            "text": example2_text,
                        },
                        {
                            "type": "text",
                            "text": prompt,
                        },
                        {"type": "image", 
                            "source": {
                                "type": "base64", 
                                "media_type": "image/jpeg", 
                                "data": base64_image}}
                    ],
                }
            ],
            temperature=0,
        )
    return response.to_dict()["content"][0]["text"]

In [129]:
images_encoded_twoshot

Unnamed: 0,file,line,encoded,id
0,1,0,/9j/4QczRXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_0
2,1,2,/9j/4QX5RXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_2
4,1,4,/9j/4QZLRXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_4
5,1,5,/9j/4QVaRXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_5
6,1,6,/9j/4QVqRXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_6
...,...,...,...,...
278,20,9,/9j/4QY7RXhpZgAATU0AKgAAAAgADAEAAAMAAAABFkAAAA...,20_9
279,20,10,/9j/4QY3RXhpZgAATU0AKgAAAAgADAEAAAMAAAABFkAAAA...,20_10
280,20,11,/9j/4QWhRXhpZgAATU0AKgAAAAgADAEAAAMAAAABFkAAAA...,20_11
281,20,12,/9j/4QUtRXhpZgAATU0AKgAAAAgADAEAAAMAAAABFkAAAA...,20_12


In [134]:
images_encoded_twoshot

Unnamed: 0,file,line,encoded,id
0,1,0,/9j/4QczRXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_0
2,1,2,/9j/4QX5RXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_2
4,1,4,/9j/4QZLRXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_4
5,1,5,/9j/4QVaRXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_5
6,1,6,/9j/4QVqRXhpZgAATU0AKgAAAAgADQEAAAMAAAABD5YAAA...,1_6
...,...,...,...,...
278,20,9,/9j/4QY7RXhpZgAATU0AKgAAAAgADAEAAAMAAAABFkAAAA...,20_9
279,20,10,/9j/4QY3RXhpZgAATU0AKgAAAAgADAEAAAMAAAABFkAAAA...,20_10
280,20,11,/9j/4QWhRXhpZgAATU0AKgAAAAgADAEAAAMAAAABFkAAAA...,20_11
281,20,12,/9j/4QUtRXhpZgAATU0AKgAAAAgADAEAAAMAAAABFkAAAA...,20_12


In [None]:
# Load previous progress if available
try:
    with open('gpt_two_example_output_progress.json', 'r') as file:
        gpt_two_example_output = json.load(file)
except FileNotFoundError:
    gpt_two_example_output = {}

# Loop through each unique id
for id in images_encoded_twoshot['id'].unique():
# for id in unable_ids:
    # Check if this ID is already processed (Skip this step if you want to re-process for unable_ids) ----------------
    if id in gpt_two_example_output:
        print(f"Skipping {id}, already processed.")
        continue
    
    start_time = time.time()
    try:
        print(f'------- Start processing file {id} -------')
        
        # Call OpenAI/Anthropic and post-processing functions
        prompt_example_text += "Even if it is hard to read the texts from the image, return as much as you can. You must read something. Do not return an apologetic message."
        output = callOpenAI_example(prompt=prompt_example_text, NExample=2, base64_image=images_encoded_twoshot[(images_encoded_twoshot['id'] == id)].encoded.values[0], max_tokens=800)
        # output = callAnthropic_example(prompt=prompt_example_text, NExample=1, base64_image=images_encoded_oneshot[(images_encoded_oneshot['id'] == id)].encoded.values[0], max_tokens=800)
        output_cleaned = callPostProcessing(prompt_parameter=output)
        
        # Save the output
        gpt_two_example_output[id] = output_cleaned
        
        # Save progress after each file
        with open('gpt_two_example_output_progress.json', 'w') as file:
            json.dump(gpt_two_example_output, file)
        
        print(f'------- Finished processing file {id} in {time.time() - start_time} seconds -------')

    except Exception as e:
        print(f"Error processing file {id}: {str(e)}")
        # Save the progress and exit the loop in case of an error
        with open('gpt_two_example_output_progress.json', 'w') as file:
            json.dump(gpt_two_example_output, file)
        break  # Exit the loop on error

# Once all IDs are processed, save the final result
with open('gpt_two_example_output_final.json', 'w') as file:
    json.dump(gpt_two_example_output, file)

print("Processing complete.")


------- Start processing file 1_0 -------
------- Finished processing file 1_0 in 12.108368158340454 seconds -------
------- Start processing file 1_2 -------
------- Finished processing file 1_2 in 3.0430891513824463 seconds -------
------- Start processing file 1_4 -------
------- Finished processing file 1_4 in 4.388018846511841 seconds -------
------- Start processing file 1_5 -------
------- Finished processing file 1_5 in 3.7039988040924072 seconds -------
------- Start processing file 1_6 -------
------- Finished processing file 1_6 in 4.339546203613281 seconds -------
------- Start processing file 1_7 -------
------- Finished processing file 1_7 in 6.516517162322998 seconds -------
------- Start processing file 1_8 -------
------- Finished processing file 1_8 in 3.1763272285461426 seconds -------
------- Start processing file 1_9 -------
------- Finished processing file 1_9 in 4.989033937454224 seconds -------
------- Start processing file 1_10 -------
------- Finished processi

### Refine

In [71]:
# gpt_simple = pd.read_csv(path+'/results/postprocessed/gpt_perline_output.csv')
# claude_simple =  pd.read_csv(path+'/results/postprocessed/claude_perline_output.csv')
gpt_complex = pd.read_csv(path+'/results/postprocessed/gpt_complex_perline_output2.csv')
claude_complex =  pd.read_csv(path+'/results/postprocessed/claude_complex_perline_output2.csv')

In [77]:
import time
import json

# Load previous progress if available
try:
    with open('claude_refine_complex_output_progress.json', 'r') as file:
        claude_refine_complex_output = json.load(file)
except FileNotFoundError:
    claude_refine_complex_output = {}

# Loop through each unique id
for id in images_encoded['id'].unique():
# for id in header_ids+typo_ids:
# for id in unable_ids:
    # Check if this ID is already processed (Skip this step if you want to re-process for unable_ids) ----------------
    if id in claude_refine_complex_output:
        print(f"Skipping {id}, already processed.")
        continue
    
    start_time = time.time()
    try:
        print(f'------- Start processing file {id} -------')
        
        # Call OpenAI/Anthropic and post-processing functions
        response_text = claude_complex[claude_complex['id'] == id].text.values[0]
        prompt_refine = f"""
        
        Your first draft:
        ```plaintext
        {response_text}
        ```

        Errors: 
        Your first transcription you made in ```plaintext block contains some errors.
        
        Task:
        Refine your first trasncription in ```plaintext block. 
        Make sure to read the names of the people and the location as well as the dates and the numbers correctly.
        Transcribe as you see in the image.
        ```plaintext
        """

        prompt_refine += "Even if it is hard to read the texts from the image, return as much as you can. You must read something. Do not return an apologetic message."
        # output = callOpenAI(prompt=prompt_refine, base64_image=images_encoded[(images_encoded['id'] == id)].encoded.values[0], max_tokens=800)
        output = callAnthropic(prompt=prompt_refine, base64_image=images_encoded[(images_encoded['id'] == id)].encoded.values[0], max_tokens=800)
        output_cleaned = callPostProcessing(prompt_parameter=output)
        
        # Save the output
        claude_refine_complex_output[id] = output_cleaned
        
        # Save progress after each file
        with open('claude_refine_complex_output_progress.json', 'w') as file:
            json.dump(claude_refine_complex_output, file)
        
        print(f'------- Finished processing file {id} in {time.time() - start_time} seconds -------')

    except Exception as e:
        print(f"Error processing file {id}: {str(e)}")
        # Save the progress and exit the loop in case of an error
        with open('claude_refine_complex_output_progress.json', 'w') as file:
            json.dump(claude_refine_complex_output, file)
        break  # Exit the loop on error

# Once all IDs are processed, save the final result
with open('claude_refine_complex_output_final.json', 'w') as file:
    json.dump(claude_refine_complex_output, file)

print("Processing complete.")


Skipping 1_0, already processed.
Skipping 1_1, already processed.
Skipping 1_2, already processed.
Skipping 1_3, already processed.
Skipping 1_4, already processed.
Skipping 1_5, already processed.
Skipping 1_6, already processed.
Skipping 1_7, already processed.
Skipping 1_8, already processed.
Skipping 1_9, already processed.
Skipping 1_10, already processed.
Skipping 1_11, already processed.
Skipping 1_12, already processed.
Skipping 1_13, already processed.
Skipping 2_0, already processed.
Skipping 2_1, already processed.
Skipping 2_2, already processed.
Skipping 2_3, already processed.
Skipping 2_4, already processed.
Skipping 2_5, already processed.
Skipping 2_6, already processed.
Skipping 2_7, already processed.
Skipping 2_8, already processed.
Skipping 2_9, already processed.
Skipping 2_10, already processed.
Skipping 2_11, already processed.
Skipping 2_12, already processed.
Skipping 2_13, already processed.
Skipping 2_14, already processed.
Skipping 3_0, already processed.
S

### Outputs

In [133]:
with open('gpt_two_example_output_progress.json', 'r') as file:
        gpt_two_example_output = json.load(file)
# claude_complex_output
gpt_two_example_output

{'1_0': 'N° DATE DU DÉPÔT DÉSIGNATION DES PERSONNES DÉCÉDÉES, OU ABSENTES. DATE DU DÉCÈS NOMS, PRÉNOMS DROITS DE SUCCESSION DROIT DE MUTATION NUMÉROS DATE NUMÉROS OBSERVATIONS. d’ordre des NOMS. PRÉNOMS DOMICILES ou du et de des de la DÉCLARATIONS. JUGEMENT D’ENVOI DEMEURES DES PARTIES DÉCLARANTES. MUTATION EN LIGNE DIRECTE. par décès. DÉCLARATIONS en possession, ACTIF. PASSIF. RESTANT NET. VALEUR des IMMEUBLES. des de l’expiration de la consignation des DROITS ET AMENDES. CAUTIONNEMENTS. de la consignation en cas d’absence. (2) (2) (2) (2) DÉCLARATIONS de l’exigibilité des droits DATE N° au sommaire n° 25. au sommaire n° 30.',
 '1_3': '398 trente octobre Herrent Alphonse fils Oplinter 29 7bre 1919 Heverlee Louis et autres 2230 504 225 11 7bre 1919 Hasselt 363',
 '1_4': "398 bis 2 Lesévre Jules Braîne-l'Alleud 5 janvier 1919 Brodis Thérèse 2222 2222 236 1919",
 '1_5': 'Arrêté le trente octobre 1919 Servais',
 '1_6': 'Arrêté le trente un octobre 1919 Servais',
 '1_7': 'Arrêté le premier

In [117]:
unable_ids = [id for id, content in claude_complex_output.items() if "unable" in content or "I apologize" in content or "The image" in content or "sorry" in content]
print(unable_ids, len(unable_ids), sep='\n')

['18_0', '19_0']
2


### To run with the saved json

In [82]:
claude_refine_complex_output_df = pd.DataFrame(claude_refine_complex_output.items(), columns=['id', 'text'])
claude_refine_complex_output_df['text'] = claude_refine_complex_output_df['text'].replace(['\n', '\t'], ' ', regex=True)
claude_refine_complex_output_df

Unnamed: 0,id,text
0,1_0,N° DATE DU DÉPÔT des déclarations DÉSIGNATION ...
1,2_0,N° DATE DU DÉPÔT DÉSIGNATION DES PERSONNES DÉC...
2,3_0,N° DATE DU DÉPÔT DÉSIGNATION DES PERSONNES DÉC...
3,4_0,DÉSIGNATION DES PERSONNES DÉCÉDÉES OU ABSENTES...
4,5_0,N° DATE DU DÉPÔT DÉSIGNATION DES PERSONNES DÉC...
...,...,...
278,20_9,10 décembre 30 Pétriaux Camile Morville 22 avr...
279,20_10,10² 5 Dubois Alexandre épicier 5/8/1919 Dubois...
280,20_11,Arrêté le dix neuf février 1920 Servais
281,20_12,Arrêté le vingt février 1920 Servais


In [83]:
claude_refine_complex_output_df.to_csv(path+'/results/postprocessed/claude_refine_complex_perline_output2.csv', index=False)

# CER/BLEU calculation

## ground truth df

In [110]:
from glob import glob

text_path = path+'/data/transcriptions'
file_list = glob(os.path.join(text_path, 'transcription_ex*.txt'))

df_list = []

for file in file_list:
    with open(file, 'r', encoding='utf-8') as f:
        content = f.read()
    lines = content.split('\n')

    df = pd.DataFrame({
        'line': range(0, len(lines)),  # Line numbers starting from 0
        'text': lines
    })
    
    name = os.path.basename(file)
    name = name.split('.')[0]
    df['file'] = name.split('ex')[1]
    df['file'] = df['file'].astype(int)
    df_list.append(df)
df = pd.concat(df_list, ignore_index=True)

In [111]:
df['text'] = df['text'].replace(['\n', '\t'], ' ', regex=True)
df = df.sort_values(by=['file', 'line']).reset_index(drop=True)
df

Unnamed: 0,line,text,file
0,0,N' d'ordre Date du dépot des déclarations Dési...,1
1,1,Nom. Prénoms Domiciles Actif. (2) Passif. (2) ...,1
2,2,arrêté le vingt huit octobre 1919 servais ...,1
3,3,arrêté le vingt neuf octobre 1919 servais ...,1
4,4,398 trente octobre Herrent Alphones gh Ophain ...,1
...,...,...,...
298,10,19 dix neuf d Pétriaux Coralie Nivelles 22 av...,20
299,11,19^2 d Dubois Alexandre Quenast 7b 1919 Dubois...,20
300,12,Arrêté le dix neuf février 1920 servais,20
301,13,Arrêté le vingt février 1920 servais,20


In [112]:
# Merge the text values of line number 0 and 1 (the two lines of the header)
for file in df['file'].unique():
    header_lines = df[(df['file'] == file) & (df['line'].isin([0, 1]))]
    df.loc[header_lines.index[0], 'text'] = header_lines.iloc[0]['text'] + " " + header_lines.iloc[1]['text']
df = df[df['line'] != 1].reset_index(drop=True)

In [113]:
df.loc[df['line'] != 0, 'line'] -= 1  # Adjust line numbers after removing the second line of the header

In [30]:
# # for file 6, two lines are used for some column.. we need to merge them
# doubled_line = df[(df['file'] == 6) & (df['line'].isin([3, 4]))]
# df.loc[doubled_line.index[0], 'text'] = doubled_line.iloc[0]['text'] + " " + doubled_line.iloc[1]['text']
# df.drop(doubled_line.index[1], inplace=True)
# df.loc[(df['file'] == 6) & (df['line'] > 4), 'line'] -= 1


In [114]:
df[df['file']==1]

Unnamed: 0,line,text,file
0,0,N' d'ordre Date du dépot des déclarations Dési...,1
1,1,arrêté le vingt huit octobre 1919 servais ...,1
2,2,arrêté le vingt neuf octobre 1919 servais ...,1
3,3,398 trente octobre Herrent Alphones gh Ophain ...,1
4,4,398^2 d Lefévre Jules Braine l'Alleud 8 Janvie...,1
5,5,arrêté le trente octobre 1919 servais ...,1
6,6,arrêté le trente un octobre 1919 servais ...,1
7,7,arrêté le premier novembre 1919 Toussaint ser...,1
8,8,arrêté le deux novembre 1919 Dimanche servais...,1
9,9,399 trois 9bre Desmedt Jeanne Nivelles 13 mai ...,1


In [115]:
df['id'] = df['file'].astype(str) + '_' + df['line'].astype(str)
df

Unnamed: 0,line,text,file,id
0,0,N' d'ordre Date du dépot des déclarations Dési...,1,1_0
1,1,arrêté le vingt huit octobre 1919 servais ...,1,1_1
2,2,arrêté le vingt neuf octobre 1919 servais ...,1,1_2
3,3,398 trente octobre Herrent Alphones gh Ophain ...,1,1_3
4,4,398^2 d Lefévre Jules Braine l'Alleud 8 Janvie...,1,1_4
...,...,...,...,...
278,9,19 dix neuf d Pétriaux Coralie Nivelles 22 av...,20,20_9
279,10,19^2 d Dubois Alexandre Quenast 7b 1919 Dubois...,20,20_10
280,11,Arrêté le dix neuf février 1920 servais,20,20_11
281,12,Arrêté le vingt février 1920 servais,20,20_12


In [116]:
for file in df['file'].unique():
    last_line = df[df['file'] == file]['line'].max()
    print(f"File: {file}, Last Line: {last_line}")

File: 1, Last Line: 13
File: 2, Last Line: 14
File: 3, Last Line: 13
File: 4, Last Line: 13
File: 5, Last Line: 14
File: 6, Last Line: 14
File: 7, Last Line: 13
File: 8, Last Line: 13
File: 9, Last Line: 13
File: 10, Last Line: 13
File: 11, Last Line: 13
File: 12, Last Line: 13
File: 13, Last Line: 13
File: 14, Last Line: 13
File: 15, Last Line: 13
File: 16, Last Line: 13
File: 17, Last Line: 13
File: 18, Last Line: 13
File: 19, Last Line: 13
File: 20, Last Line: 13


In [117]:
df.to_csv(path+'/data/transcription_perline_text.csv', index=False)

In [88]:
print(df['id'].nunique(), claude_output_df['id'].nunique())

283 283


## Calculate

In [6]:
df = pd.read_csv(path+'/data/transcription_perline_text_whitespace-trimmed.csv')

In [7]:
from evaluate import load

cer_metric =load("cer")
bleu_metric = load("bleu")  

In [9]:
from glob import glob

files = glob(os.path.join(path+'/results/postprocessed/per-line_experiments', '*.csv'))

In [8]:

bleu_gpt = {}
cer_gpt = {}

for id in df_filtered['id'].unique():
    # Extract the text as a single string, not as an array
    pred_text = pred[pred['id'] == id]['text'].values[0]
    ref_text = df_filtered[df_filtered['id'] == id]['text'].values[0]

    # Ensure the predictions and references are passed as a list of strings
    if pred_text and ref_text:  # Check if both texts are not empty (which happens for some OCR outputs)
        bleu_gpt[id] = bleu_metric.compute(predictions=[pred_text], references=[ref_text])
    else:
        bleu_gpt[id] = {'bleu': 0.0}  # Assign a default value if texts are empty
    cer_gpt[id] = cer_metric.compute(predictions=[pred_text], references=[ref_text])



In [10]:
files

['/Users/serenekim/Desktop/PhD/img-analysis_seorin_project/results/postprocessed/per-line_experiments/gpt_one_example_perline_output.csv',
 '/Users/serenekim/Desktop/PhD/img-analysis_seorin_project/results/postprocessed/per-line_experiments/claude_two_example_perline_output.csv',
 '/Users/serenekim/Desktop/PhD/img-analysis_seorin_project/results/postprocessed/per-line_experiments/pytesseractOCR_perline_output.csv',
 '/Users/serenekim/Desktop/PhD/img-analysis_seorin_project/results/postprocessed/per-line_experiments/claude_two_text_example_perline_output.csv',
 '/Users/serenekim/Desktop/PhD/img-analysis_seorin_project/results/postprocessed/per-line_experiments/claude_complex_perline_output.csv',
 '/Users/serenekim/Desktop/PhD/img-analysis_seorin_project/results/postprocessed/per-line_experiments/gpt_one_text_example_perline_output.csv',
 '/Users/serenekim/Desktop/PhD/img-analysis_seorin_project/results/postprocessed/per-line_experiments/claude_refine_perline_output.csv',
 '/Users/serene

In [86]:
import unidecode

bleu_perline = pd.DataFrame()
cer_perline = pd.DataFrame()

for file in files:
    pred = pd.read_csv(file)
    df_filtered = df[df['id'].isin(pred['id'])]

    name = os.path.basename(file)
    name = name.split('_perline')[0]

    print(f"Processing {name}...")

    bleu_scores = []  # List to accumulate BLEU scores for this model
    cer_scores = []  # List to accumulate CER scores for this model

    for id in df_filtered['id'].unique():
        # Extract the text as a single string, not as an array
        pred_text = pred[pred['id'] == id]['text'].values
        ref_text = df_filtered[df_filtered['id'] == id]['text'].values

        # Ensure the predictions and references are passed as a list of strings
        if len(pred_text) > 0 and len(ref_text) > 0:  # Check if both texts are not empty
            pred_text = pred_text[0]
            ref_text = ref_text[0]

            # Check for NaN values 
            if pd.notna(pred_text) and pd.notna(ref_text):
                # Strip white spaces
                pred_text = pred_text.strip()
                ref_text = ref_text.strip()
                # Normalize: uncapitalize and remove accents (Try 3 different normalizations)
                # pred_text = pred_text.lower()
                # ref_text = ref_text.lower()
                # pred_text = unidecode.unidecode(pred_text)
                # ref_text = unidecode.unidecode(ref_text)
                # pred_text = unidecode.unidecode(pred_text).lower()
                # ref_text = unidecode.unidecode(ref_text).lower()

                # Ensure texts are not empty after stripping
                if pred_text and ref_text:
                    bleu_metrics = bleu_metric.compute(predictions=[pred_text], references=[ref_text], max_order=3)
                    cer_metrics = cer_metric.compute(predictions=[pred_text], references=[ref_text])
                else:
                    bleu_metrics = {'bleu': 0.0}  # Assign a default value if texts are empty
                    cer_metrics = 1.0
            else:
                bleu_metrics = {'bleu': 0.0}  # Assign a default value if texts are NaN
                cer_metrics = 1.0
        else:
            bleu_metrics = {'bleu': 0.0}  # Assign a default value if texts are empty
            cer_metrics = 1.0

        bleu_scores.append({
                'model': name,
                'id': id,
                **bleu_metrics
            })
        cer_scores.append({
                'model': name,
                'id': id,
                'cer': cer_metrics
            })

    bleu_perline = pd.concat([bleu_perline, pd.DataFrame(bleu_scores)], ignore_index=True)
    cer_perline = pd.concat([cer_perline, pd.DataFrame(cer_scores)], ignore_index=True)


Processing gpt_one_example...
Processing claude_two_example...
Processing pytesseractOCR...
Processing claude_two_text_example...
Processing claude_complex...
Processing gpt_one_text_example...
Processing claude_refine...
Processing gpt_refine_complex_output...
Processing claude_refine_complex_output...
Processing gpt...
Processing kerasOCR...
Processing trOCR...
Processing claude...
Processing gpt_refine...
Processing claude_one_example...
Processing gpt_two_example...
Processing gpt_two_text_example...
Processing gpt_complex...
Processing easyOCR...
Processing claude_one_text_example...


In [87]:
cer_perline

Unnamed: 0,model,id,cer
0,gpt_one_example,1_0,0.958180
1,gpt_one_example,1_2,0.121951
2,gpt_one_example,1_3,0.822222
3,gpt_one_example,1_4,0.804598
4,gpt_one_example,1_5,0.270270
...,...,...,...
5643,claude_one_text_example,20_9,0.341463
5644,claude_one_text_example,20_10,0.448276
5645,claude_one_text_example,20_11,0.025641
5646,claude_one_text_example,20_12,0.388889


In [88]:
bleu_perline.replace({'model': {'gpt': 'gpt_simple', 'claude': 'claude_simple',
                               'trOCR': 'TrOCR', 'pytesseractOCR': 'Pytesseract',
                               'kerasOCR': 'KerasOCR', 'easyOCR': 'EasyOCR',
                               'gpt_refine_complex_output': 'gpt_refine_complex', 'claude_refine_complex_output': 'claude_refine_complex'}}, inplace=True)
cer_perline.replace({'model': {'gpt': 'gpt_simple', 'claude': 'claude_simple',
                               'trOCR': 'TrOCR', 'pytesseractOCR': 'Pytesseract',
                               'kerasOCR': 'KerasOCR', 'easyOCR': 'EasyOCR',
                               'gpt_refine_complex_output': 'gpt_refine_complex', 'claude_refine_complex_output': 'claude_refine_complex'}}, inplace=True)


In [89]:
bleu_perline.to_csv(path+'/results/scores_comparisons/bleu_perline_all_n3.csv', index=False)
cer_perline.to_csv(path+'/results/scores_comparisons/cer_perline_all.csv', index=False)

### BLEU

In [69]:
bleu_metric.compute(predictions=['Arrêté le vingt cinq novembre 1919 Servais'], references=['Arrêté le vingt cinq novembre 1919 servais'])

{'bleu': 0.8091067115702212,
 'precisions': [0.8571428571428571, 0.8333333333333334, 0.8, 0.75],
 'brevity_penalty': 1.0,
 'length_ratio': 1.0,
 'translation_length': 7,
 'reference_length': 7}

In [64]:
bleu_gpt = pd.DataFrame(bleu_gpt).T

In [65]:
bleu_gpt['id'] = bleu_gpt.index
bleu_gpt

Unnamed: 0,bleu,precisions,brevity_penalty,length_ratio,translation_length,reference_length,id
1_0,0.0,"[0.025, 0.0, 0.0, 0.0]",0.06081,0.263158,40,152,1_0
1_1,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,7,7,1_1
1_2,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,7,7,1_2
1_3,0.137596,"[0.38461538461538464, 0.2, 0.125, 0.0434782608...",0.962269,0.962963,26,27,1_3
1_4,0.0,"[0.1, 0.0, 0.0, 0.0]",0.449329,0.555556,10,18,1_4
...,...,...,...,...,...,...,...
20_9,0.259849,"[0.5454545454545454, 0.38095238095238093, 0.25...",0.955563,0.956522,22,23,20_9
20_10,0.150923,"[0.45, 0.2631578947368421, 0.1111111111111111,...",0.904837,0.909091,20,22,20_10
20_11,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,7,7,20_11
20_12,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,6,6,20_12


In [52]:
bleu_gpt.to_csv(path+'/results/scores_comparisons/eval_perline/bleu_claude_two_example_perline.csv', index=False)

### CER

In [53]:
cer_gpt = pd.DataFrame(cer_gpt.items(), columns=['id', 'cer'])

In [54]:
print(cer_gpt['cer'].mean(), cer_gpt['cer'].var())

0.5713106915175424 2.935498085710415


In [55]:
cer_gpt.to_csv(path+'/results/scores_comparisons/eval_perline/cer_claude_two_example_perline.csv', float_format="%.6f", index=False)

# OCR

## EasyOCR

In [34]:
test_image = cv2.imread(test_path)
plt.imshow(test_image)

NameError: name 'test_path' is not defined

In [128]:
def easyOCR(image_path):
    reader = easyocr.Reader(['fr'])
    img = cv2.imread(image_path)
    results = reader.readtext(img)
    output = []
    for res in results:
        det, conf = res[1], res[2]
        output.append((det, round(conf, 2))) 
    text = ' '.join([i[0] for i in output])
    return text

In [129]:
easyOCR_output = {}
for image in os.listdir(image_folder):
    if image.endswith('.jpg'):
        image_path = image_folder + '/' + image
        text = easyOCR(image_path)
        name = image.split('.')[0]
        name = name.split('example')[1]
        easyOCR_output[name] = text

In [43]:
# easyOCR_output_df = pd.read_csv(path+'/results/postprocessed/easyOCR_perline_output.csv')
easyOCR_output_df

Unnamed: 0,file,text,file_name,line_name,id
0,1_00,DÉSIGNATION DES PERSONNES DÉCÉDÉES OU AlSENTES...,1,0,1_0
1,1_01,~Bcrta` 8 oetolz 1919 d4earuey vicytAul,1,1,1_1
2,1_02,Jbsucala & veyhmeuf ouoba tg19 [eevœy,1,2,1_2
3,1_03,891 ta HBevcenk ~Bepkonssjk oj hain Hgoucoal Y...,1,3,1_3
4,1_04,"TulL Bouuù Q ""Janer ~aobà Bhuile RRXR 26 aplul...",1,4,1_4
...,...,...,...,...,...
278,20_09,J9 ùcà nuf> Sebiaw bo2nbi YÉvepQu X anel Bebel...,20,9,20_9
279,20_10,4 49 0 : @ubovs ssexanbz Yuemaut ubuùd *ean [l...,20,10,20_10
280,20_11,Jvuté & oi = neuf fasles19:0 Huclai,20,11,20_11
281,20_12,Jarsalé - vms] Hinsenq %0 djeceia |,20,12,20_12


In [136]:
easyOCR_output_df = pd.DataFrame(easyOCR_output.items(), columns=['file', 'text'])
easyOCR_output_df[['file_name', 'line_name']] = easyOCR_output_df['file'].str.split('_', expand=True)
easyOCR_output_df[['file_name', 'line_name']] = easyOCR_output_df[['file_name', 'line_name']].astype(int)
easyOCR_output_df = easyOCR_output_df.sort_values(by=['file_name', 'line_name']).reset_index(drop=True)
easyOCR_output_df['text'] = easyOCR_output_df['text'].replace(['\n', '\t'], ' ', regex=True)
easyOCR_output_df['id'] = easyOCR_output_df['file_name'].astype(str) + '_' + easyOCR_output_df['line_name'].astype(str)
easyOCR_output_df

Unnamed: 0,file,text
0,10_00,DATE I IÉcis DROITS DF SUCCESSION DROIT NUMÉRO...
1,10_01,soceti & tù déeemebza. 919 Yuepiy
2,10_02,5 1439 DaLenlize Yiceppu #9lugu | Benuue YLama...
3,10_03,Jaxat' € deeemlaac919 Fuupùa quebu
4,10_04,[4ho ceæy _ (ekalque Pnag;nl Yjuuy Wv&ezlbz (...
...,...,...
278,9_09,69*2.4 Scinllane Pots+a Gxz9& SasBBoe Gpmzeyen...
279,9_10,"kag' 0: Sainllane Bwun' à 26r' 1sr ""9"
280,9_11,Joaak + fnmauu dceehu 1919 Yeoeok
281,9_12,[4Jg %eu 3- CBaslice fe At Z8ma 2e+eygu | Jwti...


In [44]:
easyOCR_output_df.to_csv(path+'/results/postprocessed/easyOCR_perline_output.csv', index=False)

## Pytesseract

In [10]:
def pytesseractOCR(image_path):
    try:
        image = PILImage.open(image_path)
        text = pytesseract.image_to_string(image)
        return text
    except:
        print("[ERROR] pytesseractOCR failed! (should be installed)")
        return ""

In [11]:
pytesseractOCR_output = {}
for image in os.listdir(image_folder):
    if image.endswith('.jpg'):
        image_path = image_folder + '/' + image
        text = pytesseractOCR(image_path)
        name = image.split('.')[0]
        name = name.split('example')[1]
        pytesseractOCR_output[name] = text

In [55]:
pytesseractOCR_output_df = pd.DataFrame(pytesseractOCR_output.items(), columns=['file', 'text'])
pytesseractOCR_output_df[['file_name', 'line_name']] = pytesseractOCR_output_df['file'].str.split('_', expand=True)
pytesseractOCR_output_df[['file_name', 'line_name']] = pytesseractOCR_output_df[['file_name', 'line_name']].astype(int)
pytesseractOCR_output_df = pytesseractOCR_output_df.sort_values(by=['file_name', 'line_name']).reset_index(drop=True)
pytesseractOCR_output_df['text'] = pytesseractOCR_output_df['text'].replace(['\n', '\t'], ' ', regex=True)
pytesseractOCR_output_df['id'] = pytesseractOCR_output_df['file_name'].astype(str) + '_' + pytesseractOCR_output_df['line_name'].astype(str)
pytesseractOCR_output_df

Unnamed: 0,file,text,file_name,line_name,id
0,1_00,| = | DATE DU DEPOT des DECLARATIONS. DESI...,1,0,1_0
1,1_01,ft alt alta,1,1,1_1
2,1_02,,1,2,1_2
3,1_03,a cnte |Abevcenk a dette Son <a 1040’ i ee ...,1,3,1_3
4,1_04,L 3 be oi 7 Nf »- p,1,4,1_4
...,...,...,...,...,...
278,20_09,149 28 auf Si elaiawx Ve | | Wvebly eu ty ...,20,9,20_9
279,20_10,; a : oe ssa song o Sannin nomena ie 3 (0....,20,10,20_10
280,20_11,| aul,20,11,20_11
281,20_12,Caen torah Winéorg ty dieser’ es oe aaa. pa...,20,12,20_12


In [56]:
pytesseractOCR_output_df.to_csv(path+'/results/postprocessed/pytesseractOCR_perline_output.csv', index=False)

## Keras

Not good for non-english?

In [139]:
def kerasOCR(image_path):
    pipeline = keras_ocr.pipeline.Pipeline()
    image = keras_ocr.tools.read(image_path)
    prediction_groups = pipeline.recognize([image])
    words = []
    for line in prediction_groups[0]:
        for word in line:
            try:
                if isinstance(word[0], str):
                    words.append(word[0])
            except IndexError:
                continue
    text = ' '.join(words)
    return text

In [None]:
kerasOCR_output = {}
for image in os.listdir(image_folder):
    if image.endswith('.jpg'):
        image_path = image_folder + '/' + image
        text = kerasOCR(image_path)
        name = image.split('.')[0]
        name = name.split('example')[1]
        kerasOCR_output[name] = text

In [173]:
test_keras = kerasOCR(image_path=test_path)
print(test_keras)

Looking for /Users/serenekim/.keras-ocr/craft_mlt_25k.h5
Instructions for updating:
Use `tf.image.resize(...method=ResizeMethod.BILINEAR...)` instead.
Looking for /Users/serenekim/.keras-ocr/crnn_kurapan.h5
d r p o a g


## TrOCR

In [26]:
from transformers import TrOCRProcessor
from transformers import VisionEncoderDecoderModel
import torch

def trOCR(image_path):
    processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
    model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

    image = PILImage.open(image_path)

    pixel_values = processor(image, return_tensors="pt").pixel_values
    
    # Set device (GPU or CPU)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)  # Move model to the device
    pixel_values = pixel_values.to(device)  # Move image tensor to the same device
    
    try:
        generated_ids = model.generate(pixel_values, max_length=400)  # Limit max length
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return generated_text
    except IndexError as e:
        print(f"IndexError: {e}")
        return "Error: Index out of range during generation."

In [27]:
trOCR_output = {}
for image in os.listdir(image_folder):
    if image.endswith('.jpg'):
        image_path = image_folder + '/' + image
        text = trOCR(image_path)
        name = image.split('.')[0]
        name = name.split('example')[1]
        trOCR_output[name] = text

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of VisionEncode

In [31]:
trOCR_output_df = pd.DataFrame(trOCR_output.items(), columns=['file', 'text'])
trOCR_output_df[['file_name', 'line_name']] = trOCR_output_df['file'].str.split('_', expand=True)
trOCR_output_df[['file_name', 'line_name']] = trOCR_output_df[['file_name', 'line_name']].astype(int)
trOCR_output_df = trOCR_output_df.sort_values(by=['file_name', 'line_name']).reset_index(drop=True)
trOCR_output_df['text'] = trOCR_output_df['text'].replace(['\n', '\t'], ' ', regex=True)
trOCR_output_df['id'] = trOCR_output_df['file_name'].astype(str) + '_' + trOCR_output_df['line_name'].astype(str)
trOCR_output_df

Unnamed: 0,file,text,file_name,line_name,id
0,1_00,treat of the first time of the French Parliame...,1,0,1_0
1,1_01,# almost be weighted rather any standard for t...,1,1,1_1
2,1_02,# almost the original module you formerly ... ...,1,2,1_2
3,1_03,"THE GREAT BRONDSOME "" AIRMARK GABIT PARADE HAN...",1,3,1_3
4,1_04,After Congress plan himself tough back down to...,1,4,1_4
...,...,...,...,...,...
278,20_09,Manager Atkinson had made many awareness of th...,20,9,20_9
279,20_10,After the Democratic gubernatorial judge took ...,20,10,20_10
280,20_11,the best time of fourteen songs with the first...,20,11,20_11
281,20_12,""" To absorb confidence being a total of 1 000 ...",20,12,20_12


In [32]:
trOCR_output_df.to_csv(path+'/results/postprocessed/trOCR_perline_output.csv', index=False)