In [1]:
from openai import OpenAI
import os
import re
import pandas as pd
from dotenv import load_dotenv
import requests
import base64
import subprocess
from IPython.display import display, Image
from PIL import Image as PILImage

In [2]:
import cv2
# import easyocr
import matplotlib.pyplot as plt
# import pytesseract
# import keras_ocr

In [3]:
# OpenAI
load_dotenv() #get the environment 
openai_API_KEY = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=openai_API_KEY)

In [4]:
path = os.path.dirname(os.getcwd()) # Parent directory
ouput_folder = path+'/results/predictions'

In [7]:
def get_outputs(output_path):   
    outputs = {}
    for output in os.listdir(output_path):
        # Check if the current item is a file and ends with '.txt'
        file_path = os.path.join(output_path, output)
        if os.path.isfile(file_path) and output.endswith('.txt'):
            filename = output.split('.')[0]
            # Open the individual file and read its content
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
            outputs[filename] = text  # Store the filename and text in the dictionary
    return outputs

In [45]:
models = ['claude-3-5-sonnet-20240620', 'EasyOCR', 'gpt-4o', 'KerasOCR', 'Pytesseract', 'trOCR']
prompts = ['one-example_prompt', 'two-example_prompt', 'zero-shot_complex-prompt', 'zero-shot_simple-prompt',
                'refine_complex-prompt']
outputs = {}
for prompt in prompts:
    outputs[prompt] = {}  # Initialize the dictionary for each prompt
    for model in models:
        output_path = f'{ouput_folder}/{prompt}/{model}'
        outputs[prompt][model] = get_outputs(output_path)

In [47]:
outputs.keys()

dict_keys(['one-example_prompt', 'two-example_prompt', 'zero-shot_complex-prompt', 'zero-shot_simple-prompt', 'refine_complex-prompt'])

In [25]:
def callPostProcessing(max_tokens=800, prompt_parameter = None):
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_API_KEY}"
    } 
    payload = {
        "model": "gpt-4o",
        "messages": [
        {
            "role": "user",
            "content": [
            {
                "type": "text",
                "text": f"""This is an output from you. Clean it such that we only have the table without any separators, no comment from you: {prompt_parameter}
                """
            }
            ]
        }
        ],
        "max_tokens": max_tokens,
        "temperature": 0
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    try:
        return response.json()["choices"][0]["message"]["content"]
    except:
        print(response.json()["error"]["message"])

In [53]:
print(outputs[prompt][model]['transcription1'])

Here's a recreation of the content of the table in the image:

| N° | DATE DU DÉCÈS | DÉSIGNATION DES PERSONNES DÉCÉDÉES OU ABSENTES | DATE DE DÉCÈS ou de la disparition | NOMS, PRÉNOMS des héritiers ou légataires | DROITS DE SUCCESSION | DROIT DE MUTATION | NUMÉROS du sommier | DATE | RECETTE | OBSERVATIONS |
|---|---------------|------------------------------------------------|-----------------------------------|-------------------------------------------|----------------------|---------------------|---------------------|------|---------|---------------|
| | | NOM | PRÉNOMS | DOMICILE | | | ACTIF | PASSIF | RESTANT NET | VALEUR | de l'expiration du délai de rectification | de l'exigibilité des droits | DATE | N° | |
| 403 | quatre 9bre | Payot | Antoinette | Braine l'Alleud | 14 mai 1919 | Payot Marie & autres | 16975 | 2950 | 11740 | 2011/1921 | 16 Db 1919 | 14 mars 1928 | | | |
| 403² | d° | Paulus | Mélanie | Nivelles | 28 Janvier 1919 | Paulus François | 1971 | | 1971 | 191/1919 

In [54]:
# Immediately save the post-processed outputs

# for prompt in prompts:
#     for model in models:
#         for key in outputs[prompt][model].keys():
#             post = callPostProcessing(prompt_parameter=outputs[prompt][model][key])
#             output_dir = os.path.join(path, 'results', 'postprocessed', prompt, model)
#             os.makedirs(output_dir, exist_ok=True)
#             file_path = os.path.join(output_dir, f"{key}.txt")
#             with open(file_path, 'w', encoding='utf-8') as file:
#                 file.write(post)

# Time: 59m 4.8s

* trOCR's outputs didn't get postprocessed well. Stil separators exsit. 


# New BLEU Scores

In [5]:
from evaluate import load

cer_metric =load("cer")
bleu_metric = load("bleu")  

In [20]:
models = ['claude-3-5-sonnet-20240620', 'gpt-4o']
# models = ['claude-3-5-sonnet-20240620', 'EasyOCR', 'gpt-4o', 'KerasOCR', 'Pytesseract', 'trOCR']

prompts = ['one-example_prompt', 'two-example_prompt', 'zero-shot_complex-prompt', 'zero-shot_simple-prompt',
                'refine_complex-prompt']
predictions = {}
for prompt in prompts:
    predictions[prompt] = {}  
    for model in models:
        predictions_path = f'{path}/results/postprocessed/{prompt}/{model}'
        predictions[prompt][model] = get_outputs(predictions_path)

In [23]:
# We should disregard the transcriptions that were used as examples!! 

for prompt in ['one-example_prompt', 'two-example_prompt']:
    if predictions.get(prompt):
        for model in models:
            if prompt == 'one-example_prompt':
                predictions[prompt][model].pop('transcription2', None)  # Remove only 'transcription2'
            elif prompt == 'two-example_prompt':
                for key in ['transcription2', 'transcription3']:
                    predictions[prompt][model].pop(key, None)  # Remove both keys


In [24]:
len(predictions['one-example_prompt']['claude-3-5-sonnet-20240620'])

19

In [25]:
references = {}

for ref in os.listdir(f'{path}/data/transcriptions'):
    ref_path = f'{path}/data/transcriptions/{ref}'
    if ref.endswith('.txt'):
        name = ref.split('.')[0]
        name = int(name.split('ex')[1])-1
        ref = f"transcription{name}"
        print(ref)
        with open(ref_path, 'r', encoding='utf-8') as f:
            text = f.read()
        references[ref] = text

transcription16
transcription15
transcription13
transcription14
transcription10
transcription9
transcription11
transcription12
transcription7
transcription8
transcription3
transcription4
transcription6
transcription5
transcription1
transcription2
transcription0
transcription19
transcription17
transcription18


In [29]:
bleu_llms = {}
cer_llms = {}

for prompt in prompts:
    for model in models:
        for key in predictions[prompt][model].keys():
            # print(f"Prompt: {prompt}, Model: {model}, Key: {key}") #To check the keys
            bleu_llms[prompt, model] = bleu_metric.compute(predictions=[predictions[prompt][model][key]], references=[references[key]])
            cer_llms[prompt, model] = cer_metric.compute(predictions=[predictions[prompt][model][key]], references=[references[key]])


Prompt: one-example_prompt, Model: claude-3-5-sonnet-20240620, Key: transcription18
Prompt: one-example_prompt, Model: claude-3-5-sonnet-20240620, Key: transcription19
Prompt: one-example_prompt, Model: claude-3-5-sonnet-20240620, Key: transcription8
Prompt: one-example_prompt, Model: claude-3-5-sonnet-20240620, Key: transcription9
Prompt: one-example_prompt, Model: claude-3-5-sonnet-20240620, Key: transcription1
Prompt: one-example_prompt, Model: claude-3-5-sonnet-20240620, Key: transcription0
Prompt: one-example_prompt, Model: claude-3-5-sonnet-20240620, Key: transcription3
Prompt: one-example_prompt, Model: claude-3-5-sonnet-20240620, Key: transcription7
Prompt: one-example_prompt, Model: claude-3-5-sonnet-20240620, Key: transcription6
Prompt: one-example_prompt, Model: claude-3-5-sonnet-20240620, Key: transcription4
Prompt: one-example_prompt, Model: claude-3-5-sonnet-20240620, Key: transcription5
Prompt: one-example_prompt, Model: claude-3-5-sonnet-20240620, Key: transcription17
P

In [27]:
bleu_llms_df = pd.DataFrame(bleu_llms)
bleu_llms_df

Unnamed: 0_level_0,one-example_prompt,one-example_prompt,two-example_prompt,two-example_prompt,zero-shot_complex-prompt,zero-shot_complex-prompt,zero-shot_simple-prompt,zero-shot_simple-prompt,refine_complex-prompt,refine_complex-prompt
Unnamed: 0_level_1,claude-3-5-sonnet-20240620,gpt-4o,claude-3-5-sonnet-20240620,gpt-4o,claude-3-5-sonnet-20240620,gpt-4o,claude-3-5-sonnet-20240620,gpt-4o,claude-3-5-sonnet-20240620,gpt-4o
bleu,0.231123,0.318021,0.068243,0.282903,0.090892,0.141787,0.0,0.0,0.168491,0.208964
precisions,"[0.5607843137254902, 0.3110236220472441, 0.197...","[0.6136363636363636, 0.4337899543378995, 0.339...","[0.44324324324324327, 0.1793478260869565, 0.07...","[0.479108635097493, 0.30726256983240224, 0.221...","[0.2516891891891892, 0.11505922165820642, 0.06...","[0.31346578366445915, 0.17035398230088494, 0.1...","[0.32701421800947866, 0.10952380952380952, 0.0...","[0.3103448275862069, 0.06976744186046512, 0.0,...","[0.34177215189873417, 0.2029598308668076, 0.13...","[0.43376623376623374, 0.2552083333333333, 0.15..."
brevity_penalty,0.920947,0.775269,0.611468,1.0,1.0,1.0,0.734874,0.113902,1.0,1.0
length_ratio,0.923913,0.797101,0.67029,1.300725,2.144928,1.641304,0.764493,0.315217,1.717391,1.394928
translation_length,255,220,185,359,592,453,211,87,474,385
reference_length,276,276,276,276,276,276,276,276,276,276


In [28]:
cer_llms_df = pd.DataFrame(cer_llms, index=[0])
cer_llms_df

Unnamed: 0_level_0,one-example_prompt,one-example_prompt,two-example_prompt,two-example_prompt,zero-shot_complex-prompt,zero-shot_complex-prompt,zero-shot_simple-prompt,zero-shot_simple-prompt,refine_complex-prompt,refine_complex-prompt
Unnamed: 0_level_1,claude-3-5-sonnet-20240620,gpt-4o,claude-3-5-sonnet-20240620,gpt-4o,claude-3-5-sonnet-20240620,gpt-4o,claude-3-5-sonnet-20240620,gpt-4o,claude-3-5-sonnet-20240620,gpt-4o
0,0.445004,0.463695,0.553559,0.678648,0.922358,1.298347,0.658519,0.816679,0.737599,1.053918
