In [1]:
from openai import OpenAI
import os
import re
import pandas as pd
from dotenv import load_dotenv
import requests
import base64
import subprocess
from IPython.display import display, Image
from PIL import Image as PILImage

In [2]:
import cv2
# import easyocr
import matplotlib.pyplot as plt
# import pytesseract
# import keras_ocr

In [3]:
# OpenAI
load_dotenv() #get the environment 
openai_API_KEY = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=openai_API_KEY)

In [4]:
path = os.path.dirname(os.getcwd()) # Parent directory
ouput_folder = path+'/results/predictions'

In [5]:
def get_outputs(output_path):   
    outputs = {}
    for output in os.listdir(output_path):
        # Check if the current item is a file and ends with '.txt'
        file_path = os.path.join(output_path, output)
        if os.path.isfile(file_path) and output.endswith('.txt'):
            filename = output.split('.')[0]
            # Open the individual file and read its content
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
            outputs[filename] = text  # Store the filename and text in the dictionary
    return outputs

In [6]:
models = ['claude-3-5-sonnet-20240620', 'EasyOCR', 'gpt-4o', 'KerasOCR', 'Pytesseract', 'trOCR']
prompts = ['one-example_prompt', 'two-example_prompt', 'zero-shot_complex-prompt', 'zero-shot_simple-prompt',
                'refine_complex-prompt']
outputs = {}
for prompt in prompts:
    outputs[prompt] = {}  # Initialize the dictionary for each prompt
    for model in models:
        output_path = f'{ouput_folder}/{prompt}/{model}'
        outputs[prompt][model] = get_outputs(output_path)

In [47]:
outputs.keys()

dict_keys(['one-example_prompt', 'two-example_prompt', 'zero-shot_complex-prompt', 'zero-shot_simple-prompt', 'refine_complex-prompt'])

In [25]:
def callPostProcessing(max_tokens=800, prompt_parameter = None):
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_API_KEY}"
    } 
    payload = {
        "model": "gpt-4o",
        "messages": [
        {
            "role": "user",
            "content": [
            {
                "type": "text",
                "text": f"""This is an output from you. Clean it such that we only have the table without any separators, no comment from you: {prompt_parameter}
                """
            }
            ]
        }
        ],
        "max_tokens": max_tokens,
        "temperature": 0
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    try:
        return response.json()["choices"][0]["message"]["content"]
    except:
        print(response.json()["error"]["message"])

In [53]:
print(outputs[prompt][model]['transcription1'])

Here's a recreation of the content of the table in the image:

| N° | DATE DU DÉCÈS | DÉSIGNATION DES PERSONNES DÉCÉDÉES OU ABSENTES | DATE DE DÉCÈS ou de la disparition | NOMS, PRÉNOMS des héritiers ou légataires | DROITS DE SUCCESSION | DROIT DE MUTATION | NUMÉROS du sommier | DATE | RECETTE | OBSERVATIONS |
|---|---------------|------------------------------------------------|-----------------------------------|-------------------------------------------|----------------------|---------------------|---------------------|------|---------|---------------|
| | | NOM | PRÉNOMS | DOMICILE | | | ACTIF | PASSIF | RESTANT NET | VALEUR | de l'expiration du délai de rectification | de l'exigibilité des droits | DATE | N° | |
| 403 | quatre 9bre | Payot | Antoinette | Braine l'Alleud | 14 mai 1919 | Payot Marie & autres | 16975 | 2950 | 11740 | 2011/1921 | 16 Db 1919 | 14 mars 1928 | | | |
| 403² | d° | Paulus | Mélanie | Nivelles | 28 Janvier 1919 | Paulus François | 1971 | | 1971 | 191/1919 

In [54]:
# Immediately save the post-processed outputs

# for prompt in prompts:
#     for model in models:
#         for key in outputs[prompt][model].keys():
#             post = callPostProcessing(prompt_parameter=outputs[prompt][model][key])
#             output_dir = os.path.join(path, 'results', 'postprocessed', prompt, model)
#             os.makedirs(output_dir, exist_ok=True)
#             file_path = os.path.join(output_dir, f"{key}.txt")
#             with open(file_path, 'w', encoding='utf-8') as file:
#                 file.write(post)

# Time: 59m 4.8s

* trOCR's outputs didn't get postprocessed well. Stil separators exsit. 


# New BLEU Scores

In [7]:
from evaluate import load

cer_metric =load("cer")
bleu_metric = load("bleu")  

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
models = ['claude-3-5-sonnet-20240620', 'gpt-4o']
# models = ['claude-3-5-sonnet-20240620', 'EasyOCR', 'gpt-4o', 'KerasOCR', 'Pytesseract', 'trOCR']

prompts = ['one-example_prompt', 'two-example_prompt', 'zero-shot_complex-prompt', 'zero-shot_simple-prompt',
                'refine_complex-prompt']
predictions = {}
for prompt in prompts:
    predictions[prompt] = {}  
    for model in models:
        predictions_path = f'{path}/results/postprocessed/{prompt}/{model}'
        predictions[prompt][model] = get_outputs(predictions_path)

In [9]:
# We should disregard the transcriptions that were used as examples!! 

for prompt in ['one-example_prompt', 'two-example_prompt']:
    if predictions.get(prompt):
        for model in models:
            if prompt == 'one-example_prompt':
                predictions[prompt][model].pop('transcription2', None)  # Remove only 'transcription2'
            elif prompt == 'two-example_prompt':
                for key in ['transcription2', 'transcription3']:
                    predictions[prompt][model].pop(key, None)  # Remove both keys


In [14]:
len(predictions['one-example_prompt']['claude-3-5-sonnet-20240620'])

19

In [10]:
references = {}

for ref in os.listdir(f'{path}/data/transcriptions'):
    ref_path = f'{path}/data/transcriptions/{ref}'
    if ref.endswith('.txt'):
        name = ref.split('.')[0]
        name = int(name.split('ex')[1])-1
        ref = f"transcription{name}"
        # print(ref)
        with open(ref_path, 'r', encoding='utf-8') as f:
            text = f.read()
        references[ref] = text

In [26]:
bleu_llms = {}
cer_llms = {}
bleu_llms_average = {}
cer_llms_average = {}

for prompt in prompts:
    for model in models:
        for key in predictions[prompt][model].keys():
            bleu_llms[prompt, model, key] = bleu_metric.compute(predictions=[predictions[prompt][model][key]], references=[references[key]])
            cer_llms[prompt, model, key] = cer_metric.compute(predictions=[predictions[prompt][model][key]], references=[references[key]])
    

In [39]:
bleu_llms = pd.DataFrame(bleu_llms).T
bleu_llms = bleu_llms['bleu'].unstack()

In [43]:
bleu_llms['average'] = bleu_llms.mean(axis=1)

In [44]:
bleu_llms

Unnamed: 0,Unnamed: 1,transcription0,transcription1,transcription10,transcription11,transcription12,transcription13,transcription14,transcription15,transcription16,transcription17,...,transcription19,transcription2,transcription3,transcription4,transcription5,transcription6,transcription7,transcription8,transcription9,average
one-example_prompt,claude-3-5-sonnet-20240620,0.076907,0.678195,0.196808,0.097355,0.025345,0.231123,0.262197,0.326152,0.200285,0.140632,...,0.266215,,0.183511,0.165615,0.074787,0.057551,0.081666,0.0,0.146495,0.174437
one-example_prompt,gpt-4o,0.302808,1.0,0.456089,0.311661,0.156751,0.318021,0.44961,0.306383,0.323653,0.328432,...,0.363018,,0.402259,0.0,0.326621,0.0,0.255205,0.346939,0.389838,0.332891
refine_complex-prompt,claude-3-5-sonnet-20240620,0.329939,0.296044,0.032363,0.064105,0.101585,0.168491,0.220907,0.17581,0.094841,0.092384,...,0.164237,0.276143,0.035362,0.07494,0.0,0.028389,0.217679,0.361945,0.079808,0.151278
refine_complex-prompt,gpt-4o,0.181182,0.0,0.195539,0.197382,0.173212,0.208964,0.139178,0.198561,0.20145,0.146682,...,0.259731,0.15853,0.216783,0.13538,0.0,0.176987,0.127598,0.271618,0.078715,0.162181
two-example_prompt,claude-3-5-sonnet-20240620,0.071862,0.678195,0.126845,0.104453,0.038291,0.068243,0.060846,0.31032,0.205084,0.20787,...,0.271123,,,0.162391,0.118998,0.229417,0.156653,0.063755,0.193816,0.184972
two-example_prompt,gpt-4o,0.139141,1.0,0.3438,0.34874,0.210022,0.282903,0.219111,0.292666,0.349347,0.301275,...,0.369653,,,0.029404,0.247475,0.274022,0.275535,0.498971,0.426922,0.324977
zero-shot_complex-prompt,claude-3-5-sonnet-20240620,0.168696,0.139838,0.078016,0.089022,0.158498,0.090892,0.133556,0.12347,0.099217,0.102198,...,0.126708,0.104671,0.058255,0.089094,0.099308,0.137686,0.101512,0.156557,0.110608,0.114848
zero-shot_complex-prompt,gpt-4o,0.029235,0.14506,0.116864,0.089905,0.119193,0.141787,0.135815,0.061454,0.149005,0.146682,...,0.254539,0.093458,0.170878,0.084638,0.0,0.159181,0.150879,0.293719,0.213291,0.138141
zero-shot_simple-prompt,claude-3-5-sonnet-20240620,0.064063,0.084825,0.078843,0.031119,0.034111,0.0,0.068861,0.042662,0.065101,0.121836,...,0.122282,0.049173,0.060101,0.0,0.060394,0.132592,0.069776,0.132598,0.022732,0.066389
zero-shot_simple-prompt,gpt-4o,0.0,0.0,0.01103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.048163,0.00296


In [16]:
for model in models:
    average_bleu = sum(bleu_llms_average[model].values())/len(bleu_llms_average)

    

KeyError: 'claude-3-5-sonnet-20240620'

In [28]:
cer_llms_df = pd.DataFrame(cer_llms, index=[0])
cer_llms_df

Unnamed: 0_level_0,one-example_prompt,one-example_prompt,two-example_prompt,two-example_prompt,zero-shot_complex-prompt,zero-shot_complex-prompt,zero-shot_simple-prompt,zero-shot_simple-prompt,refine_complex-prompt,refine_complex-prompt
Unnamed: 0_level_1,claude-3-5-sonnet-20240620,gpt-4o,claude-3-5-sonnet-20240620,gpt-4o,claude-3-5-sonnet-20240620,gpt-4o,claude-3-5-sonnet-20240620,gpt-4o,claude-3-5-sonnet-20240620,gpt-4o
0,0.445004,0.463695,0.553559,0.678648,0.922358,1.298347,0.658519,0.816679,0.737599,1.053918
