# Environment setup

In [2]:
from openai import OpenAI
import os
import re
import pandas as pd
from dotenv import load_dotenv
import requests

In [3]:
import subprocess
import base64
from IPython.display import display, Image
import json
import math

In [4]:
# Parent directory of the current directory
path = os.path.dirname(os.getcwd()) 

# Paths to the folders of example images and transcriptions
image_folder = path+'/data/Archives_LLN_Nivelles_I_1921_REG 5193'
text_folder = path+'/data/transcriptions'

In [5]:
load_dotenv()

True

In [6]:
openai_API_KEY = os.getenv("OPENAI_API_KEY")
openai_client = OpenAI(api_key=openai_API_KEY)

# Few shot trials (Doesn't work well -- but don't think it is relevant.)

In [6]:
def handle_nan(obj):
    return {k: (None if (isinstance(v, float) and math.isnan(v)) else v) for k, v in obj.items()}


# Example Transcription Data
def read_json_files(folder_path):
    json_data_list = []

    # Process each JSON file in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.json'):
            file_path = os.path.join(folder_path, file_name)
            
            # Read JSON data from file
            with open(file_path, 'r', encoding='utf-8') as json_file:
                json_data_pre = json.load(json_file, object_hook=handle_nan)
                json_data =json.dumps(json_data_pre, indent=4, ensure_ascii=False)
                json_data_list.append(json_data)

    return json_data_list

# Folder containing the JSON files
json_folder = os.path.join(path, 'data/json_transcriptions')

# Read all JSON files from the folder
all_json_data = read_json_files(json_folder)

# Print out the loaded JSON data (for demonstration purposes)
for idx, json_data in enumerate(all_json_data):
    print(f"Data from JSON file {idx + 1}:")
    print(json_data)
    print("\n")

Data from JSON file 1:
[
    {
        "N' d'ordre": {
            "Unnamed: 0_level_1": null
        },
        "Date du dépot des déclarations": {
            "Unnamed: 1_level_1": 1921
        },
        "Désignation des personnes décédées ou absentes.": {
            "Nom.": null,
            "Prénoms": null,
            "Domiciles": null,
            "Domiciles.1": null
        },
        "Date du décès ous du judgement d'envoi en possession, en cas d'absence.": {
            "Unnamed: 6_level_1": null
        },
        "Noms, Prénoms et demeures des parties déclarantes.": {
            "Unnamed: 7_level_1": null
        },
        "Droits de succession en ligne collatérale et de mutation en ligne directe.": {
            "Actif. (2)": null,
            "Passif. (2)": null,
            "Restant NET. (2)": null
        },
        "Droit de mutation par déces": {
            "Valeur des immeubles. (2)": null
        },
        "Numéros des déclarations": {
            "Primitives."

In [13]:
# Example Image Data
def read_image_files(folder_path):
    example_paths = []

# Define the regex pattern
    pattern = re.compile(r'^example\d+.jpeg$')
    
    # Process each image file in the folder
    for file_name in os.listdir(folder_path):
        if pattern.match(file_name):
            file_path = os.path.join(folder_path, file_name)
            example_paths.append(file_path)

    return example_paths


In [15]:
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

In [17]:
all_images = read_image_files(image_folder)
all_images_encoded = [encode_image for image in all_images]

In [32]:
import requests

def create_example(prompt_example, encoded_img_ex, transcription_ex):
    return [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": prompt_example
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{encoded_img_ex}"
                    }
                },
                {
                    "type": "text",
                    "text": transcription_ex
                }
            ]
        }
    ]

In [33]:
examples = []

prompt_example = "You are a helpful assistant, recreate the table from this handwritten document into a json file, this table contains columns and subcolumns. No PROFESSION column. Copy the texts as they are, do not add any other sentences from you:"

for i in range(len(all_images_encoded)):
    example = create_example(prompt_example, all_images_encoded[i], all_json_data[i])
    examples.append(example)

In [34]:
new_image_encoded = encode_image(image_folder+'/IMG_2024_03_19_11_46_52_911.jpeg')
prompt = "You are a helpful assistant, recreate the table from this handwritten document into a json file, this table contain columns and subcolums. No PROFESSION column. Copy the texts as they are, do not add any other sentences from you:"

In [35]:
User_input = {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{new_image_encoded}"
                    }
                }
            ]
        }

Too many examples? Errors occur with the examples... Should look for a new way? \
Just do with `User_input` as messages and then iterate? 

In [36]:
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {openai_API_KEY}"
}

payload = {
    "model": "gpt-4o",
    "messages": examples + [User_input],
    "max_tokens": 1000,
    "temperature": 0
}

response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
print(response.json())


{'error': {'message': "Invalid type for 'messages[0]': expected an object, but got an array instead.", 'type': 'invalid_request_error', 'param': 'messages[0]', 'code': 'invalid_type'}}


# With Examples

In [12]:
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

In [14]:
def read_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as json_file:
        json_data_pre = json.load(json_file)
        json_data =json.dumps(json_data_pre, indent=4, ensure_ascii=False)
    return json_data

## Give already an example in the beginning

In [16]:
example_image = encode_image(path+'/data/Archives_LLN_Nivelles_I_1921_REG 5193/example2.jpeg')
example2_image = encode_image(path+'/data/Archives_LLN_Nivelles_I_1921_REG 5193/example3.jpeg')

input_image = encode_image(path+'/data/Archives_LLN_Nivelles_I_1921_REG 5193/example1.jpeg')
transcription_json = read_json(os.path.join(path, 'data/json_transcriptions/transcription_ex2.json'))
transcription2_json = read_json(os.path.join(path, 'data/json_transcriptions/transcription_ex3.json'))

                        In this case, add this information under a new key called 'Note' for the deceased person you saw in the previous row. 
                        If the row of the deceased's name does not exist in the previous row, just add this information about service dates under an empty person's json. 
                        
                        Read the table in the image and organize the information in a nested json like the two examples. 
                        Make sure to read the names of the people and the location as well as the dates and the numbers correctly.
                        When you see Arrêté le \d{2} \w+ \d{4}( \w+)? servais, add it to a new key key called 'Note'. 
                        If there is no information of the deceased name but only 'Arrêté le \d{2} \w+ \d{4}( \w+)? servais', add it under an empty name.
                        Do not make up information.
                        

In [30]:
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {openai_API_KEY}"
}

payload = {
    "model": "gpt-4o",
    "messages":  [
                {
                    "role": "system", 
                    "content": "You are a helpful assistant who can read old handwriting."
                },
                {
                    "role": "user",
                    "content": [
                    {
                        "type": "text",
                        "text": f"""
                        Transcription:
                        ```json
                        {transcription_json}
                        ```
                        
                        The ```json block contains the transcription of the following image organized in a nested json:
                        
                        """
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                        "url": f"data:image/jpeg;base64,{example_image}"
                        }
                    }
                    ]
                },
                {
                    "role": "user",
                    "content": [
                    {
                        "type": "text",
                        "text": """
                        Task:
                        Firstly, look at the layout of the table in the image. It should be the same as the example.
                        
                        Secondly, read the table row by row. Each row should represent information about a deceased person. 
                        Some rows begin with the name of the deceased. Some row only indicates the serviced date in the form of 'Arrêté le \d{2} \w+ \d{4}( \w+)? servais'. 
                        
                        Thirdly, ensure that you read the numbers correctly by comparing the handwritings in the example and also within the new image.
                        'Restant Net.' must be the result of 'Actif.' minus 'Passif.'. 
                        
                        Fourthly, organize the read texts in a nested json like in the example.
                        
                        Lastly, think about what could have read wrongly. Refine it. Indicate the parts you are unsure with an asterisk. 
                        
                        """
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                        "url": f"data:image/jpeg;base64,{input_image}"
                        }
                    }
                    ]
                }
        
                ],
    "max_tokens": 3000,
    "temperature": 0
}

response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

In [31]:
content = response.json()["choices"][0]["message"]["content"]
print(content)

Here is the transcription of the table from the new image, organized in a nested JSON format. I have carefully read the table row by row and ensured the numbers are correct by comparing the handwriting within the image. I have also indicated parts I am unsure about with an asterisk (*).

```json
[
    {
        "N' d'ordre": {
            "Unnamed: 0_level_1": 411
        },
        "Date du dépot des déclarations": {
            "Unnamed: 1_level_1": "sept 9bre"
        },
        "Désignation des personnes décédées ou absentes.": {
            "Nom.": "Fievereent",
            "Prénoms": "Adolphe",
            "Domiciles": "Ophain",
            "Note": NaN
        },
        "Date du décès ous du judgement d'envoi en possession, en cas d'absence.": {
            "Unnamed: 6_level_1": "19 octobre 1919"
        },
        "Noms, Prénoms et demeures des parties déclarantes.": {
            "Unnamed: 7_level_1": "Fievereent Henri & autres"
        },
        "Droits de succession en lign

## Separately

In [16]:
def call(prompt, max_tokens =1000, base64_image=None):
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {openai_API_KEY}"
        } 
        if base64_image:  
            payload = {
                "model": "gpt-4o",
                "messages": [
                {
                    "role": "system", 
                    "content": "You are a helpful assistant who can read old handwriting."},
                {
                    "role": "user",
                    "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                    ]
                }
                ],
                "max_tokens": max_tokens,
                "temperature": 0,
                # "logit_bias": {"1734": -100},
                "response_format": { "type": "json_object" }
            }
        else:
            payload = {
                "model": "gpt-4o",
                "messages": [
                {
                    "role": "user",
                    "content": [
                    {
                        "type": "text",
                        "text": prompt
                    }
                    ]
                }
                ],
                "max_tokens": max_tokens,
                "temperature": 0,
                # "logit_bias": {"1734": -100},
                "response_format":{ "type": "json_object" }
            }
        
        response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
        # return response.json()["choices"][0]["message"]["content"]
        return response


In [157]:
def makeDraft(image_path, output_format = "json"):
    base64_image = encode_image(image_path)
    
    prompt = f"""
        Structure:
                "N' d'ordre":
                ,
                "Date du dépot des déclarations": 
                ,
                "Désignation des personnes décédées ou absentes.": 
                    "Nom.":  ,
                    "Prénoms":  ,
                    "Domiciles": 
                ,
                "Date du décès ous du judgement d'envoi en possession, en cas d'absence.": 
                ,
                "Noms, Prénoms et demeures des parties déclarantes.": 
                ,
                "Droits de succession en ligne collatérale et de mutation en ligne directe.": 
                    "Actif. (2)": ,
                    "Passif. (2)": ,
                    "Restant NET. (2)": 
                ,
                "Droit de mutation par déces": 
                    "Valeur des immeubles. (2)": 
                ,
                "Numéros des déclarations": 
                    "Primitives.": ,
                    "Supplémentaires.": 
                ,
                "Date": 
                    "de l'expiration du délai de rectification.": ,
                    "de l'exigibilité des droits.": 
                ,
                "Numéros de la consignation des droits au sommier n' 28": 
                ,
                "Recette des droits et amendes.": 
                    "Date": ,
                    "N^03": 
                ,
                "Cautionnements. ": 
                    "Numéros de la consignation au sommier n'30":
                ,
                "Observations (les déclarations qui figurent à l'état n'413 doivent être émargées en conséquence, dans la présnete colonne.)": 
 
        
        Notations:
            7bre or 7b == September
            8bre or 8b == October
            9bre or 9b == November
            Dbre or Db == December
            d or " (quotation mark) = ditto    
        
        Tips: 
        The table in the image is a Déclaration de succession of Belgium. 
        Each deceased person should have information in the following structure below.
        Some notations for dates and others are used as indicated in Notations, but return the notations as they are seen, not what they mean.
        Information for 'Actif.', 'Passif.' and 'Restant Net.' should exist for each dead person. So, read them well from the image.
        'Restant Net.' is the result of 'Actif.' minus 'Passif.'.
        
        Task: 
        Please recreate the table in the image as a {output_format} file based on this structure.
        When you see Arrêté le \d{2} \w+ \d{4}( \w+)? servais, add it to a new key key called 'Note'. 
        If there is no information of the deceased name but only 'Arrêté le \d{2} \w+ \d{4}( \w+)? servais', add it under an empty name.
        Make sure to read the names of the people and the location as well as the dates and the numbers correctly.
        Do not make up information. 
    """
    return call(prompt, max_tokens=3000, base64_image = base64_image), base64_image

In [134]:
draft, encoded_image = makeDraft(image_path = path+'/data/Archives_LLN_Nivelles_I_1921_REG 5193/example1.jpeg')

In [135]:
draft_json = draft.json()["choices"][0]["message"]["content"]

In [136]:
print(draft_json)

{
    "entries": [
        {
            "N' d'ordre": "6",
            "Date du dépot des déclarations": "1897 8bre",
            "Désignation des personnes décédées ou absentes": {
                "Nom": "",
                "Prénoms": "",
                "Domiciles": ""
            },
            "Date du décès ous du judgement d'envoi en possession, en cas d'absence": "Arrêté le 10 8bre 1897 servais",
            "Noms, Prénoms et demeures des parties déclarantes": "",
            "Droits de succession en ligne collatérale et de mutation en ligne directe": {
                "Actif. (2)": "",
                "Passif. (2)": "",
                "Restant NET. (2)": ""
            },
            "Droit de mutation par déces": {
                "Valeur des immeubles. (2)": ""
            },
            "Numéros des déclarations": {
                "Primitives": "",
                "Supplémentaires": ""
            },
            "Date": {
                "de l'expiration du délai de recti

In [126]:
 def refineLayout(draft, json_path):
        
    all_json_data = read_json(json_path)   
 
    prompt = f"""
        
        Your first draft:
        ```draft
        {draft}
        ```
        
        Example:
        ```json
        {all_json_data}
        ```  

        Errors: 
        Your first draft in the ```draft block contains some errors. 
        
        Context: 
        The content of your draft in ```draft block should follow the structure in the example in the ```json block. 
        Information for 'Actif.', 'Passif.' and 'Restant Net.' should exist for each dead person. So, read them well from the image.
        'Restant Net.' is 'Actif.' - 'Passif.'
        
        Task:
        Refine your first draft based on the example and the image.
        When you see Arrêté le \d{2} \w+ \d{4}( \w+)? servais, add it to a new key key called 'Note'. 
        If there is no information of the deceased name but only 'Arrêté le \d{2} \w+ \d{4}( \w+)? servais', add it under an empty name.
        Make sure to read the names of the people and the location as well as the dates and the numbers correctly.
        Do not make up information. 
        
        """
    return call(prompt, max_tokens = 3000, base64_image = encoded_image)

In [128]:
refine = refineLayout(draft_json, json_path = os.path.join(path, 'data/json_transcriptions/transcription_ex2.json'))

In [129]:
refine_json = refine.json()["choices"][0]["message"]["content"]
print(refine_json)

       {
    "entries": [
        {
            "N' d'ordre": "6",
            "Date du dépot des déclarations": "1897 8bre",
            "Désignation des personnes décédées ou absentes": {
                "Nom": "",
                "Prénoms": "",
                "Domiciles": ""
            },
            "Date du décès ous du jugement d'envoi en possession, en cas d'absence": "",
            "Noms, Prénoms et demeures des parties déclarantes": "",
            "Droits de succession en ligne collatérale et de mutation en ligne directe": {
                "Actif. (2)": "",
                "Passif. (2)": "",
                "Restant NET. (2)": ""
            },
            "Droit de mutation par décès": {
                "Valeur des immeubles. (2)": ""
            },
            "Numéros des déclarations": {
                "Primitives.": "",
                "Supplémentaires.": ""
            },
            "Date": {
                "de l'expiration du délai de rectification": "",
       

In [70]:
def checkNames(content, website_name, base64_image):
    prompt = f"""
    Your first draft:
    {content}
    
    Website:
    {website_name}
    
    Task:
    There are some transcription errors in 'Nom', 'Prénoms', and 'Noms, Prénoms et demeures des parties déclarantes' in your first draft.
    Read these items from the image again such that the corresponding names exist in Belgium according to the website.
    When you see Arrêté le \d{2} \w+ \d{4}( \w+)? servais, add it to a new key key called 'Note'. 
    If there is no information of the deceased name but only 'Arrêté le \d{2} \w+ \d{4}( \w+)? servais', add it under an empty name.
    Make sure to read the names of the people and the location as well as the dates and the numbers correctly.
    Only update those items. 
    Do not make up information.
    
    Tips:
    The family name in 'Nom' under 'Désignation des personnes décédées ou absentes.' may equal to the family name in 'Noms, Prénoms et demeures des parties déclarantes', which contains the family and first name of the declaring parties.
    But it is most likely that their first names (Prénoms) are different. 
    If the family names in 'Nom' and 'Noms, Prénoms et demeures des parties déclarantes' are the same, 'Prénoms' should be masculine.
    'Noms, Prénoms et demeures des parties déclarantes' may end with '& autre' or '& autres'.
   
    """
    return call(prompt, max_tokens=3000, base64_image=base64_image)

In [72]:
checkNames = checkNames(refine_json, "https://nl.geneanet.org/genealogie/", encoded_image)

In [73]:
checkNames_json = checkNames.json()["choices"][0]["message"]["content"]

In [74]:
print(checkNames_json )


    {
    "entries": [
        {
            "N' d'ordre": "6",
            "Date du dépot des déclarations": "1897 8bre",
            "Désignation des personnes décédées ou absentes.": {
                "Nom.": "",
                "Prénoms": "",
                "Domiciles": ""
            },
            "Date du décès ous du jugement d'envoi en possession, en cas d'absence.": "Arrêté le 10 7bre 1897 servais",
            "Noms, Prénoms et demeures des parties déclarantes.": "",
            "Droits de succession en ligne collatérale et de mutation en ligne directe.": {
                "Actif. (2)": "",
                "Passif. (2)": "",
                "Restant NET. (2)": ""
            },
            "Droit de mutation par déces": {
                "Valeur des immeubles. (2)": ""
            },
            "Numéros des déclarations": {
                "Primitives.": "",
                "Supplémentaires.": ""
            },
            "Date": {
                "de l'expiration du dél

There are still some errors in 'Domiciles'.
    Check whether 'Domiciles' in your draft in the ```json block exist in Sector_{lang} of the province data in the ```csv_1 block where Commune_{lang} is {municipality}.
    If not, consider the sectors in the province data.
    
    If you do not find anything that resembles the texts in the image in the province data, you can write it as you see but with an asterisk.
    
    Update 'Domiciles' in your draft in the ```json block.
    Do not make up for missing information.
    
    Tips:
    The sector name written in 'Domiciles' in the image may have similar name but not exactly the same as the sector name in the province data in ```csv block. 

In [94]:
def checkCities(content, country, province, municipality, location_path, base64_image, language="French", lang="FR"):
    txt = pd.read_csv(location_path, sep='\t')
    province = txt[txt['Province'] == {province}].copy()

    prompt = f"""
    
    Your draft:
     ```json 
    {content}
    ```
    
    Province data:
    ```txt
    {province}
    ```
    
    Task:
    There may be errors in the information you filled in 'Domiciles' in your draft in ```json block. Refine it.
    To improve 'Domiciles' in your draft, consult Sector_{lang} in the province data in ```txt block and the image. 
    When you see Arrêté le \d{2} \w+ \d{4}( \w+)? servais, add it to a new key key called 'Note'. 
    If there is no information of the deceased name but only 'Arrêté le \d{2} \w+ \d{4}( \w+)? servais', add it under an empty name.
    Make sure to read the names of the people and the location as well as the dates and the numbers correctly.
    
    Tips:
    The sector names in your draft and the province data may slightly differ.
    'Domiciles' in your draft should contain sector names in Sector_{lang}.

    
    """
    return call(prompt, max_tokens=3000, base64_image=base64_image)

In [96]:
location_path = os.path.join(path,'data_rag/BE_location_full.txt')

checkCities = checkCities(content = checkNames, country = "Belgium", province = "Brabant wallon", 
                          municipality = "Nivelles", location_path = location_path, 
                          base64_image = encoded_image, 
                          language = "French", lang="FR")

In [97]:
checkCities_json = checkCities.json()["choices"][0]["message"]["content"]

In [98]:
print(checkCities_json)


    {
      "Response": "[200]",
      "Domiciles": [
        {
          "Name": "Leboeuf",
          "Date of Decree": "1897",
          "Sector_FR": "Herve",
          "Note": "Arrêté le vingt-neuf octobre 1897 servais"
        },
        {
          "Name": "Lefevre",
          "Date of Decree": "1897",
          "Sector_FR": "Herve",
          "Note": "Arrêté le vingt-neuf octobre 1897 servais"
        },
        {
          "Name": "Desmet",
          "Date of Decree": "1897",
          "Sector_FR": "Bruxelles",
          "Note": "Arrêté le deux novembre 1897 servais"
        },
        {
          "Name": "Monseu",
          "Date of Decree": "1897",
          "Sector_FR": "Bruxelles",
          "Note": "Arrêté le deux novembre 1897 servais"
        },
        {
          "Name": "Bouly",
          "Date of Decree": "1897",
          "Sector_FR": "Bruxelles",
          "Note": "Arrêté le deux novembre 1897 servais"
        },
        {
          "Name": "Godart",
          "Dat

😢 Having troubles with the domiciles & numbers

# Try with iteration & examples