In [80]:
import requests
import pandas as pd
import numpy as np
import os
from typing import List
ocr_endpoint = "http://localhost:8901/txt/blocks-words" # url de l'OCR
test_image_path = "../data/raw/facture_test.png" 
data_path = "../data/" # chemin du dossier data

In [81]:
def get_full_text(image: str) -> str:
    """Envoi une image à l'API d'océrisation et retourne le texte."""
    with open(image, "rb") as file:
        files = {"file": file}
        response = requests.post(ocr_endpoint, files=files)
        return response.text


def get_processed_dataset(path_to_dataset: str) -> pd.DataFrame:
    """Récupère ou crée le dataset contenant le texte océrisé"""
    
    if os.path.isfile(path_to_dataset):
        processed_dataset = pd.read_csv(path_to_dataset)
    else:
        processed_dataset = pd.DataFrame(columns=['filename','new_type','original_type','motif_rejet','true_cat','inclusion_dataset','excluded_types','grouped_type', 'full_text', 'cleaned_text'], dtype="object")
        processed_dataset.to_csv(path_to_dataset, index=None)
    return processed_dataset


def get_new_images_to_ocerize(raw_dataset: pd.DataFrame, processed_dataset: pd.DataFrame) -> List[str]:
    """Compare les deux fichiers et renvoi seulement les images qui ne sont pas déjà océrisées."""
    
    filenames = np.setdiff1d(raw_dataset["filename"].values, processed_dataset["filename"].values)

    return raw_dataset[raw_dataset["filename"].isin(filenames)]

def save_text_to_file(text:str, path: str):
    """Enregistre le texte océrisé dans un fichier .txt"""
    with open(path, "w") as txt_file:
        txt_file.write(text)


In [82]:
path_to_dataset = f"{data_path}/processed/processed_dataset.csv"
raw_dataset = pd.read_csv(f"{data_path}dataset.csv")
processed_dataset = get_processed_dataset(path_to_dataset)

new_images = get_new_images_to_ocerize(raw_dataset, processed_dataset)

for _, row in new_images.head(10).iterrows():                                               # TODO : Attention j'ai mis head(10) pour tester, à enlever
    full_text = get_full_text(f"{data_path}raw/final/{row.filename}")

    text_file_path = f"{data_path}/processed/{row.filename}.txt"
    save_text_to_file(full_text, text_file_path)
    new_images.loc[new_images['filename']==row.filename, 'full_text'] = text_file_path 

processed_dataset = pd.concat([processed_dataset, new_images.head(10)],ignore_index=True)   # TODO : Attention j'ai mis head(10) pour tester, à enlever
processed_dataset.to_csv(path_to_dataset, index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_images.loc[new_images['filename']==row.filename, 'full_text'] = text_file_path


In [10]:
result = get_full_text(test_image_path)
print(result)

<Response [200]>


In [None]:
# save text to csv

In [7]:
clean_url = "http://localhost:8903/clean" # url du cleaner/tokenizer
def get_cleaned_text(text) -> str:
    print(clean_url)
    headers = {'Content-Type': 'text/plain'}
    params = {
        "text": text
    }
    response = requests.post(clean_url, params=params, headers=headers)
    return response

In [71]:
get_cleaned_text(result.text).text

http://localhost:8903/clean


"factur logo vendeur entreprise 22 avenue voltair 13000 marseill michel acheteur 31 rue forêt 13100 alx-en-provenc client date facturation numéro facture échéance paiemer 30 jour référence 1436 2.6.2021 143 16.6.2021 information additionnalle service vente garantie an description main - oeuvre produire quantité 10 unité pc prix unitaire ht tva total ttc 00,00 105,00 20 210,00 300,00 260,00 350,00 270,00 620,00 voitare marselll '' siren siret 24567 intrer frxx 999999999 onordtinnnéer pieme fomiseer telephone 99 e-mall pemegmacompagnien wwww macompagnie com petalle bancairen anbueg ban bic sur swi np pariba fr23 4112 4098 23 00hcs"

In [60]:
print(clean_url)

http://localhost:8903/clean
