<a target="_blank" href="https://colab.research.google.com/github/jmanuelc87/nmp-autoavanza/blob/main/notebooks/MontePiedad_Extraction.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [21]:
import os
import csv
import glob
import json
import math
import torch
import base64
import jsonlines
import concurrent.futures
import threading
import numpy as np
import pytesseract as pyt
import matplotlib.pyplot as plt

from tqdm import tqdm

from pydantic import BaseModel, Field

from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain.chains import TransformChain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from langchain.chains.question_answering import load_qa_chain


## Extract Information using ChatOpenAI

In [None]:
if "OPENAI_API_KEY" not in os.environ:
    os.environ['OPENAI_ORG_ID'] = '***'
    os.environ['OPENAI_PROJECT_ID'] = '***'
    os.environ['OPENAI_API_KEY'] = '***'

In [23]:
llm = ChatOpenAI(
    model = "gpt-4.1-mini",
    temperature=0,
    # base_url='http://localhost:1234/v1',
)

In [24]:
extract = [
    {
        "role": "system",
        "content": "Eres un asistente lector de documentos servicial, usando OCR extraes los campos de la imagen, tus respuestas son concisas, respondes en formato json, y si no sabes el valor de un atributo retorna null para el valor del atributo",
    },
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,{image}"}},
        ],
    },
]

In [25]:
normalize = [
    {
        "role": "system",
        "content": "Eres un asistente servicial, recibes una lista de objetos json y normalizas los objetos de forma que todos quedan con igual numero de atributos, tus respuestas son concisas, respondes en formato json, y si no sabes el valor de un atributo retorna null para el valor del atributo"
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "{json_str}"}
        ]
    }
]

In [26]:
prompt_extraction = ChatPromptTemplate(messages=extract)
prompt_normalize = ChatPromptTemplate(messages=normalize)

In [27]:
def load_image(inputs):
    """Load image from file and encode it as base64."""
    image_path = inputs["image_path"]
  
    def encode_image(image_path):
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')
    image_base64 = encode_image(image_path)
    return {"image": image_base64}

In [28]:
def dict_to_str(inputs):
    json_str = json.dumps(inputs)

    return {"json_str": json_str}

In [29]:
load_image_chain = TransformChain(
    input_variables=["image_path"], output_variables=["image"], transform=load_image
)

load_json_str_chain = TransformChain(
    input_variables=["output"], output_variables=["json_str"], transform=dict_to_str
)

In [30]:
parser = SimpleJsonOutputParser()

In [31]:
extract_chain = load_image_chain | prompt_extraction | llm | parser

In [32]:
normalize_chain = prompt_normalize | llm | parser

In [33]:
image_path = "./data/bronze/BASE_AUTOAVANZA/documentos_clean/FAC_FRENTE/Caso 1_TK 62853-1 FAC_FRENTE_otsu.jpg"

In [34]:
response1 = extract_chain.invoke(input={"image_path": image_path})

In [35]:
import pprint

pprint.pprint(response1)

{'aduana': 'LAZARO CARDEI',
 'capacidad': '5',
 'celular_cliente': '3414145112',
 'clase': 'CAMIONETA',
 'clave_producto': '25101500',
 'clave_unidad': 'C62',
 'clave_vehicular': '1520601',
 'cliente': 'RODRIGUEZ ELIZONDO FRANCISCO',
 'codigo_postal': '49600',
 'combustible': 'GASOLINA',
 'condiciones_pago': 'TFS',
 'descripcion': 'UN VEHICULO NUEVO 7495 TOYOTA HILUX DOB CAB SR MODELO 2019 '
                'MOTOR: 2.7 LTS. TRANSMISION: MANUAL ORIGEN: IMPORTADA PEDIDO: '
                '314 COLOR EXT. PLATA COLOR INT. NEGRO HILUX DOBLE CABINA SR, '
                'TRANSMISION MANUAL DE 5 VELOCIDADES, CAP CARGA 820 KG, MOTOR '
                '4 CILINDROS, VVT-I DUAL, 2.7 LTS, 166 HP, 16 VALVULAS, RINES '
                'DE ALUMINIO DE 17", AIRE ACONDICIONADO MANUAL, VESTIDURAS EN '
                'TELA, FAROS DE NIEBLA, SISTEMA DE AUDIO RADIO AM/FM/CD/BT/USB '
                'CON CAPACIDAD PARA LEER MP3 Y WMA; MINIJACK CON 4 BOCINAS, '
                'BOLSAS DE AIRE FRONTALES DE 

In [36]:
image_path = "./data/bronze/BASE_AUTOAVANZA/documentos_clean/FAC_FRENTE/Caso 2_TK 63075-1 FAC_FRENTE_otsu.jpg"

In [37]:
response2 = extract_chain.invoke(input={"image_path": image_path})

In [38]:
import pprint

pprint.pprint(response2)

{'Conceptos': [{'Cantidad': 1,
                'ClaveUnidad': 'XVN',
                'ClaveVehicular': '0981407',
                'CodigoSAT': '25101500',
                'Descripcion': 'AUTO NUEVO MGMARCA MG ZS SUV COM EXC VERSION '
                               '1.5L COM EXCITE AT, 1.5 LTS., AUTOMATICO, 4 '
                               'CIL. &10;DISTRIBUIDOR MG SENDERO &10, COLOR '
                               'EXTERIOR AZUL LASER COLOR INTERIOR NEGRO &10, '
                               'COLOR INTERIOR',
                'Descuento': 0.0,
                'Importe': 366293.1,
                'Impuestos': {'IVA': 58606.9},
                'Pedimento': '22 51 1669 2005344',
                'ValorUnitario': 366293.1,
                'Version': '1.1'}],
 'CondicionesPago': {'CondicionesPago': 'CONTADO',
                     'FormaPago': '03 - Transferencia electrónica de fondos',
                     'MetodoPago': 'PPD - Pago en parcialidades o diferido'},
 'Emisor': {'Domicilio': '

In [39]:
result = json.dumps([response1, response2])
result

'[{"cliente": "RODRIGUEZ ELIZONDO FRANCISCO", "direccion_cliente": "HIDALGO PONIENTE 473, ZAPOTILTIC CENTRO, ZAPOTILTIC, JALISCO, MEXICO", "telefono_cliente": "4145112", "celular_cliente": "3414145112", "codigo_postal": "49600", "factura": "GFU000000343", "fecha": "2019-10-01T17:39:5", "rfc": "ROEF-690604-6G3", "numero_inventario": "24108", "numero_serie": "MROEX8DD2K0186450", "condiciones_pago": "TFS", "procedencia": "TOYOTA TAILANDIA", "numero_pedimento_importacion": "195137889002692", "marca": "TOYOTA", "linea": "HILUX", "modelo": "2019", "clase": "CAMIONETA", "tipo": "HILUX DOB CAB SR", "clave_vehicular": "1520601", "numero_puertas": "4", "numero_cilindros": "4", "capacidad": "5", "combustible": "GASOLINA", "motor": "2TRA601886", "aduana": "LAZARO CARDEI", "fecha_aduana": "07/06/2019", "descripcion": "UN VEHICULO NUEVO 7495 TOYOTA HILUX DOB CAB SR MODELO 2019 MOTOR: 2.7 LTS. TRANSMISION: MANUAL ORIGEN: IMPORTADA PEDIDO: 314 COLOR EXT. PLATA COLOR INT. NEGRO HILUX DOBLE CABINA SR, T

In [40]:
response = normalize_chain.invoke(input={"json_str": result})

In [41]:
import pprint

pprint.pprint(response)

[{'Conceptos': None,
  'CondicionesPago_CondicionesPago': None,
  'CondicionesPago_FormaPago': None,
  'CondicionesPago_MetodoPago': None,
  'Emisor_Domicilio': None,
  'Emisor_Nombre': None,
  'Emisor_RFC': None,
  'Emisor_Regimen': None,
  'Factura_FechaHoraEmision': None,
  'Factura_Folio': None,
  'Factura_LugarExpedicion': None,
  'Factura_TipoRelacion': None,
  'Factura_UUID': None,
  'Observaciones': None,
  'Receptor_Domicilio': None,
  'Receptor_Nombre': None,
  'Receptor_RFC': None,
  'Receptor_UsoCFDI': None,
  'TimbreFiscal_FechaHoraCertificacion': None,
  'TimbreFiscal_FolioFiscal': None,
  'TimbreFiscal_NoCertificadoSAT': None,
  'TimbreFiscal_RFC_PAC': None,
  'Totales_Descuento': None,
  'Totales_IVA': None,
  'Totales_SubTotal': None,
  'Totales_Total': None,
  'Totales_TotalLetra': None,
  'Vehiculo_Capacidad': None,
  'Vehiculo_Clase': None,
  'Vehiculo_ClaveVehicular': None,
  'Vehiculo_Combustible': None,
  'Vehiculo_Linea': None,
  'Vehiculo_Marca': None,
  'Vehic

In [42]:
csv_file = "./data/bronze/documents/vehicles.jsonl"
invoices = glob.glob("./data/bronze/BASE_AUTOAVANZA/documentos_clean/FAC_FRENTE/*_otsu.jpg")

In [45]:
def extract_documents(documents, out_file, chain_extraction, chain_normalize, max_workers=3):
    t = tqdm(total=len(documents))

    def process_document(doc):
        response = chain_extraction.invoke(input={"image_path": doc})
        t.update()
        return response

    def write_document(data):
        try:
            with jsonlines.open(out_file, mode='a') as writer:
                writer.write_all(data)
        except Exception as e:
            print(f"Error {e} in {data}")

    result_list = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        while len(documents) > 0:
            doc = documents.pop()
            future = executor.submit(process_document, doc)
            result_list.append(future.result())

    result = chain_normalize.invoke(input={"json_str": json.dumps(result_list)})
    write_document(result)

In [44]:
extract_documents(invoices, csv_file, extract_chain, normalize_chain)

100%|██████████| 37/37 [15:56<00:00, 25.86s/it]


In [46]:
csv_file = "./data/bronze/documents/credenciales.jsonl"
personal_ids = glob.glob("./data/bronze/BASE_AUTOAVANZA/documentos_clean/INE_FRENTE/*_adaptativo.jpg")

In [47]:
extract_documents(personal_ids, csv_file, extract_chain, normalize_chain)

100%|██████████| 36/36 [09:26<00:00, 15.73s/it]


In [49]:
csv_file = "./data/bronze/documents/tarjetas_circulacion.jsonl"
tcs = glob.glob("./data/bronze/BASE_AUTOAVANZA/documentos_clean/TC_FRENTE/*_adaptativo.jpg")

In [50]:
extract_documents(tcs, csv_file, extract_chain, normalize_chain)

100%|██████████| 35/35 [13:17<00:00, 22.79s/it]
