<a target="_blank" href="https://colab.research.google.com/github/jmanuelc87/nmp-autoavanza/blob/main/notebooks/MontePiedad_Extraction.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
import base64
import concurrent.futures
import csv
import getpass
import glob
import json
import math
import os
import pprint
import threading

import jsonlines
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytesseract as pyt
import torch
from langchain.chains import TransformChain
from langchain.output_parsers.json import SimpleJsonOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from tqdm import tqdm

## Extract Information using ChatOpenAI

In [251]:
# Credentials are taken from the environment and are never stored in the
# notebook itself. When the API key is absent (for example in a fresh Colab
# session) it is requested interactively; getpass keeps the typed value out of
# the cell output.
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OPENAI_API_KEY: ")
    # The organization and project identifiers are only set when a value is
    # actually entered; a blank answer leaves the variable unset.
    for optional_var in ("OPENAI_ORG_ID", "OPENAI_PROJECT_ID"):
        if optional_var not in os.environ:
            entered = input(f"{optional_var} (leave blank to skip): ").strip()
            if entered:
                os.environ[optional_var] = entered

In [102]:
# Chat model used by both the extraction and the normalization chains.
# temperature=0 is requested, presumably to keep repeated runs over the same
# document as stable as possible.
# To point the client at a local OpenAI-compatible server instead, pass
# base_url (for example 'http://localhost:1234/v1').
llm = ChatOpenAI(model="gpt-4.1-mini", temperature=0)

In [None]:
# Messages for the extraction prompt.
# - system (Spanish): act as a document-reading assistant that extracts the
#   fields of the image, answers concisely in JSON, and returns null for any
#   attribute whose value cannot be identified.
# - user: carries only the image, as a base64 data URL whose payload is filled
#   in from the {image} template variable.
extract = [
    dict(
        role="system",
        content=(
            "Eres un asistente lector de documentos servicial, "
            "usando OCR extraes los campos de la imagen, "
            "tus respuestas son concisas, respondes en formato json, "
            "y si no identificas el valor de un atributo "
            "retorna null para el valor del atributo"
        ),
    ),
    dict(
        role="user",
        content=[
            dict(
                type="image_url",
                image_url=dict(url="data:image/jpeg;base64,{image}"),
            ),
        ],
    ),
]

In [104]:
# Messages for the normalization prompt.
# - system (Spanish): receive a list of JSON objects and normalize them so that
#   all of them end up with the same number of attributes, answering concisely
#   in JSON and returning null for any attribute whose value is unknown.
# - user: two text parts, the attribute names to identify ({attributes}) and
#   the JSON payload to normalize ({json_str}).
normalize = [
    dict(
        role="system",
        content=(
            "Eres un asistente servicial, "
            "recibes una lista de objetos json y normalizas los objetos "
            "de forma que todos quedan con igual numero de atributos, "
            "tus respuestas son concisas, "
            "respondes en formato json los atributos identificados, "
            "y si no sabes el valor de un atributo "
            "retorna null para el valor del atributo"
        ),
    ),
    dict(
        role="user",
        content=[
            dict(type="text", text="Identifica los siguientes attributos: {attributes}"),
            dict(type="text", text="{json_str}"),
        ],
    ),
]

In [105]:
# Wrap the raw message lists in prompt templates so they can be piped into the
# model: prompt_extraction expects the {image} variable, prompt_normalize
# expects {attributes} and {json_str}.
prompt_extraction = ChatPromptTemplate(messages=extract)
prompt_normalize = ChatPromptTemplate(messages=normalize)

In [106]:
def load_image(inputs):
    """Read an image file and return its contents as base64 text.

    Args:
        inputs: Mapping that holds the file location under "image_path".

    Returns:
        A dict with a single key, "image", whose value is the base64 encoding
        of the file's bytes decoded as a UTF-8 string.

    Raises:
        KeyError: If "image_path" is missing from ``inputs``.
        OSError: If the file cannot be opened or read.
    """
    with open(inputs["image_path"], "rb") as handle:
        raw_bytes = handle.read()
    return {"image": base64.b64encode(raw_bytes).decode('utf-8')}

In [107]:
def dict_to_str(inputs):
    """Serialize ``inputs`` to JSON text.

    Args:
        inputs: Any JSON-serializable object; the whole object is dumped.

    Returns:
        A dict with a single key, "json_str", holding the ``json.dumps`` output.
    """
    return {"json_str": json.dumps(inputs)}

In [108]:
# Chain step that turns an "image_path" input into a base64 "image" value by
# calling load_image.
load_image_chain = TransformChain(
    transform=load_image,
    input_variables=["image_path"],
    output_variables=["image"],
)

# Chain step that produces "json_str" from an "output" input by calling
# dict_to_str.
# NOTE(review): dict_to_str serializes the whole inputs mapping, not just the
# "output" entry - confirm that is intended before wiring this chain in.
load_json_str_chain = TransformChain(
    transform=dict_to_str,
    input_variables=["output"],
    output_variables=["json_str"],
)

In [109]:
# Parser used as the last stage of both chains; presumably it turns the
# model's JSON reply into Python objects (the printed responses are dicts/lists).
parser = SimpleJsonOutputParser()

In [110]:
# Extraction pipeline: image path -> base64 image -> extraction prompt -> model -> parser.
extract_chain = load_image_chain | prompt_extraction | llm | parser

In [111]:
# Normalization pipeline: {attributes} + {json_str} -> normalization prompt -> model -> parser.
normalize_chain = prompt_normalize | llm | parser

In [112]:
# First sample document: the front side of an invoice (FAC_FRENTE), case 1.
# NOTE(review): the path is relative, so it depends on the kernel's working directory.
image_path = "./data/bronze/BASE_AUTOAVANZA/documentos_clean/FAC_FRENTE/Caso 1_TK 62853-1 FAC_FRENTE_otsu.jpg"

In [113]:
# Run the extraction chain on the case 1 image; this sends the image to the model.
response1 = extract_chain.invoke(input={"image_path": image_path})

In [114]:
# Pretty-print the fields extracted from the case 1 invoice.
# NOTE(review): the printed record contains personal data (customer name,
# phone number); clear or redact the cell output before sharing the notebook.
pprint.pprint(response1)

{'aduana': 'LAZARO CARDEI',
 'capacidad': '5',
 'celular_cliente': '3414145112',
 'clase': 'CAMIONETA',
 'clave_producto': '25101500',
 'clave_unidad': 'C62',
 'clave_vehicular': '1520601',
 'cliente': 'RODRIGUEZ ELIZONDO FRANCISCO',
 'codigo_postal': '49600',
 'combustible': 'GASOLINA',
 'condiciones_pago': 'TFS',
 'descripcion': 'UN VEHICULO NUEVO 7495 TOYOTA HILUX DOB CAB SR MODELO 2019 '
                'MOTOR: 2.7 LTS. TRANSMISION: MANUAL ORIGEN: IMPORTADA COLOR '
                'EXT. PLATA COLOR INT. NEGRO HILUX DOBLE CABINA SR, '
                'TRANSMISION MANUAL DE 5 VELOCIDADES, CAP CARGA 820 KG, MOTOR '
                '4 CILINDROS, VVT-I DUAL, 2.7 LTS, 166 HP, 16 VALVULAS, RINES '
                'DE ALUMINIO DE 17", AIRE ACONDICIONADO MANUAL, VESTIDURAS EN '
                'TELA, FAROS DE NIEBLA, SISTEMA DE AUDIO RADIO AM/FM/CD/BT/USB '
                'CON CAPACIDAD PARA LEER MP3 Y WMA; MINIJACK CON 4 BOCINAS, '
                'BOLSAS DE AIRE FRONTALES DE MULTIPLE ETA

In [115]:
# Second sample document: the front side of an invoice (FAC_FRENTE), case 2.
# NOTE(review): this rebinds image_path, so the value from the case 1 cell is
# lost once this cell has run.
image_path = "./data/bronze/BASE_AUTOAVANZA/documentos_clean/FAC_FRENTE/Caso 2_TK 63075-1 FAC_FRENTE_otsu.jpg"

In [116]:
# Run the extraction chain on the case 2 image; this sends the image to the model.
response2 = extract_chain.invoke(input={"image_path": image_path})

In [117]:
# Pretty-print the fields extracted from the case 2 invoice.
# NOTE(review): the printed record contains personal data (customer name,
# address); clear or redact the cell output before sharing the notebook.
pprint.pprint(response2)

{'certificado': '00001000000505147443',
 'cliente': {'direccion': 'CEREZO 632, Col. RESID EBANOS 5TO SECTOR, APODACA, '
                          'NUEVO LEON, Mexico, C.P.: 66612',
             'nombre': 'MARTHA ELENA ESCALANTE LIZCANO',
             'rfc': 'XAXX010101000'},
 'conceptos': {'cantidad': '1',
               'clave_unidad': 'XVN',
               'clave_vehicular': '0981407',
               'codigo_sat': '25101500',
               'descripcion': 'AUTO NUEVO MGMARCA MG ZS SUV COM EXC VERSION '
                              '1.5L COM EXCITE AT, 1.5 LTS., AUTOMATICO, 4 '
                              'CIL. &10;DISTRIBUIDOR MG SENDERO &10, COLOR '
                              'EXTERIOR AZUL LASER COLOR INTERIOR NEGRO &10, '
                              'COLOR INTERIOR',
               'descuento': '0.00',
               'importe': '366,293.10',
               'impuestos': {'iva': '58,606.90'},
               'pedimento': '22 51 1669 2005344',
               'valor_unitario': 

In [118]:
# Serialize both extraction results as one JSON array; this string is the
# {json_str} input of the normalization chain.
result = json.dumps([response1, response2])
# Display the serialized payload as the cell output.
result

'[{"cliente": "RODRIGUEZ ELIZONDO FRANCISCO", "direccion_cliente": "HIDALGO PONIENTE 473, ZAPOTILTIC CENTRO, ZAPOTILTIC, JALISCO, MEXICO", "telefono_cliente": "4145112", "celular_cliente": "3414145112", "codigo_postal": "49600", "factura": "GFU000000343", "fecha": "2019-10-01T17:39:5", "rfc": "ROEF-690604-6G3", "numero_inventario": "24108", "numero_serie": "MROEX8DD2K0186450", "condiciones_pago": "TFS", "procedencia": "TOYOTA TAILANDIA", "numero_pedimento_importacion": "195137889002692", "marca": "TOYOTA", "linea": "HILUX", "modelo": "2019", "clase": "CAMIONETA", "tipo": "HILUX DOB CAB SR", "clave_vehicular": "1520601", "numero_puertas": "4", "numero_cilindros": "4", "capacidad": "5", "combustible": "GASOLINA", "motor": "2TRA601886", "aduana": "LAZARO CARDEI", "fecha_aduana": "07/06/2019", "descripcion": "UN VEHICULO NUEVO 7495 TOYOTA HILUX DOB CAB SR MODELO 2019 MOTOR: 2.7 LTS. TRANSMISION: MANUAL ORIGEN: IMPORTADA COLOR EXT. PLATA COLOR INT. NEGRO HILUX DOBLE CABINA SR, TRANSMISION M

In [119]:
# Comma-separated attribute names that the normalization prompt asks the model
# to identify in every extracted record.
attributos = ",".join(
    (
        "Nombre del Cliente",
        "Numero de Serie o NIV",
        "Marca",
        "Modelo",
        "Año",
        "Version",
    )
)
# Normalize the two extracted invoices into records that share these attributes.
response = normalize_chain.invoke(
    input=dict(json_str=result, attributes=attributos),
)

In [120]:
# Pretty-print the normalized records returned by the normalization chain.
# NOTE(review): the printed records contain personal data (customer names);
# clear or redact the cell output before sharing the notebook.
pprint.pprint(response)

[{'Año': '2019',
  'Marca': 'TOYOTA',
  'Modelo': '2019',
  'Nombre del Cliente': 'RODRIGUEZ ELIZONDO FRANCISCO',
  'Numero de Serie o NIV': 'MROEX8DD2K0186450',
  'Version': None},
 {'Año': '2022',
  'Marca': 'MG',
  'Modelo': '2022',
  'Nombre del Cliente': 'MARTHA ELENA ESCALANTE LIZCANO',
  'Numero de Serie o NIV': 'LSJW74U94NZ186987',
  'Version': '1.1'}]


In [None]:
def extract_documents(documents, out_file, chain_extraction, chain_normalize, attributes, max_workers=3):
    """Run the extraction chain over every document and append the results to a JSON Lines file.

    Documents are processed concurrently by a thread pool of ``max_workers``
    threads. Results are collected in the same order as ``documents`` and the
    caller's list is not modified.

    Args:
        documents: Iterable of image paths; each one is passed to the
            extraction chain as ``{"image_path": path}``.
        out_file: Path of the JSON Lines file. It is opened in append mode, so
            running the function again adds the records a second time.
        chain_extraction: Object exposing ``invoke(input=...)``; its return
            value for each document is what gets written.
        chain_normalize: Currently unused; the normalization step is disabled.
        attributes: Currently unused; it is only needed by the disabled
            normalization step.
        max_workers: Maximum number of documents processed at the same time.

    Returns:
        None. The extracted records are written to ``out_file``.

    Raises:
        Exception: Whatever ``chain_extraction.invoke`` raises for a document
            is propagated, and in that case nothing is written. Errors raised
            while writing the file are caught and printed instead.
    """
    # Work on a snapshot so the caller's list is left untouched.
    pending = list(documents)

    # The context manager closes the progress bar even if a document fails.
    with tqdm(total=len(pending)) as progress:

        def process_document(doc):
            response = chain_extraction.invoke(input={"image_path": doc})
            progress.update()
            return response

        # executor.map submits every document before any result is awaited, so
        # up to max_workers extractions run at once; results keep input order.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            result_list = list(executor.map(process_document, pending))

    # Best-effort write: a failure here is reported together with the data so
    # the extracted records are not lost silently.
    try:
        with jsonlines.open(out_file, mode='a') as writer:
            writer.write_all(result_list)
    except Exception as e:
        print(f"Error {e} in {result_list}")

In [122]:
# Output file for the invoice extraction.
# NOTE(review): despite its name, csv_file points at a JSON Lines (.jsonl) file.
csv_file = "./data/bronze/documents/vehicles.jsonl"
# Invoice images to process. The pattern has no wildcard, so the list holds
# this single file when it exists and is empty otherwise.
invoices = glob.glob("./data/bronze/BASE_AUTOAVANZA/documentos_clean/FAC_FRENTE/Caso 1_TK 62853-1 FAC_FRENTE_otsu.jpg")

In [123]:
# Comma-separated attribute names for invoices.
attributos = ",".join(
    (
        "Nombre del Cliente",
        "Numero de Serie o NIV",
        "Marca",
        "Modelo",
        "Año",
        "Version",
        "Numero del Motor",
    )
)
# Extract every invoice image and append the records to the output file.
extract_documents(
    documents=invoices,
    out_file=csv_file,
    chain_extraction=extract_chain,
    chain_normalize=normalize_chain,
    attributes=attributos,
)

100%|██████████| 1/1 [00:23<00:00, 23.02s/it]


In [124]:
# Output file for the personal ID (INE_FRENTE) extraction.
# NOTE(review): this rebinds csv_file, which previously pointed at the invoice
# output; despite its name it is a JSON Lines (.jsonl) file.
csv_file = "./data/bronze/documents/credenciales.jsonl"
# ID images to process. The pattern has no wildcard, so the list holds this
# single file when it exists and is empty otherwise.
personal_ids = glob.glob("./data/bronze/BASE_AUTOAVANZA/documentos_clean/INE_FRENTE/Caso 1_TK 62853-2 INE_FRENTE_hsv_vchannel_clahe_adaptativo.jpg")

In [125]:
# Comma-separated attribute names for personal IDs.
attributos = ",".join(("Nombre", "Domicilio", "Vigencia"))
# Extract every ID image and append the records to the output file.
extract_documents(
    documents=personal_ids,
    out_file=csv_file,
    chain_extraction=extract_chain,
    chain_normalize=normalize_chain,
    attributes=attributos,
)

100%|██████████| 1/1 [00:16<00:00, 16.18s/it]


In [126]:
# Output file for the TC_FRENTE documents ("tarjetas_circulacion").
# NOTE(review): this rebinds csv_file again; despite its name it is a JSON
# Lines (.jsonl) file.
csv_file = "./data/bronze/documents/tarjetas_circulacion.jsonl"
# TC_FRENTE images to process. The pattern has no wildcard, so the list holds
# this single file when it exists and is empty otherwise.
tcs = glob.glob("./data/bronze/BASE_AUTOAVANZA/documentos_clean/TC_FRENTE/Caso 1_TK 62853-4 TC_FRENTE_clean.jpg")

In [128]:
# Comma-separated attribute names for the TC_FRENTE documents.
attributos = ",".join(
    (
        "Nombre",
        "Vigencia",
        "Placa",
        "Numero de Serie o NIV",
        "Marca",
        "Modelo",
        "Año",
        "Version",
        "Numero del Motor",
    )
)
# Extract every TC_FRENTE image and append the records to the output file.
extract_documents(
    documents=tcs,
    out_file=csv_file,
    chain_extraction=extract_chain,
    chain_normalize=normalize_chain,
    attributes=attributos,
)

100%|██████████| 1/1 [00:15<00:00, 15.53s/it]
