# Model Setup

In [1]:
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

load_dotenv();

In [2]:
import os, getpass

def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")

_set_env("OPENAI_API_KEY")
_set_env("OPENAI_API_BASE")
_set_env("OPENAI_MODEL")

In [3]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_BASE = os.environ.get("OPENAI_API_BASE")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL")

In [4]:
llm = ChatOpenAI(
    model=os.environ.get("OPENAI_MODEL"),
    openai_api_key=OPENAI_API_KEY,
    openai_api_base=OPENAI_API_BASE,
    max_tokens=1024
)

# PDF extractor Setup

In [5]:
from langchain_community.document_loaders import PyPDFLoader

In [55]:
file_paths = [
  # "/Users/paulochade/Documents/projects/kanja/dev/main/engine/notebooks/cert/pdf/pr/Certidao_Modelo_Negativa_1.pdf",
  # "/Users/paulochade/Documents/projects/kanja/dev/main/engine/notebooks/cert/pdf/pr/Certidao_Modelo_Negativa_2.pdf",
  "/Users/paulochade/Documents/projects/kanja/dev/main/engine/notebooks/cert/pdf/pr/Certidao_Modelo_Positiva_Suspensa_1.pdf",
  "/Users/paulochade/Documents/projects/kanja/dev/main/engine/notebooks/cert/pdf/pr/Certidao_Modelo_Positiva_Suspensa_2.pdf",
  "/Users/paulochade/Documents/projects/kanja/dev/main/engine/notebooks/cert/pdf/pr/Certidao_Modelo_Positiva.pdf",
  "/Users/paulochade/Documents/projects/kanja/dev/main/engine/notebooks/cert/pdf/sp/cnd_fazenda.pdf",
]

contents = []
for file_path in file_paths:
    loader = PyPDFLoader(file_path)
    docs = loader.load()
    # Combine all pages of current PDF into one string
    pdf_content = "\n\n".join([doc.page_content for doc in docs])
    contents.append(pdf_content)

print(contents)

[' \nPágina 1 de 1 \nEmitido via nnnnnnnnn (dd/mm/aaaa hh:mm:ss) por \n \n \n Certidão Positiva \n de Débitos Tributários e de Dívida Ativa Estadual  \ncom Efeitos de Negativa \n (Art. 206 do CTN) \nNº NNNNNNN N-DD \n \n \n \n \nCertidão fornecida para o (CNPJ/MF, CPF/MF): \nNome: com 150 posições exigidas pelo Cadastro Sincronizado \n \n \nRessalvado o direito da Fazenda Pública Estadual inscrever e cobrar débitos ainda não \nregistrados ou que venham a ser apurados, certificamos que, verificando os registros da Secretaria de \nEstado da Fazenda, constatamos existir pendências cadastradas em nome do contribuinte acima \nidentificado, nesta data, as quais estão com a exigibilidade suspensa nos termos dos incisos II, III e/ou \nVI, do art. 151, do Código Tributário Nacional (Lei 5.172/1966).  \n \nObs.: (para CPF) Esta Certidão engloba pendências do próprio CPF ou pelas quais tenha sido \nresponsabilizado e refere -se a débitos de natureza tributária e não tributária, bem como ao \ndesc

# Extractor setup

In [36]:
from typing import Optional, Literal
from datetime import datetime, date
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI

In [52]:
class TaxCertificate(BaseModel):
    """Information extracted from a São Paulo state tax certificate (Certidão de Débitos)."""

    cpf: Optional[str] = Field(
        default=None, 
        description="""
            The CPF number in format XXX.XXX.XXX-XX.
            It should only contain numbers from 0-9 or the letter X
            If not Sure, leave blank
        """
    )
    debt_exists: Optional[Literal["negative", "positive", "positive_suspended"]] = Field(
        default=None, 
        description="""
            Extract if there are any debts, looking for:
            * positive: ('constam débitos', positiva);
            * negative: ('não constam débitos', negativa);
            * positive_suspended: (positiva mas com efeito de negativa).
            FILL WITH one of the above options.
            IF NOT SURE, leave blank
        """
    )
    issuance_date: Optional[str] = Field(
        default=None,
        description="""
            The date when the certificate was issued in format DD/MM/YYYY.
            Look for 'Data e hora da emissão' or similar phrases.
            IF NOT SURE, leave blank
        """
    )
    validity_date: Optional[str] = Field(
        default=None,
        description="""
            The expiration date of the certificate in format DD/MM/YYYY.
            This can be either explicitly stated in the document or calculated by adding
            the validity period to the issuance date. For example, if issued on 10/08/2010
            with validity of 8 months, the validity date would be 10/04/2011.
            IF NOT SURE, leave blank
        """
    )
    certificate_number: Optional[str] = Field(
        default=None, 
        description="""
            The certificate number (número da certidão).
            It should only contain numbers from 0-9, and a maximun of one dash ('-') caracter.
            IF NOT SURE, leave blank
        """
    )

    @staticmethod
    def parse_date(date_str: Optional[str]) -> Optional[str]:
        """Parses a string in DD/MM/YYYY format and validates it."""
        if date_str:
            try:
                # Ensure the date is valid and properly formatted
                parsed_date = datetime.strptime(date_str, "%d/%m/%Y")
                return parsed_date.strftime("%d/%m/%Y")
            except ValueError:
                return None
        return None


    def __init__(self, **data):
        if 'issuance_date' in data:
            data['issuance_date'] = self.parse_date(data.get('issuance_date'))
        if 'validity_date' in data:
            data['validity_date'] = self.parse_date(data.get('validity_date'))
        super().__init__(**data)

In [53]:
# Define custom prompt template
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert at extracting information from Brazilian tax certificates. "
            "Extract only the specific information requested. "
            "Return null if you cannot find the information with certainty. "
            "For the debt_status, look specifically for phrases like 'constam débitos' or 'não constam débitos'. "
            "For CPF, ensure it's in the correct format with dots and dash."
            "For certificate_number, ensure that it only has one '-' dash caracter and no whitespaces"
        ),
        ("human", "{text}"),
    ]
)

structured_llm = llm.with_structured_output(schema=TaxCertificate)

In [54]:
for content in contents:
    result = structured_llm.invoke(content)
    print(result)   

cpf=None debt_exists='negative' issuance_date=None validity_date=None certificate_number='NNNNNNNN-DD'
cpf=None debt_exists='negative' issuance_date=None validity_date=None certificate_number='NNNNNNNN-DD'
cpf=None debt_exists='positive_suspended' issuance_date=None validity_date=None certificate_number='NNNNNNNN-N-DD'
cpf=None debt_exists='positive_suspended' issuance_date=None validity_date=None certificate_number='NNNNNNNNN-DD'
cpf='X' debt_exists='positive' issuance_date=None validity_date=None certificate_number='NNNNNNNN-DD'
cpf='527.281.628-59' debt_exists='negative' issuance_date='27/01/2025' validity_date='27/07/2025' certificate_number='25010999469-66'
