In [1]:
import os
import csv
import json
import pdfplumber
from openai import OpenAI
from dotenv import load_dotenv

In [2]:
# Load environment variables
load_dotenv()

True

In [3]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [4]:
data_path = os.path.exists("../data/raw")
data_path

True

In [5]:
# OpenAI API Test

client = OpenAI(api_key=OPENAI_API_KEY)
system_prompt = "You are an AI Asistant expert in python"
user_prompt = "Give me the pythin zen"

completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user","content": user_prompt}
    ],
    temperature=0.0
)

print(completion.choices[0].message.content)

The Zen of Python is a collection of guiding principles for writing computer programs in the Python language. You can access it by running the following command in a Python interpreter:

```python
import this
```

Here are the principles from the Zen of Python:

```
Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than right now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, 

In [6]:
# Function to extract text from PDF file
def extract_text_from_pdf(pdf_path):
    """
    Extracts text page-by-page from a PDF file using pdfplumber.
    Returns a list of strings, each representing the text of a page.
    """
    pages_text = []
    with pdfplumber.open(pdf_path) as pdf_file:
        for page in pdf_file.pages:
            text = page.extract_text()
            if text:
                pages_text.append(text)
    return pages_text

In [7]:
# test function

pages_text = extract_text_from_pdf("../data/raw/credit-card/202109.pdf")
print(pages_text)
print(type(pages_text))
print(len(pages_text))


['JASON/SEPULVEDA SALAZAR\nPAN/6/SAN FRANCISCO\nSAN FRANCISCO SEPTIEMBRE 2021\nPH TEE ONE, 19A. REPUBLICA DE\nLA INDIA CON AV BELISARIO PORR\nRESUMEN DE ESTADO DE CUENTA DE TARJETA DE CRÉDITO\nInformación de la Cuenta Información de Pago\nN° Cuenta 4029-****-****-5075 Saldo $825.56\nProducto VISA DORADA CONNECTMILES DORADA Pago Mínimo $20.00\nLímite $2,800.00 Pago de Contado $825.00\nDisponible $1,974.44 Fecha Límite de Pago de Contado 20/SEP/2021\nFecha de Corte 03/09/2021 Fecha Límite de Pago 24/SEP/2021\nFecha de Corte Anterior 03/08/2021\nPrograma de Puntos PUNTOS\nCONNECTMILES\nPuntos Disponibles 808\nAtención al cliente: 210-4652 1 de 3', 'JASON/SEPULVEDA SALAZAR\nPAN/6/SAN FRANCISCO\nSAN FRANCISCO\nPH TEE ONE, 19A. REPUBLICA DE\nLA INDIA CON AV BELISARIO PORR\nDETALLE DE CUENTA\nFecha N° de Referencia Descripción Monto\nSaldo Anterior $2,493.62\n4029-****-****-5075\nJASON/SEPULVEDA SALAZAR\nAGO/04 08041440094965214 SU PAGO RECIBIDO GRACIAS $2,493.62-\nAGO/06 080668601583 001 PAP

In [9]:
def call_openai_api_for_card_data_extraction(text):
    """
    Sends 'text' to the OpenAI model with instructions to extract
    banking transactions from credit card statement in a JSON format.

    Returns a list of dictionaries with the following keys:
        - date
        - reference
        - description
        - amount (float)
    """
    # Define the system and user prompts
    system_prompt = (
        "You are an assistant that extracts banking transactions from credit card statement from text. "
        "Each credit card transaction typically appears in the format:\n\n"
        "DATE REFERENCE DESCRIPTION AMOUNT\n\n"
        "For example:\n"
        "AGO/04 08041440094965214 PEDIDOSYA $2,493.62-\n\n"
        "Instructions:\n"
        "1. Identify all lines that represent transactions.\n"
        "2. Return an array of JSON objects with these fields:\n"
        "   - date\n"
        "   - reference\n"
        "   - description\n"
        "   - amount (float)\n"
        "   (If the amount ends with '-', make it negative.)\n"
        "3. Return only the JSON, with no extra text.\n"
    )

    user_prompt = f"PDF text:\n\n{text}\n\nExtract the transactions in JSON."

    try:
        # Call the OpenAI API
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.0,
        )

        # Extract the data from the API response
        content = completion.choices[0].message.content
        transactions = json.loads(content)

        # return a list
        if isinstance(transactions, dict):
            transactions = [transactions]
        return transactions
    
    except Exception as e:
        print("Error when processing text with OpenAI:", e)
        return []

In [10]:
response = call_openai_api_for_card_data_extraction(pages_text[1])
print(response)
print(type(response))
print(len(response))

[{'date': 'AGO/04', 'reference': '08041440094965214', 'description': 'SU PAGO RECIBIDO GRACIAS', 'amount': -2493.62}, {'date': 'AGO/06', 'reference': '080668601583', 'description': '001 PAPA JOHNS CALLE 50', 'amount': 25.28}, {'date': 'AGO/07', 'reference': '080768622675', 'description': '132 REY CALLE 50', 'amount': 128.06}, {'date': 'AGO/11', 'reference': '081379000605', 'description': '102 AL VECCHIO FORNO BELLA VISTA', 'amount': 26.9}, {'date': 'AGO/14', 'reference': '081579000605', 'description': '808 REST Y PIZZERIA LA LOC JUAN DIAZ', 'amount': 51.95}, {'date': 'AGO/15', 'reference': '081979000605', 'description': '372 PEDIDOSYA BELLA VISTA', 'amount': 7.49}, {'date': 'AGO/16', 'reference': '081668622696', 'description': '025 FARMACIA ARROCHA CALLE 50', 'amount': 55.91}, {'date': 'AGO/20', 'reference': '082279000605', 'description': '916 L Azotea Panama Panama', 'amount': 94.77}, {'date': 'AGO/21', 'reference': '082268687499', 'description': '309 PRICESMART -I-', 'amount': 292.3}

In [11]:
def save_transactions_to_csv(transactions_list, csv_filename="extracted_transactions.csv"):
    """
    Creates the 'data/preprocessed' folder if it doesn't exist, 
    and saves the transactions to a CSV file inside that folder.
    """
    preprocessed_folder = "data/preprocessed"
    os.makedirs(preprocessed_folder, exist_ok=True)
    
    output_csv_path = os.path.join(preprocessed_folder, csv_filename)

    # Define the columns to write in the CSV
    fieldnames = ["date", "reference", "description", "amount", "file", "page"]

    # Write the CSV file
    with open(output_csv_path, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for transaction in transactions_list:
            writer.writerow(transaction)

    print(f"Transactions extracted and saved in '{output_csv_path}'.")

In [12]:
def process_pdfs_in_folder(input_folder):
    """
    Iterates over all PDF files in 'input_folder', extracts text page by page,
    calls the OpenAI API for each page to identify transactions, and accumulates them.
    Finally, saves the transactions to a CSV file in the 'data/preprocessed' folder.
    """
    all_transactions = []

    for filename in os.listdir(input_folder):
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(input_folder, filename)
            print(f"Processing: {pdf_path}")

            # 1. Extract text from each page of the PDF
            pages_text = extract_text_from_pdf(pdf_path)

            # 2. For each page, call OpenAI to identify transactions
            for page_index, text in enumerate(pages_text, start=1):
                extracted_transactions = call_openai_api_for_card_data_extraction(text)
                
                # Add metadata for file and page
                for transaction in extracted_transactions:
                    transaction["file"] = filename
                    transaction["page"] = page_index
                
                all_transactions.extend(extracted_transactions)

    # 3. Save all transactions to a CSV
    save_transactions_to_csv(all_transactions, csv_filename="extracted_transactions.csv")

In [13]:
input_folder_path = "/Users/jasonssdev/Dev/Projects/bank-transactions/data/raw"
process_pdfs_in_folder(input_folder_path)

Transactions extracted and saved in 'data/preprocessed/extracted_transactions.csv'.
