In [12]:
import cv2
import pytesseract
import re

# Load the image using OpenCV
# image_path = 'galicia_assets/galicia.jpeg'
image_path = 'banco_santa_fe_assets/banco_santa_fe.png'

image = cv2.imread(image_path)
# cv2.imread returns None (it does not raise) when the file is missing or cannot
# be decoded; fail here with a readable message instead of inside cvtColor.
if image is None:
    raise ValueError(f"Could not read image: {image_path}")

# Convert the image to grayscale
gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Use pytesseract to extract text from the grayscale image
extracted_text = pytesseract.image_to_string(gray_image)

# print(extracted_text)

lines = extracted_text.strip().split('\n')
# The payer name is picked by position (line index 24 of the OCR output).
# Guard the lookup so a shorter OCR result means "not found" rather than IndexError.
line_24 = lines[24] if len(lines) > 24 else ''
# line_34 = lines[34]
line = line_24


# regular expression patterns

bank_pattern = 'Banco Santa Fe'

date_pattern = r'\b\d{1,2}/\d{1,2}/\d{4}\b'
amount_pattern = r'\$\s*\d[\d,\.]*'
proof_number_pattern = r'\b\d{8}\b'

# The payer line is literal OCR text, not a regex: escape it so characters such
# as '(', '.', or '+' in a name cannot change the match or raise re.error.
payer_name_pattern = re.escape(line)

cuit_pattern = r'\b\d{11}\b'

# Find all occurrences of the bank name, dates, amounts, proof numbers,
# payer line and CUIT numbers in the extracted text

bank_found = re.findall(bank_pattern, extracted_text, re.IGNORECASE)

dates_found = re.findall(date_pattern, extracted_text)
amounts_found = re.findall(amount_pattern, extracted_text)
proof_number_found = re.findall(proof_number_pattern, extracted_text)
# An empty pattern would match at every position and yield '' as the payer,
# so a blank/missing payer line is treated as "nothing found".
payer_name_found = re.findall(payer_name_pattern, extracted_text) if line.strip() else []
cuit_found = re.findall(cuit_pattern, extracted_text)

# Keep the first occurrence of each field, or None when it was not found

bank = bank_found[0] if bank_found else None

date = dates_found[0] if dates_found else None
amount = amounts_found[0] if amounts_found else None

# Proof number: first standalone 8-digit run
proof_number = proof_number_found[0] if proof_number_found else None

# Payer name: the positional OCR line, without surrounding whitespace
# payer = re.sub(r'[0-9,-]+', '', payer_name_pattern)
payer = payer_name_found[0].strip() if payer_name_found else None

# CUIT: first standalone 11-digit run
cuit = cuit_found[0] if cuit_found else None


# Print the extracted information
print(f"Bank: {bank}")
print(f"Date found: {date}")
print(f"Amount found: {amount}")
print(f"Payer name: {payer}")
print(f"CUIT found: {cuit}")
print(f"Proof number found: {proof_number}")


Bank: Banco Santa Fe
Date found: 20/03/2024
Amount found: $ 329.999,67
Payer name: BIASI, PAULA MONICA
CUIT found: 27215943351
Proof number found: 00461229


In [3]:
# Extract the data from every receipt image in a folder and save it to an Excel file
import cv2
import pytesseract
import re
import os
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment

# Function to extract data from an image and return the extracted information
def extract_data_from_image(image_path, payer_line_index=24):
    """Run OCR on a receipt image and extract the fields of interest.

    The image is read with OpenCV, converted to grayscale and passed to
    pytesseract. Each field is the first match of its pattern in the OCR text;
    the payer name is taken by position from the newline-split OCR text.

    Args:
        image_path: Path of the image file to read.
        payer_line_index: Index of the OCR line that holds the payer name.
            Defaults to 24, the position that used to be hard-coded.

    Returns:
        A dict with the keys 'BANCO', 'FECHA', 'IMPORTE', 'TITULAR', 'CUIT'
        and 'NRO COMPROBANTE'. A value is None when the field was not found.

    Raises:
        ValueError: If OpenCV could not read the image (cv2.imread returned None).
    """
    image = cv2.imread(image_path)
    # cv2.imread returns None instead of raising when the file is missing or
    # cannot be decoded; report that clearly rather than failing inside cvtColor.
    if image is None:
        raise ValueError(f"Could not read image: {image_path}")

    # Grayscale input for the OCR step
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    extracted_text = pytesseract.image_to_string(gray_image)

    def first_match(pattern, flags=0):
        """Return the first match of `pattern` in the OCR text, or None."""
        found = re.search(pattern, extracted_text, flags)
        return found.group(0) if found else None

    # Define regular expression patterns for different types of data to extract
    bank_pattern = 'Banco Santa Fe'
    date_pattern = r'\b\d{1,2}/\d{1,2}/\d{4}\b'
    amount_pattern = r'\$\s*\d[\d,\.]*'
    proof_number_pattern = r'\b\d{8}\b'
    cuit_pattern = r'\b\d{11}\b'

    # The payer line is used as literal text. It is not fed back through `re`
    # as a pattern: a name containing '(', '.' or '+' would otherwise mis-match
    # or raise re.error. A missing or blank line means "no payer found".
    lines = extracted_text.strip().split('\n')
    try:
        payer = lines[payer_line_index].strip() or None
    except IndexError:
        payer = None

    # Return the extracted information as a dictionary
    return {
        # Bank name as it appears in the OCR text (case-insensitive search);
        # None when the receipt text does not contain it.
        'BANCO': first_match(bank_pattern, re.IGNORECASE),
        'FECHA': first_match(date_pattern),
        'IMPORTE': first_match(amount_pattern),
        'TITULAR': payer,
        'CUIT': first_match(cuit_pattern),
        'NRO COMPROBANTE': first_match(proof_number_pattern)
    }

# Create a new Excel workbook
wb = Workbook()
ws = wb.active
ws.title = "Extracted Data"

# Add headers to the Excel sheet
headers = [ 'FECHA', 'IMPORTE', 'NRO COMPROBANTE', 'TITULAR', 'CUIT', 'BANCO']
ws.append(headers)

# Apply bold style to the header cells
bold_font = Font(bold=True)
for cell in ws[1]:
    cell.font = bold_font

# Define the folder containing the images
folder_path = 'banco_santa_fe_assets'

# Process each image in the folder. Extensions are matched case-insensitively
# and the listing is sorted so the row order is the same on every run
# (os.listdir returns entries in arbitrary order).
image_extensions = (".jpg", ".png", ".jpeg")
for filename in sorted(os.listdir(folder_path)):
    if filename.lower().endswith(image_extensions):
        # Extract data from the image
        image_path = os.path.join(folder_path, filename)
        extracted_data = extract_data_from_image(image_path)

        # Add the extracted data to the Excel sheet, in header order
        row_data = [extracted_data[header] for header in headers]
        ws.append(row_data)

# Center and wrap every cell. This runs after the data rows are appended;
# doing it earlier only styles the header row, the sole row existing then.
centered = Alignment(horizontal='center', vertical='center', wrap_text=True)
for row in ws.iter_rows():
    for cell in row:
        cell.alignment = centered

# Automatically adjust column widths based on content
for column_cells in ws.columns:
    # Empty cells (None) do not contribute; every other value is measured
    # through str() so non-string values cannot raise.
    max_length = max(
        (len(str(cell.value)) for cell in column_cells if cell.value is not None),
        default=0,
    )
    adjusted_width = (max_length + 2) * 1.2  # Adjust the multiplier as needed
    ws.column_dimensions[column_cells[0].column_letter].width = adjusted_width

# Save the Excel file
# NOTE(review): the file name says "Banco_Ciudad" but the images come from
# 'banco_santa_fe_assets' — confirm whether the name should be updated.
wb.save('Banco_Ciudad_extracted_info.xlsx')


In [1]:
# Experiment: convert the PDF files in a folder to PNG images
from pdf2image import convert_from_path
import os

# Path to the folder containing the PDF files
pdf_folder_path = 'banco_santa_fe_assets'

# Get the list of PDF files in the folder (extension matched case-insensitively,
# sorted so the conversion order is the same on every run)
pdf_files = sorted(
    file for file in os.listdir(pdf_folder_path) if file.lower().endswith('.pdf')
)

# Convert each PDF file to images
for pdf_file in pdf_files:
    # Construct the file paths; the PNG is written next to its PDF
    pdf_file_path = os.path.join(pdf_folder_path, pdf_file)
    image_base_path = os.path.join(pdf_folder_path, os.path.splitext(pdf_file)[0])

    # Convert PDF to list of PIL images (one per page)
    images = convert_from_path(pdf_file_path)

    # Save each page of the PDF as an image file. A single-page PDF keeps the
    # plain '<name>.png'; a multi-page PDF gets one file per page, because
    # writing every page to the same path would keep only the last page.
    for page_number, image in enumerate(images, start=1):
        if len(images) == 1:
            image_file_path = image_base_path + '.png'
        else:
            image_file_path = f'{image_base_path}_page{page_number}.png'
        image.save(image_file_path, 'PNG')

print('PDFs converted to images successfully.')


PDFs converted to images successfully.
