# mercado pago

# In [2]:
# Extracts data from the image files in a folder and saves it to an Excel workbook
import cv2
import pytesseract
import re
import os
from openpyxl import Workbook

# Helper shared by every field lookup in extract_data_from_image
def _first_match(pattern, text):
    """Return the first ``re.findall`` result for ``pattern`` in ``text``.

    For a pattern with one capturing group the result is that group's text;
    for a pattern without groups it is the whole match. Returns ``None``
    when the pattern does not occur in ``text``.
    """
    matches = re.findall(pattern, text)
    return matches[0] if matches else None


# Function to extract data from an image and return the extracted information
def extract_data_from_image(image_path):
    """OCR an image and pull the first date, amount, payer, CUIT and proof number.

    The image is loaded with OpenCV, converted to grayscale, and passed to
    ``pytesseract.image_to_string``. Each field is then located in the OCR
    text with a regular expression; only the first occurrence is kept.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A dict with the keys ``'Bank'``, ``'Date'``, ``'Amount'``,
        ``'Payer Name'``, ``'CUIT'`` and ``'Proof Number'``. ``'Bank'`` is
        always ``'mercado pago'``; every other value is the matched text, or
        ``None`` when nothing in the OCR output matched that field.

    Raises:
        cv2.error: If the file cannot be read as an image.
    """
    # cv2.imread does not raise on a missing/unreadable/unsupported file; it
    # returns None. Fail here with a message naming the file instead of
    # letting cvtColor abort later with an opaque assertion error.
    image = cv2.imread(image_path)
    if image is None:
        raise cv2.error(f"Could not read image file: {image_path!r}")

    # Convert the image to grayscale before running OCR on it
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Use pytesseract to extract text from the grayscale image
    extracted_text = pytesseract.image_to_string(gray_image)

    # Regular expression patterns for the different fields to extract
    # Date: 1-2 digits, " de ", a lowercase word, a space, then 4 digits
    date_pattern = r'\b\d{1,2} de [a-z]+ \d{4}\b'
    # Amount: "$", optional whitespace, a digit, then any digits/commas/dots
    amount_pattern = r'\$\s*\d[\d,\.]*'
    # Payer: the text between square brackets that follow "e " (captured group)
    payer_name_pattern = r'e \[(.+?)\]'
    # CUIT: 2 digits, hyphen, 8 digits, hyphen, 1 digit
    cuit_pattern = r'\b\d{2}-\d{8}-\d{1}\b'
    # Proof number: a standalone run of exactly 11 digits
    proof_number_pattern = r'\b\d{11}\b'

    # Return the extracted information as a dictionary
    return {
        'Bank': 'mercado pago',
        'Date': _first_match(date_pattern, extracted_text),
        'Amount': _first_match(amount_pattern, extracted_text),
        'Payer Name': _first_match(payer_name_pattern, extracted_text),
        'CUIT': _first_match(cuit_pattern, extracted_text),
        'Proof Number': _first_match(proof_number_pattern, extracted_text),
    }

# Create a new Excel workbook
wb = Workbook()
ws = wb.active
ws.title = "Extracted Data"

# Add headers to the Excel sheet; this list also fixes the column order
# used for every data row appended below
headers = ['Bank', 'Date', 'Amount', 'Payer Name', 'CUIT', 'Proof Number']
ws.append(headers)

# Define the folder containing the images
folder_path = 'mercado_pago_assets'

# Image extensions to process. File names are lowercased before the
# comparison so that files such as "IMG_001.JPG" are not silently skipped.
image_extensions = (".jpg", ".png", ".jpeg")

# Process each image in the folder. os.listdir returns entries in arbitrary
# order, so sort them to make the row order of the spreadsheet reproducible.
for filename in sorted(os.listdir(folder_path)):
    if not filename.lower().endswith(image_extensions):
        continue

    # Extract data from the image
    image_path = os.path.join(folder_path, filename)
    extracted_data = extract_data_from_image(image_path)

    # Add the extracted data to the Excel sheet, in header order
    row_data = [extracted_data[header] for header in headers]
    ws.append(row_data)

# Save the Excel file
wb.save('extracted_info.xlsx')
