In [15]:
import cv2
import pytesseract
import pandas as pd
import os
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Alignment

def clean_numeric(value):
    """Return *value* stripped of every character except digits, ',' and '.'.

    The relative order of the surviving characters is preserved.
    """
    kept = []
    for char in value:
        if char in ',.' or char.isdigit():
            kept.append(char)
    return ''.join(kept)

# Directory scanned for input images, and path of the Excel file to write.
image_directory, excel_path = 'savedimages', 'invoice_data.xlsx'

# Maps the class prefix taken from an image filename to its spreadsheet
# column header. Include additional mappings as needed.
column_mappings = dict(
    BA='Billing Address',
    INV='Invoice #',
    SA='Shipping Address',
    ORD_DATE='Order Date',
    INV_DATE='Invoice Date',
    TOTAL='Total Amount',
)

# One record for the invoice being processed: every column starts out blank.
invoice_data = dict.fromkeys(column_mappings.values(), "")

# Empty results table whose columns follow the mapping's order.
df = pd.DataFrame(columns=list(column_mappings.values()))

# Point pytesseract at the Tesseract executable. This is a hard-coded Windows
# install location; adjust it on machines where Tesseract lives elsewhere.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Path to Tesseract-OCR

# OCR every PNG in image_directory whose filename prefix names a known field
# class, storing the recognised text under the matching column of invoice_data.
# Iterating in sorted order keeps the result reproducible: os.listdir() returns
# entries in arbitrary order, and when several images share a class the last
# one processed overwrites the earlier ones.
for filename in sorted(os.listdir(image_directory)):
    # Accept ".PNG" as well as ".png".
    if not filename.lower().endswith(".png"):
        continue

    # The class name is everything before the last underscore of the filename.
    parts = filename.split('_')
    class_name = '_'.join(parts[:-1])
    # Check the class before doing any OCR so unmapped images cost nothing.
    if class_name not in column_mappings:
        continue

    file_path = os.path.join(image_directory, filename)
    img = cv2.imread(file_path)
    if img is None:
        # cv2.imread reports an unreadable or corrupt file by returning None
        # instead of raising; skip it rather than crash inside pytesseract.
        print("Skipping unreadable image:", file_path)
        continue

    extracted_text = pytesseract.image_to_string(img)
    column_name = column_mappings[class_name]
    if column_name == 'Total Amount':
        # Keep only digits and separators for the amount column.
        extracted_text = clean_numeric(extracted_text)
    invoice_data[column_name] = extracted_text.strip()

# Append the collected invoice record as one row of the results table.
new_row = pd.DataFrame([invoice_data])
df = pd.concat([df, new_row], ignore_index=True)

# Name the sheet explicitly so the lookup below does not rely on pandas'
# default, and use the writer as a context manager so it is always closed,
# even if an error occurs while sizing the columns.
sheet_name = 'Sheet1'
with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
    df.to_excel(writer, index=False, sheet_name=sheet_name)
    worksheet = writer.sheets[sheet_name]

    # Set each column's width to the character length of its longest cell
    # (the header row is included in worksheet.columns).
    for column_cells in worksheet.columns:
        length = max(len(str(cell.value)) for cell in column_cells)
        worksheet.column_dimensions[column_cells[0].column_letter].width = length

# Only reached when the workbook was written without raising.
print("Excel file created and data stored successfully.")


Excel file created and data stored successfully.
