# In [43]:
import fitz  # PyMuPDF
import pdfplumber
import pytesseract
import re
import logging
import time
import os
import pandas as pd

# Set up logging for error tracking
logging.basicConfig(filename='invoice_extractor.log', level=logging.ERROR)

class InvoiceExtractor:
    """Extract key fields from PDF invoices, with an OCR fallback for scans.

    Text is read with pdfplumber first. When that yields nothing but
    whitespace, every page is rendered with PyMuPDF and passed through
    Tesseract. The resulting text is matched against a fixed set of regular
    expressions by :meth:`parse_invoice_data`.
    """

    # Resolution (dots per inch) used when rasterising pages for OCR.
    # PyMuPDF renders at 72 dpi by default, which is usually too coarse for
    # Tesseract to read small invoice print reliably.
    OCR_DPI = 300

    def __init__(self):
        """Initializes the InvoiceExtractor with necessary parameters."""
        # Kept for callers that read it; no method of this class uses it.
        self.accuracy_threshold = 0.90  # Required accuracy threshold

    def extract_text_from_pdf(self, pdf_path):
        """Extracts the embedded text layer of a PDF using pdfplumber.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            The text of all pages joined with newlines. Pages without text
            contribute an empty string. If reading fails, the error is logged
            and whatever was collected before the failure is returned
            (possibly an empty string).
        """
        page_texts = []
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    page_texts.append(page.extract_text() or "")
        except Exception as e:
            logging.error(f"Error extracting text from PDF: {e}")
        # Join with a newline so the last token of one page cannot fuse with
        # the first token of the next and confuse the field patterns.
        return "\n".join(page_texts)

    def extract_text_from_scanned_pdf(self, pdf_path):
        """Extracts text from a scanned PDF using OCR.

        Each page is rendered to a pixmap, wrapped as a NumPy array and
        handed to Tesseract.

        Args:
            pdf_path: Path of the PDF file to OCR.

        Returns:
            The recognised text of all pages joined with newlines. If
            rendering or OCR fails, the error is logged and the text gathered
            so far is returned (possibly an empty string).
        """
        page_texts = []
        try:
            # Local import keeps this class self-contained; numpy is only
            # needed on the OCR path.
            import numpy as np

            zoom = self.OCR_DPI / 72  # PDF user space is 72 units per inch
            # The context manager closes the document even if a page fails.
            with fitz.open(pdf_path) as document:
                matrix = fitz.Matrix(zoom, zoom)
                for page in document:
                    pix = page.get_pixmap(matrix=matrix)
                    # pix.samples is a raw byte buffer, which pytesseract
                    # cannot interpret; give it the pixel grid explicitly.
                    pixels = np.frombuffer(pix.samples, dtype=np.uint8)
                    if pix.n == 1:
                        image = pixels.reshape(pix.height, pix.width)
                    else:
                        image = pixels.reshape(pix.height, pix.width, pix.n)
                    page_texts.append(pytesseract.image_to_string(image))
        except Exception as e:
            logging.error(f"Error processing scanned PDF: {e}")
        return "\n".join(page_texts)

    def parse_invoice_data(self, text):
        """Parses the invoice fields out of the extracted text.

        Args:
            text: Plain text of an invoice.

        Returns:
            A dict with the keys ``invoice_number``, ``date``,
            ``total_amount``, ``phone_number`` and ``address``. A field whose
            pattern does not match is ``None``.
        """
        invoice_number = re.search(r'Invoice #:\s*(\S+)', text, re.IGNORECASE)
        # The date and phone captures allow only spaces/tabs as whitespace so
        # they stop at the end of the line instead of swallowing the next one.
        date = re.search(r'Invoice Date:\s*([\w \t]+)', text, re.IGNORECASE)
        total_amount = re.search(r'Total ₹([\d,]+\.\d{2})', text, re.IGNORECASE)
        phone_number = re.search(r'Mobile\s*([+\d \t]+)', text, re.IGNORECASE)
        # NOTE(review): '.' does not cross newlines here, so the address is
        # only found when "C/o" and "Invoice #" share a line - confirm
        # against real invoices before widening this pattern.
        address = re.search(r'(?<=C/o)(.*?)(?=Invoice #)', text, re.IGNORECASE)

        return {
            "invoice_number": invoice_number.group(1) if invoice_number else None,
            "date": date.group(1).strip() if date else None,
            "total_amount": total_amount.group(1) if total_amount else None,
            "phone_number": phone_number.group(1).strip() if phone_number else None,
            "address": address.group(0).strip() if address else None,
        }

    def extract_invoice_data(self, pdf_path):
        """Main function to extract invoice data from a PDF.

        Args:
            pdf_path: Path of the PDF invoice.

        Returns:
            A ``(invoice_data, processing_time)`` tuple. ``invoice_data`` is
            the dict from :meth:`parse_invoice_data` with an extra
            ``processing_time`` key (seconds). If an unexpected error occurs,
            it is logged and ``(None, None)`` is returned.
        """
        # perf_counter is monotonic, so the duration cannot go negative or
        # jump if the system clock is adjusted mid-run.
        start_time = time.perf_counter()
        try:
            text = self.extract_text_from_pdf(pdf_path)

            if not text.strip():  # If no text found, fall back to OCR
                text = self.extract_text_from_scanned_pdf(pdf_path)

            invoice_data = self.parse_invoice_data(text)
            processing_time = time.perf_counter() - start_time

            # Add processing time to the data
            invoice_data['processing_time'] = processing_time

            return invoice_data, processing_time
        except Exception as e:
            logging.error(f"Error processing {pdf_path}: {e}")
            return None, None  # Return None for both in case of error

# Function to process all PDF files in a directory
def process_invoices(directory):
    """Processes all PDF invoices in a given directory.

    Only regular files directly inside ``directory`` whose name ends in
    ``.pdf`` (any letter case) are considered; sub-directories are not
    searched.

    Args:
        directory: Path of the folder that holds the invoice PDFs.

    Returns:
        A pandas DataFrame with one row per invoice for which extraction
        returned data. Each row carries the extracted fields plus a
        ``filename`` column. The frame is empty when nothing was processed.
    """
    extractor = InvoiceExtractor()
    results = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the resulting DataFrame reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        # Compare the extension case-insensitively so "SCAN.PDF" is not missed.
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(directory, filename)
        if not os.path.isfile(pdf_path):
            continue  # e.g. a folder whose name happens to end in ".pdf"

        invoice_data, _ = extractor.extract_invoice_data(pdf_path)

        # Add filename to the results
        if invoice_data:
            invoice_data['filename'] = filename
            results.append(invoice_data)

    return pd.DataFrame(results)

# Function to save DataFrame to Excel
def save_to_excel(df, excel_path):
    """Write ``df`` to an Excel workbook and report the outcome on stdout.

    The data goes to a sheet named 'Invoice Data' without the index column.
    A ``PermissionError`` (for instance when the workbook is open in another
    program) is reported with a printed hint instead of being raised; any
    other exception propagates to the caller.
    """
    try:
        df.to_excel(excel_path, sheet_name='Invoice Data', index=False)
    except PermissionError:
        print(f"Permission denied: {excel_path}. Make sure the file is not open and try again.")
    else:
        print(f"Data saved successfully to {excel_path}")

# Directory containing PDF files
# NOTE: absolute, machine-specific path; change it before running elsewhere.
directory = r'C:\Users\Kaviyarasan PR\OneDrive\Desktop\Jan to Mar'
# Extraction runs immediately when this cell/module is executed.
df = process_invoices(directory)

# Save results to Excel
# NOTE: absolute, machine-specific output path.
output_excel_path = r'C:\Users\Kaviyarasan PR\OneDrive\Desktop\invoice_data.xlsx'
save_to_excel(df, output_excel_path)


# Output: Data saved successfully to C:\Users\Kaviyarasan PR\OneDrive\Desktop\invoice_data.xlsx
