In [188]:
import PyPDF2
import re
import os 
import csv

In [189]:
def extract_text_from_pdf(pdf_path):
    """Return the extracted text of every page of a PDF, in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list with one entry per page, each being whatever
        ``page.extract_text()`` produced for that page.
    """
    with open(pdf_path, mode='rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Extraction is done inside the ``with`` block so the file handle
        # backing the reader is still open while pages are being read.
        page_texts = [page.extract_text() for page in reader.pages]
    return page_texts

In [190]:

# Extraction rules, applied in this order. Each rule is
# (field name, compiled regex, match group to keep, strip whitespace?).
_INVOICE_FIELD_RULES = (
    # "#", a space, then five digits.
    ('invoice_no', re.compile(r'# \d{5}'), 0, False),
    # Three non-digit characters, two digits, four digits, space separated
    # (e.g. "Mar 06 2012").
    # NOTE(review): \D accepts any non-digit, not only letters.
    ('date', re.compile(r'\D{3} \d{2} \d{4}'), 0, False),
    # Rest of the line following "Bill To :".
    ('bill_to', re.compile(r'Bill To\s*:\s*([^\n]+)'), 1, True),
    # Two non-digits, "-", four digits, "-", word characters, "-", digits.
    ('order_id', re.compile(r'\D\D-\d{4}-\w*-\d*'), 0, False),
    # NOTE(review): this is simply the first "$"-prefixed amount containing a
    # decimal point; nothing ties it to a "Balance Due" label.
    ('balance_due', re.compile(r'\$[\d,]+\.\d*'), 0, False),
)


def _first_match(regex, pages, group):
    """Return ``group`` of the first page that matches ``regex``, else None."""
    for page_text in pages:
        match = regex.search(page_text)
        if match:
            return match.group(group)
    return None


def extract_data(text):
    """Pull the invoice fields out of the extracted page texts.

    Each field is taken from the first page (in order) on which its pattern
    matches; later pages are not consulted for that field.

    Args:
        text: Iterable of per-page text strings. A single string is treated
            as one page. Entries that are not strings (e.g. ``None`` for a
            page with no extractable text) are skipped.

    Returns:
        A dict containing any of the keys ``invoice_no``, ``date``,
        ``bill_to``, ``order_id`` and ``balance_due``. A key is absent when
        its pattern matched on no page. ``None`` or empty input gives ``{}``.
    """
    if not text:
        return {}
    if isinstance(text, str):
        # Iterating a bare string would yield single characters, which can
        # never match any of the patterns; treat it as a one-page document.
        text = [text]

    # Drop non-string pages up front so regex.search() cannot raise TypeError.
    pages = [page_text for page_text in text if isinstance(page_text, str)]

    data = {}
    for field, regex, group, strip in _INVOICE_FIELD_RULES:
        value = _first_match(regex, pages, group)
        if value is not None:
            data[field] = value.strip() if strip else value
    return data


In [191]:
def process_invoices(folder_path, output_csv):
    """Extract invoice fields from every PDF in a folder and write one CSV.

    Args:
        folder_path: Directory to scan (not recursive) for PDF files.
        output_csv: Path of the CSV file to create or overwrite.

    The CSV has the columns ``invoice_no``, ``date``, ``bill_to``,
    ``order_id`` and ``balance_due``, with one row per PDF. A field that
    ``extract_data`` did not return is written as an empty cell
    (``csv.DictWriter``'s default ``restval``).
    """
    data_list = []
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the CSV reproducible between runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        pdf_path = os.path.join(folder_path, filename)
        # Case-insensitive so "INVOICE.PDF" is picked up as well; isfile()
        # skips directories that merely have a name ending in ".pdf".
        if not filename.lower().endswith('.pdf') or not os.path.isfile(pdf_path):
            continue
        text = extract_text_from_pdf(pdf_path)
        data_list.append(extract_data(text))

    headers = ['invoice_no', 'date', 'bill_to', 'order_id', 'balance_due']
    # Explicit UTF-8: text extracted from PDFs can contain characters that the
    # platform default encoding cannot represent, which would otherwise raise
    # UnicodeEncodeError while writing.
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()
        writer.writerows(data_list)


# Run the extraction.
# The defaults are the values this notebook was originally run with. Because
# the folder is an absolute, machine-specific path, it can be overridden with
# the INVOICE_FOLDER / INVOICE_OUTPUT_CSV environment variables instead of
# editing this cell.
folder_path = os.environ.get(
    'INVOICE_FOLDER',
    r'C:\Users\KIIT\Desktop\invoice_extracter\sample_invoices',
)
output_csv = os.environ.get('INVOICE_OUTPUT_CSV', 'extracted_invoices_data.csv')
process_invoices(folder_path, output_csv)