In [None]:
import os
import pypdf
import pandas as pd

In [16]:
# Per-page dump table; the first row holds the column names.
iata_data = [['file', 'page_number', 'content_snippet']]
# Make sure the output folder exists (no error if it is already there).
os.makedirs('iata_done', exist_ok=True)

In [None]:
# Dump the extracted text of every page of every PDF in './iata' into iata_data.
# - The listing is sorted because os.listdir() returns entries in arbitrary
#   order; sorting makes row positions (e.g. iata_data[1]) reproducible.
# - The extension check is case-insensitive so '.pdf' files are not skipped.
# NOTE: rows are appended to the existing list, so re-running this cell
# without re-creating iata_data duplicates every row.
for file in sorted(os.listdir('./iata')):
    if file.lower().endswith('.pdf'):
        file_path = os.path.join('./iata', file)
        # Extract pages from the PDF
        pdf = pypdf.PdfReader(file_path)
        # page_number is zero-based (enumerate's default start).
        for page_number, page in enumerate(pdf.pages):
            content = page.extract_text()
            iata_data.append([file, page_number, content])

In [26]:
# Show the extracted text of the first dumped page: row 0 is the header row,
# so row 1 is the first page, and column 2 is its text content.
# (Assumes iata_data still holds the per-page dump built above.)
print(iata_data[1][2])

FCAGBILLSUMNG AGENT BILLING SUMMARY 63-2 1666 1 - OREX TRAVEL Sp. z o.o.
REFERENCE: 63216661 - 240101
OREX TRAVEL Sp. z o.o.  
Aleje Jerozolimskie 132
Warsaw 02-305    
Poland
International Air Transport Association
BSPPOLAND
UL.SZPITALNA 6 1B
00-031 WARSZAWA
Warsaw 00-031
Poland
VAT : PL9542736185
Billing Period: 240101 ( 01-JAN-2024 to 07-JAN-2024 ) 
NOTES:  Balance Payable: Cash Issues - Cash Refunds + Debit Memos - Credit Memos - Effective  Comm -/+ Tax on Comm
‘CREDIT’ - includes both Debit and Credit Payment Card values
where an entity has only one form of payment, no TOTAL line will be displayed.
SUMMARY
ISSUES REFUNDS    ------MEMOS------      
CREDITSDEBITS
TAX ON
COMM
BALANCE
PAYABLE
TAXES,FEES
PENALTIES COMM
EFFECTIVE
BSP - INTERNATIONAL CASH  4,065.62  0.00  0.00  0.00  1,107.62  0.00  0.00  4,065.62
TOTAL  4,065.62  0.00  0.00  0.00  1,107.62  0.00  0.00  4,065.62
 4,065.62  0.00GRAND TOTAL - (PLN)    CASH  0.00  0.00  1,107.62  0.00  0.00  4,065.62
TOTAL  4,065.62  0.00  

In [2]:
import os
import pypdf
import pandas as pd
import re
from datetime import datetime

# Create output directory
# NOTE(review): nothing visible in this cell writes into 'iata_done' (the Excel
# file below is saved to the working directory) — confirm it is still needed.
os.makedirs('iata_done', exist_ok=True)

# Data structure to store extracted data
# One row (a list of field values) is appended per successfully processed PDF.
iata_data = []

# Define regex patterns for extraction
# Group 1: the run of digits, whitespace and hyphens following "REFERENCE: ".
reference_pattern = r"REFERENCE: ([\d\s-]+)"
# Matches "hh:mm:ss" + AM/PM, "Page : ", five digits, then captures (group 1)
# a dd-xxx-yyyy shaped token; \w{3} accepts any three word characters.
document_date_pattern = r"\d{2}:\d{2}:\d{2}[AP]M\sPage\s:\s\d{5}(\d{2}-\w{3}-\d{4})"
# Groups 1 and 2: the start and end dates inside "Billing Period: N ( A to B )".
period_pattern = r"Billing Period: \d+ \( (\d{2}-\w{3}-\d{4}) to (\d{2}-\w{3}-\d{4}) \)"
# Regex for finding all amounts
# Matches 1-3 digits, optional ",ddd" thousands groups and exactly two decimals.
# The pattern has no sign and no left boundary, so an ungrouped number such as
# "1234.56" is matched as "234.56".
amount_pattern = r"\d{1,3}(?:,\d{3})*\.\d{2}"

# Function to convert date to Excel-compatible format
def convert_date(date_str):
    """Convert a 'DD-Mon-YYYY' date string (e.g. '01-JAN-2024') to 'YYYY-MM-DD'.

    Args:
        date_str: The date text to convert. Leading and trailing whitespace
            is ignored.

    Returns:
        The date as an ISO 'YYYY-MM-DD' string, or None when the input is not
        a string or does not parse with the '%d-%b-%Y' format.
    """
    # Guard against None / non-string input so callers get None, not TypeError.
    if not isinstance(date_str, str):
        return None
    try:
        # strptime rejects surrounding whitespace, so strip it first.
        parsed = datetime.strptime(date_str.strip(), '%d-%b-%Y')
    except ValueError:
        return None
    return parsed.strftime('%Y-%m-%d')

# Process PDFs in the './iata/' directory
# - The listing is sorted because os.listdir() returns entries in arbitrary
#   order; sorting gives the output rows a stable, filename-based order.
# - The extension check is case-insensitive so '.pdf' files are not skipped.
for file in sorted(os.listdir('./iata')):
    if not file.lower().endswith('.pdf'):
        continue
    file_path = os.path.join('./iata', file)
    print(f"Processing: {file_path}")
    try:
        # Use PdfReader to read the PDF
        pdf = pypdf.PdfReader(file_path)
        # Join page texts with a newline so the last token of one page cannot
        # fuse with the first token of the next (which could corrupt or
        # fabricate an amount). `or ""` tolerates a page yielding no text.
        full_text = "\n".join((page.extract_text() or "") for page in pdf.pages)

        # Extract fields using regex (each is a match object or None)
        reference_no = re.search(reference_pattern, full_text)
        document_date = re.search(document_date_pattern, full_text)
        period = re.search(period_pattern, full_text)

        # Find all amounts matching the pattern and convert them to floats by
        # dropping the thousands separators.
        all_amounts = [
            float(amount.replace(",", ""))
            for amount in re.findall(amount_pattern, full_text)
        ]
        # Heuristic: the largest amount found anywhere in the document is
        # reported as the total; None when no amount was found.
        total_amount = max(all_amounts, default=None)

        # Convert extracted dates to Excel-compatible format
        document_date_converted = convert_date(document_date.group(1)) if document_date else None
        period_start_converted = convert_date(period.group(1)) if period else None
        period_end_converted = convert_date(period.group(2)) if period else None

        # Add extracted data to the list
        iata_data.append([
            file,
            reference_no.group(1).strip() if reference_no else None,
            document_date_converted,
            period_start_converted,
            period_end_converted,
            total_amount
        ])
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        # A file that fails here produces no row in iata_data.
        print(f"Error processing {file_path}: {e}")

# Write the collected rows to an Excel workbook in the working directory.
output_file = 'iata_extracted_data.xlsx'
df = pd.DataFrame(
    data=iata_data,
    # Column labels, in the same order as the fields of each appended row.
    columns=[
        'plik',
        'nr dokumentu',
        'data dokumentu',
        'początek okresu',
        'data okresu',
        'kwota',
    ],
)
df.to_excel(output_file, index=False)

print(f"Extraction complete. Data saved to {output_file}.")


Processing: ./iata\PL_FCAGBILLSUMNG_6321666_20240101.PDF
Processing: ./iata\PL_FCAGBILLSUMNG_6321666_20240102.PDF
Processing: ./iata\PL_FCAGBILLSUMNG_6321666_20240103.PDF
Processing: ./iata\PL_FCAGBILLSUMNG_6321666_20240104.PDF
Processing: ./iata\PL_FCAGBILLSUMNG_6321666_20240201.PDF
Processing: ./iata\PL_FCAGBILLSUMNG_6321666_20240202.PDF
Processing: ./iata\PL_FCAGBILLSUMNG_6321666_20240203.PDF
Processing: ./iata\PL_FCAGBILLSUMNG_6321666_20240204.PDF
Processing: ./iata\PL_FCAGBILLSUMNG_6321666_20240301.PDF
Processing: ./iata\PL_FCAGBILLSUMNG_6321666_20240302.PDF
Processing: ./iata\PL_FCAGBILLSUMNG_6321666_20240303.PDF
Processing: ./iata\PL_FCAGBILLSUMNG_6321666_20240304.PDF
Processing: ./iata\PL_FCAGBILLSUMNG_6321666_20240401.PDF
Processing: ./iata\PL_FCAGBILLSUMNG_6321666_20240403.PDF
Processing: ./iata\PL_FCAGBILLSUMNG_6321666_20240404.PDF
Processing: ./iata\PL_FCAGBILLSUMNG_6321666_20240502.PDF
Processing: ./iata\PL_FCAGBILLSUMNG_6321666_20240503.PDF
Processing: ./iata\PL_FCAGBILLS