In [4]:
import PyPDF2
import openpyxl
import re

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages in page order. A newline is inserted
        between consecutive pages when the earlier page does not already end
        with one. Pages that yield no text are skipped.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        parts = []
        for page in reader.pages:
            # `or ""` tolerates an extractor that returns None for a page
            # without a text layer instead of raising on concatenation.
            page_text = page.extract_text() or ""
            if not page_text:
                continue
            # Plain concatenation would glue the last line of one page onto
            # the first line of the next, breaking line-based parsing.
            if parts and not parts[-1].endswith("\n"):
                parts.append("\n")
            parts.append(page_text)
    return "".join(parts)

# Function to parse and structure tables from text
def parse_tables(text, headers=("Date", "Description", "Amount", "Balance")):
    """Split extracted text into tables of whitespace-separated cells.

    A line containing any of the `headers` keywords (substring match) starts
    a new table and becomes its first row. A blank or whitespace-only line
    ends the current table. Every other line is appended to the current
    table as a data row, starting a table implicitly if none is open.

    Args:
        text: The text to parse, with rows separated by "\\n".
        headers: Keywords that mark a header line. Defaults to the column
            names "Date", "Description", "Amount" and "Balance".

    Returns:
        A list of tables; each table is a list of rows and each row is the
        list of strings produced by `str.split()` on the line.
    """
    tables = []
    current = []
    for line in text.split("\n"):
        cells = line.split()
        starts_table = any(keyword in line for keyword in headers)
        if starts_table or not cells:
            # Both a header line and a blank line close the table in progress.
            if current:
                tables.append(current)
                current = []
            if starts_table:
                current.append(cells)
        else:
            current.append(cells)
    if current:  # Flush the table still open at end of input
        tables.append(current)
    return tables

# Function to clean special characters from data
def clean_data(cell):
    """Return `cell` with every character outside printable ASCII removed.

    Only characters in the range 0x20 (space) through 0x7E (tilde) are kept;
    control characters and all non-ASCII characters are dropped.

    Args:
        cell: The string to clean.

    Returns:
        The cleaned string.
    """
    outside_printable_ascii = r'[^\x20-\x7E]'
    cleaned = re.sub(outside_printable_ascii, '', cell)
    return cleaned

# Function to write data into Excel
def write_to_excel(tables, excel_path):
    """Write all tables to one worksheet of a new Excel workbook.

    Each cell is passed through `clean_data` before being written, and an
    empty row is appended after every table.

    Args:
        tables: Iterable of tables; each table is an iterable of rows and
            each row is an iterable of string cells.
        excel_path: Destination path the workbook is saved to.
    """
    workbook = openpyxl.Workbook()
    worksheet = workbook.active
    worksheet.title = "Extracted Tables"

    for rows in tables:
        for cells in rows:
            worksheet.append(list(map(clean_data, cells)))
        # Blank spacer row so consecutive tables are visually separated.
        worksheet.append([])

    workbook.save(excel_path)

# Main script: run the PDF -> text -> tables -> Excel pipeline on one file.
# NOTE(review): the paths below look like a Colab Google Drive mount; adjust them for your environment.
pdf_path = "/content/drive/MyDrive/test3.pdf"  # Input PDF to read
excel_path = "/content/drive/MyDrive/test3.xlsx"  # Output workbook to create

# Each step feeds the next, so the order of these three calls matters.
text = extract_text_from_pdf(pdf_path)  # Raw text of all pages
tables = parse_tables(text)  # List of tables, each a list of rows of cell strings
write_to_excel(tables, excel_path)  # Save the tables to the workbook at excel_path

# Report where the workbook was saved.
print(f"Tables extracted and saved to {excel_path}.")


Tables extracted and saved to /content/drive/MyDrive/test3.xlsx.


In [2]:
# Use the %pip magic so the install targets the running kernel, and pin the version for reproducible runs.
%pip install PyPDF2==3.0.1

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 232.6/232.6 kB 4.0 MB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
