In [1]:
!pip install camelot-py[cv] img2table openpyxl pandas pillow pytesseract


Collecting img2table
  Downloading img2table-1.4.1-py3-none-any.whl.metadata (21 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting camelot-py[cv]
  Downloading camelot_py-1.0.0-py3-none-any.whl.metadata (9.4 kB)
Collecting pdfminer-six>=20240706 (from camelot-py[cv])
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdf<4.0,>=3.17 (from camelot-py[cv])
  Downloading pypdf-3.17.4-py3-none-any.whl.metadata (7.5 kB)
Collecting pypdfium2>=4 (from camelot-py[cv])
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting xlsxwriter>=3.0.6 (from img2table)
  Downloading xlsxwriter-3.2.5-py3-none-any.whl.metadata (2.7 kB)
Downloading img2table-1.4.1-py3-none-any.whl (91 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [8]:
%pip install img2table

import camelot
import os
from img2table.document import Image as Img2TableImage
from img2table.ocr import TesseractOCR
from PIL import Image
import pandas as pd
from openpyxl import Workbook

def extract_tables_from_pdf(pdf_path):
    """Collect the tables Camelot finds on every page of a PDF.

    Args:
        pdf_path: Path of the PDF file handed to ``camelot.read_pdf``.

    Returns:
        A list holding the ``df`` attribute of each table Camelot returned,
        in the order Camelot yielded them (empty if it returned none).
    """
    frames = []
    # pages='all' asks Camelot to scan the whole document, not just page 1.
    for found in camelot.read_pdf(pdf_path, pages='all'):
        frames.append(found.df)
    return frames

def extract_tables_from_image(image_path, lang="eng", implicit_rows=False,
                              borderless_tables=False):
    """Extract tables from an image using img2table with Tesseract OCR.

    With the default arguments this behaves as before: English OCR and
    img2table's default detection settings. The extra keyword arguments let
    callers tune detection without editing this function.

    Args:
        image_path: Path of the image passed to img2table as ``src``.
        lang: Language code given to ``TesseractOCR``.
        implicit_rows: Forwarded to ``extract_tables``; per the img2table
            documentation it splits cells into rows that have no drawn
            separator.
        borderless_tables: Forwarded to ``extract_tables``; per the img2table
            documentation it enables detection of tables without drawn
            borders. Worth trying when a scan yields no tables.

    Returns:
        A list with the ``df`` attribute (a DataFrame) of every table
        img2table detected; empty when nothing was detected.
    """
    img = Img2TableImage(src=image_path)
    ocr = TesseractOCR(lang=lang)
    tables = img.extract_tables(
        ocr=ocr,
        implicit_rows=implicit_rows,
        borderless_tables=borderless_tables,
    )
    # Each extracted table exposes its content as a DataFrame under ``df``.
    return [table.df for table in tables]

def save_dfs_to_excel(dfs, output_excel, sheet_base_name="Table"):
    """Write each DataFrame to its own sheet of a single Excel workbook.

    Sheets are named ``{sheet_base_name}_1``, ``{sheet_base_name}_2``, ... in
    the order the frames are supplied; the DataFrame index is not written.

    Args:
        dfs: Iterable of pandas DataFrames to write.
        output_excel: Destination path of the ``.xlsx`` file.
        sheet_base_name: Prefix used for the generated sheet names.

    Returns:
        None. When ``dfs`` is empty nothing is written and no file is created.
    """
    # Materialise once so one-shot iterables work and emptiness can be checked.
    frames = list(dfs)
    if not frames:
        # A workbook with zero sheets cannot be saved: openpyxl fails with
        # "At least one sheet must be visible" when the writer is closed.
        return
    with pd.ExcelWriter(output_excel, engine='openpyxl') as writer:
        for sheet_number, frame in enumerate(frames, start=1):
            frame.to_excel(writer, sheet_name=f"{sheet_base_name}_{sheet_number}", index=False)

def main(pdf_files, image_files, output_excel="all_tables_combined.xlsx"):
    """Gather tables from PDFs and images and store them in one workbook.

    PDFs are processed first, then images, each in the order given. A progress
    line is printed per file. If at least one table was collected they are all
    passed to ``save_dfs_to_excel``; otherwise a notice is printed and nothing
    is saved.

    Args:
        pdf_files: Iterable of PDF paths.
        image_files: Iterable of image paths.
        output_excel: Path of the Excel file to create.
    """
    collected = []

    # PDFs first ...
    for pdf_file in pdf_files:
        print(f"Extracting tables from PDF: {pdf_file}")
        collected += extract_tables_from_pdf(pdf_file)

    # ... then images, appended after the PDF tables.
    for img_file in image_files:
        print(f"Extracting tables from image: {img_file}")
        collected += extract_tables_from_image(img_file)

    if not collected:
        print("No tables found in provided files.")
        return

    # Everything goes into a single workbook.
    save_dfs_to_excel(collected, output_excel)
    print(f"All tables saved into {output_excel}")

if __name__ == "__main__":
    # Example inputs: replace these lists with the files you want to scan.
    pdf_files = []   # PDF paths to process (none in this example)
    # NOTE(review): absolute path under /content, presumably a Colab upload;
    # adjust it for your own environment.
    image_files = ["/content/Revoult Bank statement_page-0001.jpg"]   # Image paths to process (one scanned page here)
    main(pdf_files, image_files, output_excel="tables_from_pdf_and_images.xlsx")

Extracting tables from image: /content/Revoult Bank statement_page-0001.jpg
No tables found in provided files.
