In [79]:
import camelot
import pandas as pd
import re
import os
import sys

In [80]:
def extract_tables_from_pages(pdf_file: str, flavor='stream', page_numbers: list = None) -> list:
    """
    Extract tables from specified pages of a PDF using Camelot.

    Parameters
    ----------
    pdf_file : str
        Path of the PDF, forwarded unchanged to ``camelot.read_pdf``.
    flavor : str, default 'stream'
        Parsing flavor, forwarded unchanged to ``camelot.read_pdf``.
    page_numbers : list, optional
        Page numbers to parse. Each entry is converted with ``str`` and the
        results are joined with commas to form Camelot's ``pages`` argument.
        ``None`` (the default) or an empty collection means no pages were
        requested.

    Returns
    -------
    list
        The ``.df`` DataFrame of each table Camelot returned, in the order
        Camelot returned them. Empty when no pages were requested.
    """
    # ``None`` replaces the former ``[]`` default so that no list object is
    # shared between calls.
    if page_numbers is None:
        return []

    pages_spec = ",".join(map(str, page_numbers))

    # Nothing to look at: return early instead of handing Camelot an empty
    # ``pages`` string.
    if not pages_spec:
        return []

    tables = camelot.read_pdf(pdf_file, flavor=flavor, pages=pages_spec)

    # Every table is assumed to be relevant; the truthiness filter is kept
    # from the original implementation. Tighten it here if some tables
    # should be skipped.
    return [table.df for table in tables if table]

In [81]:
def clean_and_save_to_excel(tables: list, excel_file_path: str):
    """
    Write cleaned copies of the tables to a single Excel workbook.

    Each table is first passed through ``clean_table``. When written, only
    the columns at even positions (0, 2, 4, ...) are kept, so every other
    column is dropped. Table ``n`` (counting from 1) lands on the sheet
    ``Table_<n>`` and is written without its index. A confirmation line
    is printed once the workbook has been written.
    """
    # Clean everything up front, before the workbook is opened
    cleaned = list(map(clean_table, tables))

    with pd.ExcelWriter(excel_file_path, engine='xlsxwriter') as workbook:
        for sheet_no, frame in enumerate(cleaned, start=1):
            # Step-2 slice: keep positions 0, 2, 4, ... and drop the rest
            every_other = frame.iloc[:, ::2]
            every_other.to_excel(workbook, sheet_name=f'Table_{sheet_no}', index=False)

    print(f"Data has been copied to {excel_file_path}")

In [82]:
def clean_table(table: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the table data by removing $ signs from every cell.

    Parameters
    ----------
    table : pd.DataFrame
        Table to clean; it is not modified.

    Returns
    -------
    pd.DataFrame
        A new DataFrame in which each non-null cell has been converted with
        ``str`` and had every ``$`` character removed. Null cells are
        returned unchanged.
    """
    def strip_dollar(cell):
        # A literal replace avoids the invalid '\$' escape sequence that the
        # previous regex pattern relied on.
        return str(cell).replace('$', '') if pd.notnull(cell) else cell

    # DataFrame.map supersedes the deprecated DataFrame.applymap; fall back
    # to applymap on pandas versions that do not provide map yet.
    elementwise = table.map if hasattr(table, "map") else table.applymap
    return elementwise(strip_dollar)

In [83]:
if __name__ == "__main__":
    # PDF to read the tables from
    pdf_input_file = 'meta_1.pdf'

    # Comma-separated page list; blank entries are skipped while parsing
    pages_args = "80,81"
    pages_required = [int(page) for page in pages_args.split(",") if page]

    # Collect the tables Camelot reports for those pages
    tables = extract_tables_from_pages(pdf_input_file, page_numbers=pages_required)

In [84]:
    # Stop here when extraction produced nothing for the requested pages
    if len(tables) == 0:
        sys.exit('No tables found on the specified pages.')

    # Print each extracted table, numbered from 1
    for table_no, table in enumerate(tables, start=1):
        print(f"\nTable {table_no}:\n{table}")

    # Write the cleaned tables (every other column dropped) to a workbook
    # whose name is the input path with '_cleaned.xlsx' appended
    excel_output_file = pdf_input_file + '_cleaned.xlsx'
    clean_and_save_to_excel(tables, excel_output_file)

    print("Complete")


Table 1:
                                            0  1        2  3        4
0                                                    2021        2020
1                                      Assets                        
2                             Current assets:                        
3                   Cash and cash equivalents  $   16,601  $   17,576
4                       Marketable securities      31,397      44,378
5                    Accounts receivable, net      14,039      11,335
6   Prepaid expenses and other current assets       4,629       2,381
7                        Total current assets      66,666      75,670
8                          Equity investments       6,775       6,234
9                 Property and equipment, net      57,809      45,633
10        Operating lease right-of-use assets      12,155       9,348
11                     Intangible assets, net         634         623
12                                   Goodwill      19,197      19,050
13        