In [1]:
import nest_asyncio
from llama_index.core import SimpleDirectoryReader
from llama_parse import LlamaParse
import os
from dotenv import load_dotenv
# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably required because the notebook kernel already runs a
# loop while the parser drives its own async calls — confirm.
nest_asyncio.apply()

In [2]:
# Load variables from a .env file so os.getenv below can see them.
load_dotenv()

# Build the parser client. The API key is never written in the notebook: it is
# looked up in the environment under the name "cloud_api_key".
parser = LlamaParse(
    result_type="markdown",  # ask for markdown output
    language="en",  # document language (optional)
    num_workers=4,  # workers used when several files are parsed in parallel
    verbose=True,  # emit detailed progress logging
    api_key=os.getenv("cloud_api_key"),
)

In [5]:
# Folder containing the files to parse.
# NOTE: this is an absolute, machine-specific path — adjust it when running elsewhere.
file_directory = "/Users/user/Documents/Project/___e-zest___/Detecting-and-Parsing-table-from-pdfs/1.using_cv2/out_tables"

# A single file is selected explicitly, so no directory scan is performed here:
# scanning would only add a dependency on the folder's (arbitrary) listing order
# and fail on an empty or missing folder without affecting which file is parsed.
file_list = ['Table_0.png']


In [6]:
# Send every selected file through the parser and echo what came back.
# `documents` always holds the result for the most recently processed file.
for file_name in file_list:
    file_path = os.path.join(file_directory, file_name)

    # The file is opened in binary mode and handed to the parser as a stream;
    # the original name travels along as metadata.
    with open(file_path, "rb") as file:
        documents = parser.load_data(file, extra_info={"file_name": file_name})

    # Nothing usable returned: report it and move on to the next file.
    if not documents:
        print(f"No data parsed for {file_name}")
        continue

    print(f"Markdown for {file_name}:")
    print(documents)


Started parsing the file under job_id 10c45fa4-917c-455d-bd51-84036ff18d4e
Markdown for Table_0.png:
[Document(id_='baa2a5cb-f169-4802-aaa3-a7d054edcc10', embedding=None, metadata={'file_name': 'Table_0.png'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text='# Part C\n\n# Part C OTC\n\n|Monthly Frequency|Part C OTC Comments|\n|---|---|\n|OTC Not Administered by Navitus|Geisinger Administered OTC Benefit Part D|\n|Plan Type|Defined Standard Benefit Plan|\n|Formulary ID:|00024417|\n|Formulary|Lower Level Exception Tier|\n|NIA|Formulary NIA|\n|One Month Supply Limit|30|\n|LTC One Month Supply Limit|31|\n|OON Day Supply|30|\n|Mail Order Day Supply|100|\n|Over-The-Counter (OTC) Items as a supplemental benefit under Part D|No|\n|DMR Methodology|Contracted OOP Threshold|\n\n# Part D\n\n|Insulin|1-30 Day|31-60 Day|61-100 Day|1-30 Day|31-60 Day|61-100 Day|30 Day|31 Day|\n|---|---|---|---|---|---|

In [7]:
# Keep only the text of the first parsed document for the table extraction below.
# Raises IndexError if `documents` is empty.
markdown_data = documents[0].text
# Bare expression: the notebook echoes the string as this cell's output.
markdown_data

'# Part C\n\n# Part C OTC\n\n|Monthly Frequency|Part C OTC Comments|\n|---|---|\n|OTC Not Administered by Navitus|Geisinger Administered OTC Benefit Part D|\n|Plan Type|Defined Standard Benefit Plan|\n|Formulary ID:|00024417|\n|Formulary|Lower Level Exception Tier|\n|NIA|Formulary NIA|\n|One Month Supply Limit|30|\n|LTC One Month Supply Limit|31|\n|OON Day Supply|30|\n|Mail Order Day Supply|100|\n|Over-The-Counter (OTC) Items as a supplemental benefit under Part D|No|\n|DMR Methodology|Contracted OOP Threshold|\n\n# Part D\n\n|Insulin|1-30 Day|31-60 Day|61-100 Day|1-30 Day|31-60 Day|61-100 Day|30 Day|31 Day|\n|---|---|---|---|---|---|---|---|---|\n|Copays|$35.00|$70.00|$87.50|$35.00|$52.50|$52.50|$35.00|$35.00|\n\n# DAW Processing Requirements\n\nStandard DAW 0,1,2,5,8 and 9 will process at POS.\n\nDAW 3,4,6 and 7 will reject at POS with error 22 (WI DA product selection code).\n\nClaim does not reject with NCPDP 22 and processes with the following:\n\n- Generic Reimbursement (Lesser o

In [None]:
import pandas as pd
import re

# Regular expression to identify markdown tables: a header row, a separator row
# (dashes, pipes, spaces and optional ':' alignment markers), then one or more
# data rows. Every row must start and end with a pipe.
table_pattern = re.compile(r'(\|.+\|\n\|[-:| ]+\|(?:\n\|.*\|)+)')


def _split_table_row(row):
    """Split one markdown table row into a list of stripped cell strings.

    Only the outer pipes are discarded, so empty cells are preserved and each
    value stays in its own column. A pipe escaped with a backslash is treated
    as literal cell content rather than a column boundary.
    """
    inner = row.strip()
    if inner.startswith('|'):
        inner = inner[1:]
    if inner.endswith('|') and not inner.endswith('\\|'):
        inner = inner[:-1]
    # Split on unescaped pipes only, then restore escaped pipes as plain '|'.
    return [cell.strip().replace('\\|', '|') for cell in re.split(r'(?<!\\)\|', inner)]


def markdown_table_to_dataframe(table):
    """Convert the text of one markdown table into a DataFrame of strings.

    Parameters:
        table: text matched by ``table_pattern`` (header row, separator row,
            data rows separated by newlines).

    Returns:
        A ``pandas.DataFrame`` whose columns are the header cells. Data rows
        shorter than the header are padded with empty strings and longer rows
        are truncated to the header width, so a ragged table cannot shift
        values into the wrong column or make the constructor raise.
    """
    rows = table.strip().split('\n')
    headers = _split_table_row(rows[0])
    width = len(headers)
    data = []
    for row in rows[2:]:  # Skip the header and separator rows
        cells = _split_table_row(row)
        data.append((cells + [''] * width)[:width])
    return pd.DataFrame(data, columns=headers)


# Find all tables in the text
tables = table_pattern.findall(markdown_data)

# Parse each table dynamically
dataframes = [markdown_table_to_dataframe(table) for table in tables]

# Example: Display all dataframes
for i, df in enumerate(dataframes):
    print(f"\nTable {i+1}:")
    print(df)


# To work with each dataframe, use the `dataframes` list, e.g., `dataframes[0]` for the first table.



Table 1:
                                    Monthly Frequency  \
0                     OTC Not Administered by Navitus   
1                                           Plan Type   
2                                       Formulary ID:   
3                                           Formulary   
4                                                 NIA   
5                              One Month Supply Limit   
6                          LTC One Month Supply Limit   
7                                      OON Day Supply   
8                               Mail Order Day Supply   
9   Over-The-Counter (OTC) Items as a supplemental...   
10                                    DMR Methodology   

                          Part C OTC Comments  
0   Geisinger Administered OTC Benefit Part D  
1               Defined Standard Benefit Plan  
2                                    00024417  
3                  Lower Level Exception Tier  
4                               Formulary NIA  
5                