# LlamaParse Experiment

This notebook experiments with LlamaParse to parse the PDF file `2538-00.pdf` and compare the quality of the output, especially for tables.

In [None]:
# Install necessary packages (uncomment and run once on a fresh environment).
# Prefer the %pip magic over !pip so the install targets this kernel's environment.
# %pip install llama-parse llama-index python-dotenv

In [5]:
import os
import nest_asyncio
from dotenv import load_dotenv

# Load variables from a local .env file, then let asyncio event loops nest so
# that synchronous LlamaParse calls can run inside the notebook's own loop.
load_dotenv()
nest_asyncio.apply()

# Report whether the LlamaCloud API key is available in the environment.
api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
if api_key:
    print("LLAMA_CLOUD_API_KEY found.")
else:
    print("Please set your LLAMA_CLOUD_API_KEY in the .env file or input it below.")
    # api_key = input("Enter LLAMA_CLOUD_API_KEY: ")

LLAMA_CLOUD_API_KEY found.


In [None]:
from llama_parse import LlamaParse

# Configure the parser: Markdown output, language set to Korean.
parser = LlamaParse(
    language="ko",  # Set language to Korean
    result_type="markdown",  # "markdown" and "text" are available
    api_key=api_key,
    verbose=True,
)

# Parse one sample PDF synchronously.
file_path = "00_data/raw_data/1_bai_raw_files/2538-00.pdf"
documents = parser.load_data(file_path)

# Preview the beginning of the first parsed document, if anything came back.
if not documents:
    print("No documents parsed.")
else:
    print(f"Parsed {len(documents)} documents.")
    print("First 2000 characters of the parsed content:")
    print(documents[0].text[:2000])

In [None]:
# Save the result to a markdown file for easier inspection.
# The parser returns a list of documents and may split one PDF into several
# of them (e.g. one per page), so write every document rather than only the
# first one; otherwise everything after the first chunk is silently dropped.
# With a single returned document the file content is unchanged.
output_path = "2538-00_parsed.md"
if documents:
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(doc.text for doc in documents))
    print(f"Saved parsed content to {output_path}")

## Batch Processing

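Now we will process all the PDF files in the directory `00_data/raw_data/1_bai_raw_files/` and save each one as a Markdown file in `00_data/parsed_data/llama_parse/`. Files that already have an output file are skipped, so the cell can be re-run to resume an interrupted batch.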

In [None]:
import glob
import os
from tqdm import tqdm

input_dir = "00_data/raw_data/1_bai_raw_files/"
output_dir = "00_data/parsed_data/llama_parse/"

# Create output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created directory: {output_dir}")

# Get list of PDF files
pdf_files = glob.glob(os.path.join(input_dir, "*.pdf"))
print(f"Found {len(pdf_files)} PDF files.")

# Batch process
for pdf_file in tqdm(pdf_files, desc="Parsing files"):
    file_name = os.path.basename(pdf_file)
    file_base = os.path.splitext(file_name)[0]
    output_file = os.path.join(output_dir, f"{file_base}.md")

    # Skip if already exists
    if os.path.exists(output_file):
        # print(f"Skipping {file_name} (already parsed).")
        continue

    try:
        # Use aload_data with await to avoid Event loop issues in Jupyter
        documents = await parser.aload_data(pdf_file)
        if documents:
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(documents[0].text)
            # print(f"Parsed and saved {file_name}")
        else:
            print(f"Warning: No content parsed for {file_name}")
    except Exception as e:
        print(f"Error parsing {file_name}: {e}")