### PDF parsing with LlamaParse

In [5]:
import os

import nest_asyncio
from dotenv import load_dotenv
from llama_parse import LlamaParse

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail early with an actionable message when the key is absent. The previous
# `os.environ[...] = os.getenv(...)` round-trip was a no-op when the key was set
# and raised an opaque TypeError (str expected, not NoneType) when it was not.
if not os.getenv("LLAMA_CLOUD_API_KEY"):
    raise RuntimeError(
        "LLAMA_CLOUD_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )

### Only needed for notebook
nest_asyncio.apply()

In [6]:
## Baseline parser: same settings as a plain LlamaParse call, collected in one
## place so they can be read (and tweaked) at a glance.
parser_settings = {
    "api_key": os.environ['LLAMA_CLOUD_API_KEY'],
    # "markdown" and "text" are available
    "result_type": "markdown",
    # if multiple files passed, split in `num_workers` API calls
    "num_workers": 4,
    "verbose": True,
    # Optionally you can define a language, default=en
    "language": "en",
}
parser = LlamaParse(**parser_settings)

In [25]:
# Folder holding the input PDFs. Set the PDF_PARSE_DIR environment variable to run
# the notebook on another machine; the default keeps the original location.
PDF_DIR = os.environ.get("PDF_PARSE_DIR", "/data/home/xiong/data/Fund/pdf_parse")
PDF_FILE_NAMES = ["111_2023_0_table1.pdf", "111_2024_0_table1.pdf"]

# Full paths handed to the parser. Order matters: the parsed documents are later
# paired with this list positionally when the markdown files are written.
docs = [os.path.join(PDF_DIR, file_name) for file_name in PDF_FILE_NAMES]

#### Basic Parse

In [8]:
# Parse only the first PDF with the baseline parser (synchronous, single file).
first_pdf = docs[0]
documents = parser.load_data(first_pdf)

# Other ways to call the parser, kept for reference:
#   sync batch  -> documents = parser.load_data(["./my_file1.pdf", "./my_file2.pdf"])
#   async       -> documents = await parser.aload_data("./my_file.pdf")
#   async batch -> documents = await parser.aload_data(["./my_file1.pdf", "./my_file2.pdf"])

Started parsing the file under job_id 0f04aa4d-cbd1-4609-ae48-c8c7bdd04e80
....

In [13]:
# Show the text of the first document returned by the baseline parser.
first_document = documents[0]
print(first_document.text)

# UNITED STATES

# Table 1. United States: Selected Economic Indicators, 2020-29

# (Percentage change from previous period, unless otherwise indicated)

| |Projections|2020|2021|2022|2023|2024|2025|2026|2027|2028|2029|
|---|---|---|---|---|---|---|---|---|---|---|---|
|National Production and Income| |-2.2|5.8|1.9|2.5|2.6|1.9|2.0|2.1|2.1|2.1|
|Real GDP (q4/q4)| |-1.1|5.4|0.7|3.1|2.0|1.8|2.1|2.1|2.1|2.1|
|Net exports 1/| |-0.2|-1.3|-0.5|0.6|-0.2|0.0|0.1|0.2|0.2|0.2|
|Total domestic demand| |-1.9|6.9|2.3|1.9|2.6|1.8|1.8|1.9|1.9|1.9|
|Final domestic demand| |-1.5|6.6|1.7|2.3|2.7|1.8|1.8|1.9|1.9|1.9|
|Private final consumption| |-2.5|8.4|2.5|2.2|2.3|1.5|1.4|1.6|1.6|1.6|
|Public consumption expenditure| |2.9|0.3|-0.9|2.7|2.0|1.5|1.3|1.3|1.3|1.3|
|Gross fixed domestic investment| |-1.0|5.3|0.9|2.1|4.5|3.1|3.6|3.2|3.1|3.1|
|Private fixed investment| |-2.1|7.1|1.3|0.6|4.0|2.9|3.7|3.8|3.8|3.8|
|Public fixed investment| |4.3|-2.8|-1.1|9.4|7.1|3.9|3.4|0.5|0.0|0.0|
|Change in private inventories 

In [15]:
# Second parser: identical to the baseline settings plus a natural-language
# instruction asking for table contents only.
# The instruction text is spelled out with explicit escapes so its surrounding
# whitespace is visible and cannot be altered by editors that strip trailing spaces.
table_only_instruction = (
    " \n"
    "    I only need table contents with years as column names and economic indicators as row names\n"
    "    "
)
instruct_parser_settings = {
    "api_key": os.environ['LLAMA_CLOUD_API_KEY'],
    # "markdown" and "text" are available
    "result_type": "markdown",
    "parsing_instruction": table_only_instruction,
    # if multiple files passed, split in `num_workers` API calls
    "num_workers": 4,
    "verbose": True,
    # Optionally you can define a language, default=en
    "language": "en",
}
instruct_parser = LlamaParse(**instruct_parser_settings)

In [16]:
# Re-parse the first PDF, this time with the instruction-driven parser, so its
# output can be compared with the baseline result for the same file.
first_pdf = docs[0]
documents = instruct_parser.load_data(first_pdf)

Started parsing the file under job_id 50bf3469-6f18-4325-bafe-551683114513
.......

In [17]:
# Show the text of the first document returned by the instruction-driven parser.
first_document = documents[0]
print(first_document.text)

# UNITED STATES

# Table 1. United States: Selected Economic Indicators, 2020-29

# (Percentage change from previous period, unless otherwise indicated)

|Economic Indicators|2020|2021|2022|2023|2024|2025|2026|2027|2028|2029|
|---|---|---|---|---|---|---|---|---|---|---|
|National Production and Income| | | | | | | | | | |
|Real GDP|-2.2|5.8|1.9|2.5|2.6|1.9|2.0|2.1|2.1|2.1|
|Real GDP (q4/q4)|-1.1|5.4|0.7|3.1|2.0|1.8|2.1|2.1|2.1|2.1|
|Net exports 1/|-0.2|-1.3|-0.5|0.6|-0.2|0.0|0.1|0.2|0.2|0.2|
|Total domestic demand|-1.9|6.9|2.3|1.9|2.6|1.8|1.8|1.9|1.9|1.9|
|Final domestic demand|-1.5|6.6|1.7|2.3|2.7|1.8|1.8|1.9|1.9|1.9|
|Private final consumption|-2.5|8.4|2.5|2.2|2.3|1.5|1.4|1.6|1.6|1.6|
|Public consumption expenditure|2.9|0.3|-0.9|2.7|2.0|1.5|1.3|1.3|1.3|1.3|
|Gross fixed domestic investment|-1.0|5.3|0.9|2.1|4.5|3.1|3.6|3.2|3.1|3.1|
|Private fixed investment|-2.1|7.1|1.3|0.6|4.0|2.9|3.7|3.8|3.8|3.8|
|Public fixed investment|4.3|-2.8|-1.1|9.4|7.1|3.9|3.4|0.5|0.0|0.0|
|Change in private

#### Work on batches 

In [26]:
# Batch mode: hand the whole `docs` list to the instruction-driven parser so
# both PDFs are parsed by a single load_data call.
documents = instruct_parser.load_data(docs)

Parsing files: 100%|██████████| 2/2 [01:32<00:00, 46.38s/it]


In [None]:
# Save each parsed document as a .md file next to its source PDF.
#
# The pairing below is positional, so it is only valid when the parser returned
# exactly one document per input path (a failed or page-split parse would shift
# every later pair). Stop loudly instead of writing content under the wrong name.
# NOTE(review): one-document-per-file is assumed here; confirm for the installed
# LlamaParse version.
if len(documents) != len(docs):
    raise ValueError(
        f"Expected one parsed document per input file, got "
        f"{len(documents)} documents for {len(docs)} files"
    )

for document, pdf_path in zip(documents, docs):
    # Swap only the file extension. str.replace('pdf', 'md') would rewrite every
    # 'pdf' in the path, turning the '.../pdf_parse/...' directory into
    # '.../md_parse/...' and pointing the output at a different folder.
    path_without_ext, _ = os.path.splitext(pdf_path)
    out_name = path_without_ext + '.md'
    # Explicit UTF-8 so the output does not depend on the platform's default encoding.
    with open(out_name, 'w', encoding='utf-8') as f:
        f.write(document.text)