#### Libraries

In [24]:
import os
import pickle
from pathlib import Path

from dotenv import load_dotenv
from llama_parse import LlamaParse
from nest_asyncio import apply as nest_asyncio_apply

# One-time setup side effects, written as plain statements so the cell does not
# rebind `_` (IPython's handle for the previous cell output).
nest_asyncio_apply()  # presumably lets the parser's async calls run inside the notebook's event loop — confirm
load_dotenv()  # python-dotenv: read a local .env file into the process environment

# Root folder of the whitepapers; the parsing cell reads one sub-folder per year
# (2020 ... 2024) from it.
DATA = Path("data/whitepapers/")

#### LlamaParse Setup

In [6]:
# Non-secret parser settings, kept apart from the credential.
# NOTE(review): `num_workers` presumably bounds how many files are parsed
# concurrently — confirm against the LlamaParse documentation.
parser_options = {
    "result_type": "markdown",
    "language": "en",
    "num_workers": 5,
    "verbose": False,
}

# The API key is looked up in the environment so it never appears in the notebook.
parser = LlamaParse(api_key=os.getenv("LLAMA_PARSE_API_KEY"), **parser_options)

#### Parsing whitepaper PDFs for all years

In [19]:
full_documents = []
for i in range(5)[::-1]:
    print(f"Parsing year: 202{i}")
    file_path = [str(DATA / f"202{i}" / file) for file in os.listdir(DATA / f"202{i}")]
    documents = await parser.aload_data(file_path=file_path)
    full_documents.append(documents)
    print(f"Done parsing year: 202{i}")

#### Saving parsed PDFs

In [16]:
# Flatten the per-year lists into a single list (same order as `full_documents`)
# and pickle it so the parsed output can be reused.
unnested_documents = []
for yearly_documents in full_documents:
    unnested_documents.extend(yearly_documents)

with open("data/parsed_docs.pkl", mode="wb") as pickle_file:
    pickle.dump(unnested_documents, pickle_file)

#### Inspect results

In [29]:
# Overall count first, then one line per year. `full_documents` was filled
# newest-year-first, so walking it backwards lines offsets 0..4 up with 2020..2024.
print(f"Number of total documents parsed: {len(unnested_documents)}")
for year_offset, yearly_documents in enumerate(reversed(full_documents)):
    print(f"Number of documents parsed for 202{year_offset} : {len(yearly_documents)}")

Number of total documents parsed: 284
Number of documents parsed for 2020 : 48
Number of documents parsed for 2021 : 125
Number of documents parsed for 2022 : 61
Number of documents parsed for 2023 : 43
Number of documents parsed for 2024 : 7


In [32]:
# Preview the beginning (at most 10,000 characters) of the first parsed document.
first_document = unnested_documents[0]
print(first_document.text[:10000])

AWS Well-Architected Framework
Migration Lens

Copyright©   2024 Amazon Web Services, Inc. and/or its affiliates. All rights reserved.
---
# Migration Lens

# AWS Well-Architected Framework

Migration Lens: AWS Well-Architected Framework

Copyright © 2024 Amazon Web Services, Inc. and/or its affiliates. All rights reserved.

Amazon's trademarks and trade dress may not be used in connection with any product or service that is not Amazon's, in any manner that is likely to cause confusion among customers, or in any manner that disparages or discredits Amazon. All other trademarks not owned by Amazon are the property of their respective owners, who may or may not be affiliated with, connected to, or sponsored by Amazon.
---
## Migration Lens - AWS Well-Architected Framework

|Abstract and introduction|i|
|---|---|
|Introduction|1|
|Definitions|3|
|Well-Architected terminology|3|
|Migration terminology|3|
|Design principles|6|
|Migration lifecycle|8|
|Assess|8|
|Mobilize|8|
|Migrate and mod

#### Cost

In [41]:
# Rough cost estimate for the parsing run above.
PRICE_PER_PAGE_USD = 0.003  # per-page rate applied to pages beyond the free allowance
total_parsed_pages = 15_257
free_pages = 7_000

# Clamp at zero so a run that stays inside the free allowance is reported as
# $0.00 rather than as a negative amount.
billable_pages = max(0, total_parsed_pages - free_pages)
cost = PRICE_PER_PAGE_USD * billable_pages
print(f"Cost of parsing {total_parsed_pages:,} pages with LlamaParse: ${cost:.2f}")

Cost of parsing 15,257 pages with LlamaParse: $24.77
