# Data Preparation for Training RAG Agent

Prepare PDFs, Docs, CSVs and similar files for RAG with LlamaIndex and [LlamaParse](https://github.com/run-llama/llama_cloud_services/blob/main/parse.md).

# Dependencies

In [None]:
%pip install llama-parse

In [None]:
import os
from dotenv import load_dotenv
import nest_asyncio
import gradio as gr
import shutil
from llama_parse import LlamaParse

# Setup

In [None]:
# Folder that converted files are written to, and a placeholder for the
# name of an uploaded file.
upload_folder, uploaded_file = "../content/data_prep/", ""

# Patch asyncio so that event loops can be nested
nest_asyncio.apply()

# Pull settings from a .env file (if any) before reading the key
load_dotenv()
llama_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")

# Report whether a key was found, showing only its first four characters
print(
    f"Llama API Key exists and begins {llama_api_key[:4]}"
    if llama_api_key
    else "Llama API Key not set"
)

# Convert PDF document to markdown

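Copy the documents you want to convert into the `../content/data_prep` folder (relative to this notebook) to try this out. The cells below expect a file named `apple_10k.pdf` there.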

In [None]:
# Parse the sample PDF as markdown; the result is kept in `document`
document = LlamaParse(
    api_key=llama_api_key,
    result_type="markdown",
).load_data("../content/data_prep/apple_10k.pdf")

In [None]:
# Bare expression: the notebook renders the parsed result as this cell's output
document

In [None]:
# Check a chunk of the parsed content. Index 50 only exists when the parser
# returned at least 51 documents, so fall back to the last one available
# instead of raising IndexError on shorter results.
if document:
    sample_index = min(50, len(document) - 1)
    print(document[sample_index].text[:1000])
else:
    print("No documents were returned by the parser")

## Save the text as a markdown file

In [None]:
# Write the text of every parsed document, in order, into one markdown file
file_name = "../content/data_prep/apple_10k.md"
with open(file_name, mode="w", encoding="utf-8") as markdown_file:
    markdown_file.writelines(doc.text for doc in document)

## Summarize the document in markdown to remove the fluff, so it is better suited for an LLM

In [None]:
# Parse the same PDF again, this time passing a parsing instruction that asks
# for a summary. The API key is passed explicitly so this call is configured
# the same way as the plain conversion of the same file.
documents_with_instruction = LlamaParse(
    api_key=llama_api_key,
    result_type="markdown",
    parsing_instruction="""
    This is the Apple annual report. make a summary
    """,
).load_data("../content/data_prep/apple_10k.pdf")

In [None]:
# Save the instructed result as one markdown file. The encoding is fixed to
# UTF-8 so the write does not depend on the platform's default encoding,
# which may be unable to represent characters found in the parsed text.
file_name = "../content/data_prep/apple_10k_instructions.md"
with open(file_name, "w", encoding="utf-8") as file:
    file.writelines(doc.text for doc in documents_with_instruction)

# Functions

In [None]:
def parse_document(parsing_instruction, document, result_format, use_instructions):
    """Convert an uploaded document with LlamaParse and save the result.

    Args:
        parsing_instruction: Instruction text handed to LlamaParse when
            ``use_instructions`` is truthy; ignored otherwise.
        document: The uploaded file as delivered by the upload button.
            Empty/``None`` when nothing has been uploaded yet.
        result_format: Passed to LlamaParse as ``result_type`` and used to
            pick the extension of the saved file.
        use_instructions: When falsy the document is converted as is.

    Returns:
        None. The converted text is written to disk by ``save_file``.
    """
    # Pressing "Convert" before uploading would otherwise crash on a None path.
    if not document:
        gr.Warning("Upload a file before converting")
        return

    # Some Gradio versions hand over a file object whose path is in `.name`
    # rather than a plain path string; accept both.
    source_path = getattr(document, "name", document)

    # Both modes use the same key and result type; only the instruction differs.
    parser_options = {"api_key": llama_api_key, "result_type": result_format}
    if use_instructions:
        parser_options["parsing_instruction"] = parsing_instruction
    converted_documents = LlamaParse(**parser_options).load_data(source_path)

    # Name the output after the uploaded file, without its extension.
    name, _ = os.path.splitext(os.path.basename(source_path))
    save_file(name, converted_documents, result_format)

def save_file(name, documents, result_format):
    """Write the text of ``documents`` to one file inside ``upload_folder``.

    Args:
        name: Base name of the output file, without extension.
        documents: Iterable of objects exposing a ``text`` attribute; their
            text is written back to back in iteration order.
        result_format: ``"markdown"`` produces a ``.md`` file, any other
            value a ``.txt`` file.

    Returns:
        None. A Gradio info message reports the saved path.
    """
    ext = "md" if result_format == "markdown" else "txt"

    # The output folder may not exist yet; create it rather than fail on open().
    os.makedirs(upload_folder, exist_ok=True)
    # os.path.join keeps the path valid whether or not the folder setting
    # ends with a separator.
    file_name = os.path.join(upload_folder, f"{name}.{ext}")

    with open(file_name, "w", encoding="utf-8") as file:
        file.writelines(doc.text for doc in documents)
    gr.Info(f"File converted to {result_format} and saved: {file_name}")

# UI

In [None]:
# Gradio front end: upload a file, pick the options, then convert it
sample_instructions = "This is the Apple annual report. make a summary"

with gr.Blocks() as ui:
    gr.Markdown("## Prepare Document")
    with gr.Row():
        # Wide left column: the parsing instructions and the switch to use them
        with gr.Column(scale=3):
            instructions = gr.Textbox(
                label="Parsing instructions:",
                value=sample_instructions,
                lines=5,
            )
            use_instructions = gr.Checkbox(
                label="Use instructions",
                info="Uncheck to just convert as is",
                value=True,
            )
        # Narrow right column: file upload, output format and the action button
        with gr.Column(scale=1):
            upload_button = gr.UploadButton("Upload File to Convert")
            result_format = gr.Dropdown(
                ["markdown", "text"],
                label="Select result format",
                value="markdown",
            )
            convert_button = gr.Button("Convert Uploaded File")

    # The input order mirrors the parameter order of parse_document
    convert_button.click(
        parse_document,
        inputs=[instructions, upload_button, result_format, use_instructions],
        outputs=None,
    )

ui.launch(inbrowser=True)