<a href="https://colab.research.google.com/github/keithth/AI_Apps/blob/main/27c_pdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## API key & logging setup

In [None]:
# Read the OpenAI credential through Colab's `userdata` API instead of hardcoding it
# in the notebook.
from google.colab import userdata

# Name under which the key is stored in Colab's user data.
_OPENAI_SECRET_NAME = "OPENAI_API_KEY"
openai_api_key = userdata.get(_OPENAI_SECRET_NAME)

In [None]:

import logging

# Baseline configuration for the root logger: WARNING and above, using a
# "LEVEL - logger name -  message" layout.
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)

# Lower the threshold to INFO for the "haystack" logger only.
haystack_logger = logging.getLogger("haystack")
haystack_logger.setLevel(logging.INFO)

## Installation

In [None]:
# Install the required packages (run this in a Colab cell)
!pip install -qU haystack-ai
!pip install -qU unstructured  # needed by the UnstructuredFileConverter
!pip install -qU pdfminer.six  # for PDF parsing if not already installed


In [None]:
!pip show haystack-ai unstructured pdfminer.six | grep Version | cut -d: -f2

# b1

In [None]:

# Import necessary classes from Haystack 2.x
from haystack import Pipeline, Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.converters import UnstructuredFileConverter
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter


# b3

In [None]:

# Initialize an in-memory document store
document_store = InMemoryDocumentStore()

# Create the PDF converter component.
# UnstructuredFileConverter supports many file types including PDF.
converter = UnstructuredFileConverter()

# Create a document splitter.
# Here we split the extracted text into passages of about 100 words with 20 words overlap.
splitter = DocumentSplitter(split_by="passage", split_length=100, split_overlap=20)

# Create a writer component that writes (indexes) the processed documents into the document store.
writer = DocumentWriter(document_store)

# Build the pipeline.
# In Haystack 2.x, components are added without an intrinsic order – you then connect the output
# of one component to the input of the next.
pipeline = Pipeline()
pipeline.add_component("converter", converter)
pipeline.add_component("splitter", splitter)
pipeline.add_component("writer", writer)

# Connect the pipeline components:
# - The converter produces a list of documents (accessible via "converter.documents")
# - The splitter takes these documents and splits them into smaller passages
# - The writer then writes the resulting documents to the document store.
pipeline.connect("converter.documents", "splitter.documents")
pipeline.connect("splitter.documents", "writer")

# Specify the PDF file(s) to process.
# (Make sure to upload the PDF file to the Colab runtime or provide a valid path.)
file_paths = ["sample.pdf"]  # replace "sample.pdf" with your PDF file path

# Run the pipeline by providing the file paths to the converter.
result = pipeline.run({"converter": {"file_paths": file_paths}})

# Print out the returned result.
# The result will include the documents indexed into the document store.
print(result)

# (Optional) You can now query your document store—for example, to see the indexed documents:
docs = document_store.filter_documents()
print("\nIndexed Documents:")
for d in docs:
    print(d.content[:200] + "...")


# b2