In [None]:
### Binary Files

Besides raw text data, you may wish to extract information from other file types, such as PowerPoint presentations or PDFs. To do so, you can wrap the file's bytes in a `Blob` object and pass it to a parser for that file type, such as a PDF parser.

A code example is below:

from langchain.document_loaders.parsers import PDFMinerParser
from langchain_community.document_loaders import Blob

# Load the PDF as raw bytes; the parser below works on the blob, not the path.
with open(r"./test.pdf", mode="rb") as fp:
    file_data = fp.read()

# Wrap the bytes in a Blob tagged with its MIME type.
blob = Blob.from_data(data=file_data, mime_type="application/pdf")

# Parse the blob and keep the text of the first returned document,
# with surrounding whitespace removed.
parser = PDFMinerParser()
parsed_data = parser.parse(blob=blob)
first_document = parsed_data[0]
text = first_document.page_content.strip()

# `People` and `llm` are not defined in this cell; they must already be in scope.
runnable = create_structured_output_runnable(People, llm, mode="openai-tools")
runnable.invoke(text)

### Mime Type Based Parsing

Alternatively, instead of specifying the file type and parser explicitly, you can detect the MIME type from the file's contents and let the code select the matching parser.

A code example of this is below:

import magic
from langchain.document_loaders.parsers import BS4HTMLParser, PDFMinerParser
from langchain.document_loaders.parsers.generic import MimeTypeBasedParser
from langchain.document_loaders.parsers.txt import TextParser

# Map each supported MIME type to the parser that handles it.
HANDLERS = {
    "application/pdf": PDFMinerParser(),
    "text/plain": TextParser(),
    "text/html": BS4HTMLParser(),
}

# Dispatcher that picks a handler from HANDLERS based on the blob's MIME type.
# No fallback parser is configured, so only the types listed above are handled.
MIMETYPE_BASED_PARSER = MimeTypeBasedParser(
    handlers=HANDLERS,
    fallback_parser=None,
)

with open(r"./test.html", "rb") as fp:
    file_data = fp.read()

# Detect the MIME type from the file's bytes rather than from its extension.
mime = magic.Magic(mime=True)
mime_type = mime.from_buffer(file_data)

blob = Blob.from_data(
    data=file_data,
    mime_type=mime_type,
)

# Parse through the MIME-type-based dispatcher instead of indexing HANDLERS
# directly: the dispatcher selects the handler from the blob's MIME type and
# reports an unsupported type itself, where a raw dict lookup would fail with
# a bare KeyError.
parser = MIMETYPE_BASED_PARSER
parsed_data = parser.parse(blob=blob)
runnable.invoke(parsed_data[0].page_content)