From 432567b70c43e4af803700b16d9a6567c03c714a Mon Sep 17 00:00:00 2001 From: Nuno Campos Date: Wed, 5 Apr 2023 08:52:43 +0100 Subject: [PATCH] Allow passing a custom pdfjs build --- .../document_loaders/examples/file_loaders/pdf.md | 13 +++++++++++++ langchain/src/document_loaders/pdf.ts | 10 ++++++++-- langchain/src/document_loaders/tests/pdf.test.ts | 15 +++++++++++++++ 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/docs/docs/modules/indexes/document_loaders/examples/file_loaders/pdf.md b/docs/docs/modules/indexes/document_loaders/examples/file_loaders/pdf.md index 00cf880e0772..28f4194ebc18 100644 --- a/docs/docs/modules/indexes/document_loaders/examples/file_loaders/pdf.md +++ b/docs/docs/modules/indexes/document_loaders/examples/file_loaders/pdf.md @@ -33,3 +33,16 @@ const loader = new PDFLoader("src/document_loaders/example_data/example.pdf", { const docs = await loader.load(); ``` + +# Usage, legacy environments + +In legacy environments, you can use the `pdfjs` option to provide a function that returns a promise that resolves to the `PDFJS` object. This is useful if you want to use a custom build of `pdfjs-dist` or if you want to use a different version of `pdfjs-dist`. For example, here we use the legacy build of `pdfjs-dist`, which includes several polyfills that are not included in the default build. 
+ +```typescript +import { PDFLoader } from "langchain/document_loaders"; + +const loader = new PDFLoader("src/document_loaders/example_data/example.pdf", { + pdfjs: () => + import("pdfjs-dist/legacy/build/pdf.js").then((mod) => mod.default), +}); +``` diff --git a/langchain/src/document_loaders/pdf.ts b/langchain/src/document_loaders/pdf.ts index c796ae1392cf..bd14bd32d896 100644 --- a/langchain/src/document_loaders/pdf.ts +++ b/langchain/src/document_loaders/pdf.ts @@ -5,16 +5,22 @@ import { BufferLoader } from "./buffer.js"; export class PDFLoader extends BufferLoader { private splitPages: boolean; - constructor(filePathOrBlob: string | Blob, { splitPages = true } = {}) { + private pdfjs: typeof PDFLoaderImports; + + constructor( + filePathOrBlob: string | Blob, + { splitPages = true, pdfjs = PDFLoaderImports } = {} + ) { super(filePathOrBlob); this.splitPages = splitPages; + this.pdfjs = pdfjs; } public async parse( raw: Buffer, metadata: Document["metadata"] ): Promise { - const { getDocument, version } = await PDFLoaderImports(); + const { getDocument, version } = await this.pdfjs(); const pdf = await getDocument({ data: new Uint8Array(raw.buffer), useWorkerFetch: false, diff --git a/langchain/src/document_loaders/tests/pdf.test.ts b/langchain/src/document_loaders/tests/pdf.test.ts index 93723f7a556a..cddce05e2cca 100644 --- a/langchain/src/document_loaders/tests/pdf.test.ts +++ b/langchain/src/document_loaders/tests/pdf.test.ts @@ -26,3 +26,18 @@ test("Test PDF loader from file to single document", async () => { expect(docs.length).toBe(1); expect(docs[0].pageContent).toContain("Attention Is All You Need"); }); + +test("Test PDF loader from file using custom pdfjs", async () => { + const filePath = path.resolve( + path.dirname(url.fileURLToPath(import.meta.url)), + "./example_data/1706.03762.pdf" + ); + const loader = new PDFLoader(filePath, { + pdfjs: () => + import("pdfjs-dist/legacy/build/pdf.js").then((mod) => mod.default), + }); + const docs = await 
loader.load(); + + expect(docs.length).toBe(15); + expect(docs[0].pageContent).toContain("Attention Is All You Need"); +});