-
Notifications
You must be signed in to change notification settings - Fork 2k
/
pdf.ts
123 lines (108 loc) · 3.07 KB
/
pdf.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import { Document } from "../../document.js";
import { BaseDocumentLoader } from "../base.js";
import { formatDocumentsAsString } from "../../util/document.js";
/**
 * A document loader for loading data from PDFs.
 *
 * The PDF bytes are read from a `Blob` and parsed with a pdf.js-compatible
 * module obtained from the `pdfjs` factory (by default `PDFLoaderImports`).
 */
export class WebPDFLoader extends BaseDocumentLoader {
  /** The PDF data to parse. */
  protected blob: Blob;

  /** When true, `load()` returns one document per page; otherwise one merged document. */
  protected splitPages = true;

  /** Async factory resolving to the `getDocument` function and `version` of the PDF parser. */
  private pdfjs: typeof PDFLoaderImports;

  /**
   * @param blob The PDF contents.
   * @param options Optional settings.
   *   - `splitPages` (default `true`): emit one document per page instead of
   *     a single merged document.
   *   - `pdfjs` (default `PDFLoaderImports`): async factory returning
   *     `{ getDocument, version }` for the PDF parser to use.
   */
  constructor(
    blob: Blob,
    { splitPages = true, pdfjs = PDFLoaderImports } = {}
  ) {
    super();
    this.blob = blob;
    this.splitPages = splitPages ?? this.splitPages;
    this.pdfjs = pdfjs;
  }

  /**
   * Loads the contents of the PDF as documents.
   *
   * Pages whose text content has no items are skipped. When `splitPages` is
   * false, the per-page documents are merged with `formatDocumentsAsString`
   * into a single document (or an empty array if no page produced one).
   *
   * @returns An array of Documents representing the retrieved data.
   */
  async load(): Promise<Document[]> {
    const { getDocument, version } = await this.pdfjs();
    const parsedPdf = await getDocument({
      data: new Uint8Array(await this.blob.arrayBuffer()),
      useWorkerFetch: false,
      isEvalSupported: false,
      useSystemFonts: true,
    }).promise;
    // Metadata is best-effort: a failure to read it must not fail the load.
    const meta = await parsedPdf.getMetadata().catch(() => null);

    // Builds a fresh `pdf` metadata object per document so that documents
    // never share (and cannot accidentally co-mutate) the same object.
    const buildPdfMetadata = () => ({
      version,
      info: meta?.info,
      metadata: meta?.metadata,
      totalPages: parsedPdf.numPages,
    });

    const documents: Document[] = [];

    // pdf.js page numbers are 1-based.
    for (let i = 1; i <= parsedPdf.numPages; i += 1) {
      const page = await parsedPdf.getPage(i);
      const content = await page.getTextContent();

      if (content.items.length === 0) {
        continue;
      }

      // Eliminate excessive newlines
      // Source: https://github.com/albertcui/pdf-parse/blob/7086fc1cc9058545cdf41dd0646d6ae5832c7107/lib/pdf-parse.js#L16
      let lastY: number | undefined;
      const textItems: string[] = [];
      for (const item of content.items) {
        if ("str" in item) {
          // eslint-disable-next-line prefer-destructuring
          const y: number = item.transform[5];
          // Compare against `undefined` explicitly: a truthiness check would
          // treat a legitimate baseline of y === 0 as "no previous item" and
          // drop the line break that should precede the next item.
          if (lastY === undefined || lastY === y) {
            textItems.push(item.str);
          } else {
            textItems.push(`\n${item.str}`);
          }
          lastY = y;
        }
      }

      documents.push(
        new Document({
          pageContent: textItems.join(" "),
          metadata: {
            pdf: buildPdfMetadata(),
            loc: {
              pageNumber: i,
            },
          },
        })
      );
    }

    if (this.splitPages) {
      return documents;
    }

    if (documents.length === 0) {
      return [];
    }

    return [
      new Document({
        pageContent: formatDocumentsAsString(documents),
        metadata: {
          pdf: buildPdfMetadata(),
        },
      }),
    ];
  }
}
/**
 * Lazily imports the pdf.js build bundled with `pdf-parse` and exposes the
 * two members the loader needs.
 *
 * @returns The module's `getDocument` function and `version` value.
 * @throws Error if the module cannot be imported or read; the underlying
 *   failure is logged with `console.error` before the error is thrown.
 */
async function PDFLoaderImports() {
  const failureMessage =
    "Failed to load pdf-parse. Please install it with eg. `npm install pdf-parse`.";
  try {
    const imported = await import(
      "pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js"
    );
    // The pdf.js API lives on the module's default export.
    const pdfModule = imported.default;
    return {
      getDocument: pdfModule.getDocument,
      version: pdfModule.version,
    };
  } catch (err) {
    console.error(err);
    throw new Error(failureMessage);
  }
}