-
Notifications
You must be signed in to change notification settings - Fork 2.2k
/
docx.ts
55 lines (50 loc) · 1.7 KB
/
docx.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import { Document } from "@langchain/core/documents";
import { BufferLoader } from "./buffer.js";
/**
 * Document loader for DOCX files, implemented as a `BufferLoader` subclass
 * whose `parse` step extracts the raw text of the file with `mammoth`.
 */
export class DocxLoader extends BufferLoader {
  /**
   * @param filePathOrBlob The DOCX source, forwarded unchanged to the
   * `BufferLoader` constructor.
   */
  constructor(filePathOrBlob: string | Blob) {
    super(filePathOrBlob);
  }

  /**
   * Turns the raw bytes of a DOCX file into at most one `Document`.
   *
   * The `extractRawText` function is obtained through `DocxLoaderImports`
   * and invoked with the buffer. When the extracted `value` is falsy (for
   * example an empty string) no document is produced; otherwise a single
   * `Document` is returned whose `pageContent` is the extracted text and
   * whose `metadata` is the object passed in.
   * @param raw The buffer holding the DOCX file contents.
   * @param metadata The metadata to attach to the resulting `Document`.
   * @returns A promise resolving to an empty array or a one-element array.
   */
  public async parse(
    raw: Buffer,
    metadata: Document["metadata"]
  ): Promise<Document[]> {
    const { extractRawText } = await DocxLoaderImports();
    const { value: text } = await extractRawText({ buffer: raw });
    // A falsy extraction result means there is nothing to wrap in a Document.
    return text ? [new Document({ pageContent: text, metadata })] : [];
  }
}
/**
 * Loads the `mammoth` package through a dynamic `import()` and exposes its
 * `extractRawText` function.
 *
 * If the import (or reading the export) fails, the underlying error is
 * written to `console.error` and a new `Error` carrying an installation
 * hint is thrown in its place.
 * @returns An object with the `extractRawText` function from `mammoth`.
 * @throws Error when `mammoth` cannot be loaded.
 */
async function DocxLoaderImports() {
  try {
    const mammoth = await import("mammoth");
    return { extractRawText: mammoth.extractRawText };
  } catch (loadFailure) {
    // Surface the original failure in the log before replacing it with the hint.
    console.error(loadFailure);
    throw new Error(
      "Failed to load mammoth. Please install it with eg. `npm install mammoth`."
    );
  }
}