-
Notifications
You must be signed in to change notification settings - Fork 2.2k
/
epub.ts
105 lines (95 loc) · 3.14 KB
/
epub.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import type { EPub } from "epub2";
import { Document } from "@langchain/core/documents";
import { BaseDocumentLoader } from "../base.js";
import { logVersion020MigrationWarning } from "../../util/entrypoint_deprecation.js";
// Module-level call made when this entrypoint is imported. Judging by the
// helper's name and its module (`entrypoint_deprecation`), it presumably logs
// a migration warning telling users that "document_loaders/fs/epub" has moved
// to the "@langchain/community" package — confirm against the helper.
// The `#__PURE__` annotation is a bundler hint that the call may be dropped
// when its result is unused; it must stay directly in front of the call.
/* #__PURE__ */ logVersion020MigrationWarning({
oldEntrypointName: "document_loaders/fs/epub",
newPackageName: "@langchain/community",
});
/**
 * Document loader for EPUB files, built on `BaseDocumentLoader`.
 *
 * The file at `filePath` is opened with `epub2`, each chapter's raw HTML is
 * converted to plain text with `html-to-text`, and the result is returned
 * either as one `Document` per chapter or as a single merged `Document`.
 */
export class EPubLoader extends BaseDocumentLoader {
  private splitChapters: boolean;

  /**
   * @param filePath Path of the EPUB file to load. It is also recorded as the
   * `source` metadata of every returned document.
   * @param options Loader options.
   * @param options.splitChapters When `true` (the default) one document is
   * produced per chapter; when `false` all chapters are joined into a single
   * document.
   */
  constructor(public filePath: string, { splitChapters = true } = {}) {
    super();
    this.splitChapters = splitChapters;
  }

  /**
   * Extracts the text and metadata of every readable chapter of an EPUB.
   *
   * Entries of `epub.flow` that have no id, or whose raw HTML is empty, are
   * skipped. The remaining chapters keep the order of `epub.flow`.
   *
   * @param epub The opened EPUB object to read chapters from.
   * @returns One entry per kept chapter: `pageContent` holds the chapter text
   * and `metadata.chapter` holds the chapter title when the title is non-empty.
   * @throws Error if `html-to-text` cannot be imported.
   */
  protected async parse(
    epub: EPub
  ): Promise<{ pageContent: string; metadata?: object }[]> {
    const { htmlToText } = await HtmlToTextImport();

    // Chapters are fetched concurrently; Promise.all keeps the flow order.
    // Skipped chapters are represented by a real `null` (not a cast), so the
    // compiler tracks the nullable element type and forces the check below.
    const rawChapters = await Promise.all(
      epub.flow.map(async (chapter) => {
        if (!chapter.id) return null;
        const html = await epub.getChapterRawAsync(chapter.id);
        return html ? { html, title: chapter.title } : null;
      })
    );

    const parsed: { pageContent: string; metadata?: object }[] = [];
    for (const chapter of rawChapters) {
      if (chapter === null) continue;
      parsed.push({
        pageContent: htmlToText(chapter.html),
        metadata: {
          // Only add the `chapter` key when there is a non-empty title.
          ...(chapter.title && { chapter: chapter.title }),
        },
      });
    }
    return parsed;
  }

  /**
   * Opens the EPUB at `filePath` and converts it into `Document` instances.
   *
   * @returns An empty array when no chapter could be parsed. Otherwise, one
   * `Document` per chapter (chapter metadata merged over `{ source }`) when
   * `splitChapters` is enabled, or a single `Document` whose content is all
   * chapter texts joined by a blank line and whose metadata is `{ source }`.
   * @throws Error if `epub2` or `html-to-text` cannot be imported.
   */
  public async load(): Promise<Document[]> {
    const { EPub } = await EpubImport();
    const epub = await EPub.createAsync(this.filePath);
    const parsed = await this.parse(epub);
    const metadata = { source: this.filePath };

    if (parsed.length === 0) return [];

    if (this.splitChapters) {
      return parsed.map(
        (chapter) =>
          new Document({
            pageContent: chapter.pageContent,
            metadata: {
              ...metadata,
              ...chapter.metadata,
            },
          })
      );
    }

    return [
      new Document({
        pageContent: parsed.map((chapter) => chapter.pageContent).join("\n\n"),
        metadata,
      }),
    ];
  }
}
/**
 * Lazily imports the optional `epub2` dependency.
 *
 * @returns An object holding the `EPub` export of `epub2`.
 * @throws Error with an installation hint when the import fails.
 */
async function EpubImport() {
  try {
    const epubModule = await import("epub2");
    return { EPub: epubModule.EPub };
  } catch {
    // The underlying failure is replaced by an actionable install message.
    throw new Error(
      "Failed to load epub2. Please install it with eg. `npm install epub2`."
    );
  }
}
/**
 * Lazily imports the optional `html-to-text` dependency.
 *
 * @returns An object holding the `htmlToText` export of `html-to-text`.
 * @throws Error with an installation hint when the import fails.
 */
async function HtmlToTextImport() {
  try {
    const htmlToTextModule = await import("html-to-text");
    return { htmlToText: htmlToTextModule.htmlToText };
  } catch {
    // The underlying failure is replaced by an actionable install message.
    throw new Error(
      "Failed to load html-to-text. Please install it with eg. `npm install html-to-text`."
    );
  }
}