-
Notifications
You must be signed in to change notification settings - Fork 2.2k
/
mozilla_readability.ts
54 lines (48 loc) · 1.53 KB
/
mozilla_readability.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import { Readability } from "@mozilla/readability";
import { JSDOM } from "jsdom";
import type { Options } from "mozilla-readability";
import {
MappingDocumentTransformer,
Document,
} from "@langchain/core/documents";
/**
 * A document transformer that uses the Mozilla Readability library to
 * extract the main content from the HTML held in a document's `pageContent`.
 * @example
 * ```typescript
 * const loader = new CheerioWebBaseLoader("https://example.com/article");
 * const docs = await loader.load();
 *
 * const splitter = new RecursiveCharacterTextSplitter({
 *   chunkSize: 5000,
 * });
 * const transformer = new MozillaReadabilityTransformer();
 *
 * // The sequence processes the loaded documents through the splitter and then the transformer.
 * const sequence = splitter.pipe(transformer);
 *
 * // Invoke the sequence to transform the documents into a more readable format.
 * const newDocuments = await sequence.invoke(docs);
 *
 * console.log(newDocuments);
 * ```
 */
export class MozillaReadabilityTransformer extends MappingDocumentTransformer {
  static lc_name() {
    return "MozillaReadabilityTransformer";
  }

  /**
   * @param options Options object forwarded to the base transformer and
   * passed to every `Readability` instance this transformer creates.
   */
  constructor(protected options: Options = {}) {
    super(options);
  }

  /**
   * Parses `document.pageContent` as HTML and runs Readability over it.
   *
   * @param document The document whose `pageContent` is parsed.
   * @returns A new document whose `pageContent` is the extracted text (an
   * empty string when Readability yields no result) and whose metadata is a
   * shallow copy of the input document's metadata.
   */
  async _transformDocument(document: Document): Promise<Document> {
    const dom = new JSDOM(document.pageContent);
    try {
      const article = new Readability(
        dom.window.document,
        this.options
      ).parse();
      return new Document({
        pageContent: article?.textContent ?? "",
        metadata: {
          ...document.metadata,
        },
      });
    } finally {
      // One JSDOM is created per document; close its window so the DOM is
      // released even when parsing throws.
      dom.window.close();
    }
  }
}