-
Notifications
You must be signed in to change notification settings - Fork 2.1k
/
cheerio.ts
127 lines (114 loc) · 3.62 KB
/
cheerio.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import type { CheerioAPI, load as LoadT, SelectorType } from "cheerio";
import { Document } from "@langchain/core/documents";
import {
AsyncCaller,
AsyncCallerParams,
} from "@langchain/core/utils/async_caller";
import { BaseDocumentLoader } from "../base.js";
import type { DocumentLoader } from "../base.js";
/**
* Options accepted by the CheerioWebBaseLoader constructor.
*
* Extends AsyncCallerParams: every field other than the three declared
* here is forwarded unchanged to the AsyncCaller that performs the fetch.
*/
export interface WebBaseLoaderParams extends AsyncCallerParams {
/**
* Timeout for the fetch request, in milliseconds. Defaults to 10000
* (10s) when omitted. A falsy value such as 0 results in no abort
* signal being attached to the request.
*/
timeout?: number;
/**
* The selector whose matched elements supply the text of the resulting
* document. Defaults to "body".
*/
selector?: SelectorType;
/**
* Decoder applied to the raw response bytes. When omitted, the body is
* read with `response.text()` instead, i.e. decoded as UTF-8.
*/
textDecoder?: TextDecoder;
}
/**
 * Document loader that fetches a single web page and extracts its text
 * with Cheerio. Extends BaseDocumentLoader and implements the
 * DocumentLoader interface.
 * @example
 * ```typescript
 * const loader = new CheerioWebBaseLoader("https://exampleurl.com");
 * const docs = await loader.load();
 * console.log({ docs });
 * ```
 */
export class CheerioWebBaseLoader
  extends BaseDocumentLoader
  implements DocumentLoader
{
  /** Fetch timeout in milliseconds (10000 unless overridden). */
  timeout: number;

  /** Caller used to run the fetch request. */
  caller: AsyncCaller;

  /** Selector whose text becomes the page content ("body" unless overridden). */
  selector?: SelectorType;

  /** Optional decoder for the raw response bytes. */
  textDecoder?: TextDecoder;

  /**
   * @param webPath URL of the page to load; also recorded as the
   * `source` metadata of the produced document.
   * @param fields Optional loader settings. `timeout`, `selector` and
   * `textDecoder` are consumed here; all remaining fields are handed to
   * the AsyncCaller constructor.
   */
  constructor(public webPath: string, fields?: WebBaseLoaderParams) {
    super();
    const options: WebBaseLoaderParams = fields ?? {};
    const {
      timeout: requestedTimeout,
      selector: requestedSelector,
      textDecoder: decoder,
      ...callerParams
    } = options;

    this.timeout = requestedTimeout ?? 10000;
    this.caller = new AsyncCaller(callerParams);
    this.selector = requestedSelector ?? "body";
    this.textDecoder = decoder;
  }

  /**
   * Fetches `url` through `caller` and parses the response body with
   * Cheerio. The HTTP status of the response is not inspected here.
   * @param url The address to fetch.
   * @param caller The AsyncCaller that executes the fetch.
   * @param timeout Timeout in milliseconds; a falsy value means no abort
   * signal is attached.
   * @param textDecoder When given, the body is read as an ArrayBuffer and
   * decoded with it; otherwise `response.text()` is used.
   * @returns A Promise that resolves to a CheerioAPI instance for the page.
   */
  static async _scrape(
    url: string,
    caller: AsyncCaller,
    timeout: number | undefined,
    textDecoder?: TextDecoder
  ): Promise<CheerioAPI> {
    const cheerio = await CheerioWebBaseLoader.imports();

    // Only attach an abort signal when a non-zero timeout was requested.
    let signal: AbortSignal | undefined;
    if (timeout) {
      signal = AbortSignal.timeout(timeout);
    }
    const response = await caller.call(fetch, url, { signal });

    // The body is read as bytes only when a decoder is supplied; the
    // nullish fallback mirrors the optional-chaining form exactly.
    const decoded =
      textDecoder != null
        ? textDecoder.decode(await response.arrayBuffer())
        : undefined;
    const html = decoded ?? (await response.text());

    return cheerio.load(html);
  }

  /**
   * Fetches the page at `webPath` using this loader's caller, timeout and
   * text decoder, and parses it with Cheerio.
   * @returns A Promise that resolves to a CheerioAPI instance.
   */
  async scrape(): Promise<CheerioAPI> {
    const { webPath, caller, timeout, textDecoder } = this;
    return CheerioWebBaseLoader._scrape(webPath, caller, timeout, textDecoder);
  }

  /**
   * Scrapes the page, takes the text of the elements matched by the
   * configured selector, and wraps it in a single Document whose metadata
   * records the page URL under `source`.
   * @returns A Promise that resolves to a one-element array of Document.
   */
  async load(): Promise<Document[]> {
    const $ = await this.scrape();
    const pageContent = $(this.selector).text();
    const doc = new Document({
      pageContent,
      metadata: { source: this.webPath },
    });
    return [doc];
  }

  /**
   * Dynamically imports the Cheerio library and exposes its `load`
   * function. If the import fails, the underlying error is logged and an
   * Error with installation instructions is thrown instead.
   * @returns A Promise that resolves to an object containing the load function from the Cheerio library.
   */
  static async imports(): Promise<{
    load: typeof LoadT;
  }> {
    try {
      const cheerioModule = await import("cheerio");
      return { load: cheerioModule.load };
    } catch (e) {
      console.error(e);
      throw new Error(
        "Please install cheerio as a dependency with, e.g. `yarn add cheerio`"
      );
    }
  }
}