-
Notifications
You must be signed in to change notification settings - Fork 2.2k
/
cheerio.ts
158 lines (144 loc) · 4.51 KB
/
cheerio.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import type {
CheerioAPI,
CheerioOptions,
load as LoadT,
SelectorType,
} from "cheerio";
import { Document } from "@langchain/core/documents";
import {
AsyncCaller,
AsyncCallerParams,
} from "@langchain/core/utils/async_caller";
import { BaseDocumentLoader } from "../base.js";
import type { DocumentLoader } from "../base.js";
import { logVersion020MigrationWarning } from "../../util/entrypoint_deprecation.js";
// Executed once, when this module is evaluated: hands the migration-warning
// helper the name of this entrypoint and the package name supplied as
// `newPackageName` (presumably where the loader is moving to).
// NOTE(review): what the helper actually prints/does is defined in
// util/entrypoint_deprecation.js and is not visible from this file.
// The `#__PURE__` annotation is the conventional hint that lets bundlers drop
// the call when tree-shaking.
/* #__PURE__ */ logVersion020MigrationWarning({
oldEntrypointName: "document_loaders/web/cheerio",
newPackageName: "@langchain/community",
});
/**
 * Options accepted by CheerioWebBaseLoader.
 *
 * Besides the three loader-specific fields declared here, every
 * AsyncCallerParams field is allowed; the loader's constructor forwards
 * those remaining fields to the AsyncCaller it creates.
 */
export interface WebBaseLoaderParams extends AsyncCallerParams {
  /**
   * Selector whose matched elements supply the text of the resulting
   * document. Falls back to "body" when omitted.
   */
  selector?: SelectorType;

  /**
   * Decoder applied to the raw response bytes. When omitted, the body is
   * read with `Response.text()` instead, i.e. decoded as UTF-8.
   */
  textDecoder?: TextDecoder;

  /**
   * Time budget for the fetch request, in milliseconds. Falls back to
   * 10000 (10s) when omitted. A falsy value such as 0 results in no abort
   * signal being attached to the request.
   */
  timeout?: number;
}
/**
 * Document loader that downloads a single web page and extracts its text
 * with Cheerio. It extends BaseDocumentLoader and implements the
 * DocumentLoader interface.
 * @example
 * ```typescript
 * const loader = new CheerioWebBaseLoader("https://exampleurl.com");
 * const docs = await loader.load();
 * console.log({ docs });
 * ```
 */
export class CheerioWebBaseLoader
  extends BaseDocumentLoader
  implements DocumentLoader
{
  /** Fetch time budget in milliseconds; a falsy value disables the abort signal. */
  timeout: number;

  /** Caller through which the fetch request is issued. */
  caller: AsyncCaller;

  /** Selector used by `load()` to pick the elements whose text is kept. */
  selector?: SelectorType;

  /** Optional decoder for the raw response bytes. */
  textDecoder?: TextDecoder;

  /**
   * @param webPath URL of the page to load; also recorded as the `source`
   * metadata of the produced document.
   * @param fields Optional loader settings. `timeout` falls back to 10000,
   * `selector` falls back to "body", and every remaining field is handed to
   * the AsyncCaller constructor.
   */
  constructor(public webPath: string, fields?: WebBaseLoaderParams) {
    super();
    const {
      timeout: timeoutMs,
      selector: cssSelector,
      textDecoder: decoder,
      ...callerParams
    } = fields ?? {};

    this.timeout = timeoutMs ?? 10000;
    this.caller = new AsyncCaller(callerParams);
    this.selector = cssSelector ?? "body";
    this.textDecoder = decoder;
  }

  /**
   * Scrapes several URLs concurrently and parses each response with Cheerio.
   * The scrapes are combined with `Promise.all`, so the result order matches
   * `urls` and a single failing scrape rejects the whole call.
   * @param urls The URLs to fetch and parse.
   * @param caller The AsyncCaller used to issue each fetch.
   * @param timeout Per-request time budget in milliseconds; a falsy value
   * means no abort signal is attached.
   * @param textDecoder Optional decoder for the raw response bytes.
   * @param options Optional settings passed through to Cheerio's `load`.
   * @returns A Promise that resolves to one CheerioAPI instance per URL.
   */
  static async scrapeAll(
    urls: string[],
    caller: AsyncCaller,
    timeout: number | undefined,
    textDecoder?: TextDecoder,
    options?: CheerioOptions
  ): Promise<CheerioAPI[]> {
    const pendingPages = urls.map((pageUrl) =>
      CheerioWebBaseLoader._scrape(
        pageUrl,
        caller,
        timeout,
        textDecoder,
        options
      )
    );
    return Promise.all(pendingPages);
  }

  /**
   * Fetches one URL and parses the response body with Cheerio.
   * @param url The URL to fetch.
   * @param caller The AsyncCaller used to issue the fetch.
   * @param timeout Time budget in milliseconds; a falsy value means no abort
   * signal is attached.
   * @param textDecoder Optional decoder for the raw response bytes; without
   * it the body is read through `response.text()`.
   * @param options Optional settings passed through to Cheerio's `load`.
   * @returns A Promise that resolves to the CheerioAPI for the fetched markup.
   */
  static async _scrape(
    url: string,
    caller: AsyncCaller,
    timeout: number | undefined,
    textDecoder?: TextDecoder,
    options?: CheerioOptions
  ): Promise<CheerioAPI> {
    // Resolve cheerio before touching the network, so a missing dependency
    // fails without issuing a request.
    const cheerio = await CheerioWebBaseLoader.imports();

    let signal: AbortSignal | undefined;
    if (timeout) {
      signal = AbortSignal.timeout(timeout);
    }
    const response = await caller.call(fetch, url, { signal });

    // The bytes are only read when a decoder was supplied; otherwise (or if
    // the decoder yields a nullish value) the body is read as text.
    const decoded =
      textDecoder == null
        ? undefined
        : textDecoder.decode(await response.arrayBuffer());
    const html = decoded ?? (await response.text());

    return cheerio.load(html, options);
  }

  /**
   * Fetches `webPath` using this loader's caller, timeout and text decoder,
   * and parses the result with Cheerio.
   * @returns A Promise that resolves to a CheerioAPI instance.
   */
  async scrape(): Promise<CheerioAPI> {
    const { webPath, caller, timeout, textDecoder } = this;
    return CheerioWebBaseLoader._scrape(webPath, caller, timeout, textDecoder);
  }

  /**
   * Scrapes the page, collects the text of the elements matched by
   * `selector`, and wraps it in a single Document whose metadata records the
   * page URL under `source`.
   * @returns A Promise that resolves to a one-element array of Documents.
   */
  async load(): Promise<Document[]> {
    const page = await this.scrape();
    const pageContent = page(this.selector).text();
    return [
      new Document({ pageContent, metadata: { source: this.webPath } }),
    ];
  }

  /**
   * Dynamically imports the Cheerio library and exposes its `load` function.
   * If the import fails, the underlying error is logged with `console.error`
   * and an Error with installation instructions is thrown instead.
   * @returns A Promise that resolves to an object holding Cheerio's `load`.
   */
  static async imports(): Promise<{
    load: typeof LoadT;
  }> {
    try {
      const cheerio = await import("cheerio");
      return { load: cheerio.load };
    } catch (err) {
      console.error(err);
      throw new Error(
        "Please install cheerio as a dependency with, e.g. `yarn add cheerio`"
      );
    }
  }
}