-
Notifications
You must be signed in to change notification settings - Fork 2.2k
/
puppeteer.ts
177 lines (156 loc) · 5.04 KB
/
puppeteer.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import type {
launch,
WaitForOptions,
Page,
Browser,
PuppeteerLaunchOptions,
} from "puppeteer";
import { Document } from "@langchain/core/documents";
import { BaseDocumentLoader } from "../base.js";
import type { DocumentLoader } from "../base.js";
export { Page, Browser };
/**
 * Navigation options forwarded to `page.goto` by the loader. Combines
 * Puppeteer's `WaitForOptions` with two optional string fields, `referer`
 * and `referrerPolicy`.
 *
 * NOTE(review): `referer` / `referrerPolicy` presumably mirror the fields of
 * the same name on Puppeteer's own goto options — confirm against the
 * installed Puppeteer version.
 */
export type PuppeteerGotoOptions = WaitForOptions & {
referer?: string;
referrerPolicy?: string;
};
/**
 * Signature of the optional custom extraction callback used when scraping.
 *
 * When supplied via the loader options, the callback is invoked after the
 * page has been navigated to the target URL, and the string it resolves to
 * is used as the scraped content in place of the default
 * `document.body.innerHTML`.
 *
 * @param page - The Puppeteer page that was navigated to the target URL.
 * @param browser - The Puppeteer browser instance that owns `page`.
 * @returns A Promise resolving to the string to use as the scraped content.
 */
export type PuppeteerEvaluate = (
page: Page,
browser: Browser
) => Promise<string>;
/**
 * Optional configuration accepted by `PuppeteerWebBaseLoader`.
 *
 * - `launchOptions` is spread over the loader's default arguments to
 *   Puppeteer's `launch`, so any key given here overrides the default.
 * - `gotoOptions` is spread over the loader's default arguments to
 *   `page.goto`, with the same override behaviour.
 * - `evaluate`, when set, is called during scraping instead of the default
 *   extraction of `document.body.innerHTML`.
 */
export type PuppeteerWebBaseLoaderOptions = {
launchOptions?: PuppeteerLaunchOptions;
gotoOptions?: PuppeteerGotoOptions;
evaluate?: PuppeteerEvaluate;
};
/**
 * Document loader that scrapes a web page using Puppeteer. It extends
 * BaseDocumentLoader and implements the DocumentLoader interface.
 *
 * Every scrape or screenshot launches its own browser instance, which is
 * always closed again before the call settles, whether it succeeds or fails.
 *
 * @example
 * ```typescript
 * const loader = new PuppeteerWebBaseLoader("https://exampleurl.com", {
 *   launchOptions: {
 *     headless: true,
 *   },
 *   gotoOptions: {
 *     waitUntil: "domcontentloaded",
 *   },
 * });
 * const screenshot = await loader.screenshot();
 * ```
 */
export class PuppeteerWebBaseLoader
  extends BaseDocumentLoader
  implements DocumentLoader
{
  /** Options supplied at construction time; `undefined` when none were given. */
  options: PuppeteerWebBaseLoaderOptions | undefined;

  /**
   * @param webPath - URL of the page to load.
   * @param options - Optional launch, navigation and evaluation settings.
   */
  constructor(public webPath: string, options?: PuppeteerWebBaseLoaderOptions) {
    super();
    this.options = options ?? undefined;
  }

  /**
   * Launches a browser, navigates to `url` and returns the page content.
   *
   * Defaults (`headless: true`, `defaultViewport: null`,
   * `ignoreDefaultArgs: ["--disable-extensions"]` for launch; a `timeout` of
   * 180000 and `waitUntil: "domcontentloaded"` for navigation) can each be
   * overridden through `options.launchOptions` / `options.gotoOptions`.
   *
   * @param url - URL of the page to scrape.
   * @param options - Optional loader options; `options.evaluate`, when set,
   * produces the returned string instead of `document.body.innerHTML`.
   * @returns Promise that resolves to the scraped content.
   * @throws If Puppeteer cannot be imported, or if launching, navigating or
   * evaluating fails. The browser is closed before the error propagates.
   */
  static async _scrape(
    url: string,
    options?: PuppeteerWebBaseLoaderOptions
  ): Promise<string> {
    const { launch } = await PuppeteerWebBaseLoader.imports();

    const browser = await launch({
      headless: true,
      defaultViewport: null,
      ignoreDefaultArgs: ["--disable-extensions"],
      ...options?.launchOptions,
    });

    try {
      const page = await browser.newPage();

      await page.goto(url, {
        timeout: 180000,
        waitUntil: "domcontentloaded",
        ...options?.gotoOptions,
      });

      // `await` inside the try so the browser stays open until the content
      // has actually been extracted.
      return options?.evaluate
        ? await options.evaluate(page, browser)
        : await page.evaluate(() => document.body.innerHTML);
    } finally {
      // Close on every path; otherwise a failed navigation or evaluation
      // would leave the browser process running.
      await browser.close();
    }
  }

  /**
   * Scrapes the page at `webPath` using the options given to the
   * constructor.
   * @returns Promise that resolves to the scraped HTML content of the web page.
   */
  async scrape(): Promise<string> {
    return PuppeteerWebBaseLoader._scrape(this.webPath, this.options);
  }

  /**
   * Scrapes the page and wraps the result in a single Document whose
   * metadata records the source URL.
   * @returns Promise that resolves to an array containing one Document.
   */
  async load(): Promise<Document[]> {
    const text = await this.scrape();

    const metadata = { source: this.webPath };
    return [new Document({ pageContent: text, metadata })];
  }

  /**
   * Static class method used to screenshot a web page and return
   * it as a {@link Document} object where the pageContent property
   * is the screenshot encoded in base64.
   *
   * Uses the same launch and navigation defaults as `_scrape`.
   * `options.evaluate` is not used here.
   *
   * @param url - URL of the page to capture.
   * @param options - Optional launch and navigation settings.
   * @returns Promise resolving to a document containing the screenshot of
   * the page encoded in base64, with `metadata.source` set to `url`.
   * @throws If Puppeteer cannot be imported, or if launching, navigating or
   * capturing fails. The browser is closed before the error propagates.
   */
  static async _screenshot(
    url: string,
    options?: PuppeteerWebBaseLoaderOptions
  ): Promise<Document> {
    const { launch } = await PuppeteerWebBaseLoader.imports();

    const browser = await launch({
      headless: true,
      defaultViewport: null,
      ignoreDefaultArgs: ["--disable-extensions"],
      ...options?.launchOptions,
    });

    try {
      const page = await browser.newPage();

      await page.goto(url, {
        timeout: 180000,
        waitUntil: "domcontentloaded",
        ...options?.gotoOptions,
      });

      const screenshot = await page.screenshot();
      const base64 = screenshot.toString("base64");
      const metadata = { source: url };
      return new Document({ pageContent: base64, metadata });
    } finally {
      // The browser was previously never closed on this path, leaking one
      // browser process per screenshot.
      await browser.close();
    }
  }

  /**
   * Screenshot a web page and return it as a {@link Document} object where
   * the pageContent property is the screenshot encoded in base64.
   *
   * @returns Promise resolving to a document containing the screenshot of
   * the page encoded in base64.
   */
  async screenshot(): Promise<Document> {
    return PuppeteerWebBaseLoader._screenshot(this.webPath, this.options);
  }

  /**
   * Static method that lazily imports Puppeteer, so the dependency is only
   * needed when the loader is actually used.
   * @returns Promise that resolves to an object containing Puppeteer's
   * `launch` function.
   * @throws Error with installation instructions when the dynamic import
   * fails; the underlying error is logged with `console.error` first.
   */
  static async imports(): Promise<{
    launch: typeof launch;
  }> {
    try {
      // eslint-disable-next-line import/no-extraneous-dependencies
      const { launch } = await import("puppeteer");

      return { launch };
    } catch (e) {
      console.error(e);
      throw new Error(
        "Please install puppeteer as a dependency with, e.g. `yarn add puppeteer`"
      );
    }
  }
}