-
Notifications
You must be signed in to change notification settings - Fork 2.1k
/
playwright.ts
129 lines (111 loc) · 3.92 KB
/
playwright.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import type { LaunchOptions, Page, Browser, Response } from "playwright";
import { Document } from "@langchain/core/documents";
import { BaseDocumentLoader } from "../base.js";
import type { DocumentLoader } from "../base.js";
import { logVersion020MigrationWarning } from "../../util/entrypoint_deprecation.js";
// Module-level deprecation hook: hands the old entrypoint name and the
// replacement package name to the migration-warning helper. The `#__PURE__`
// annotation lets bundlers treat the call as side-effect free and drop it
// when this module is otherwise unused.
/* #__PURE__ */ logVersion020MigrationWarning({
oldEntrypointName: "document_loaders/web/playwright",
newPackageName: "@langchain/community",
});
// Re-export the Playwright types that appear in this module's public
// signatures so consumers can reference them from this entrypoint.
export { Page, Browser, Response };
/**
 * Navigation options that `PlaywrightWebBaseLoader._scrape` spreads into its
 * `page.goto(url, ...)` call. Any field set here overrides the loader's own
 * defaults (`timeout: 180000`, `waitUntil: "domcontentloaded"`).
 */
export type PlaywrightGotoOptions = {
// Passed through to `page.goto` unchanged; presumably the Referer header
// value, as in Playwright's API.
referer?: string;
// Navigation timeout (presumably milliseconds, as in Playwright's API);
// the loader uses 180000 when omitted.
timeout?: number;
// Lifecycle event at which navigation is considered finished; the loader
// uses "domcontentloaded" when omitted.
waitUntil?: "load" | "domcontentloaded" | "networkidle" | "commit";
};
/**
 * @deprecated - Import from "@langchain/community/document_loaders/web/playwright" instead. This entrypoint will be removed in 0.3.0.
 *
 * Type representing a user-supplied callback that extracts content from a
 * web page using Playwright. When provided, the loader calls it after
 * navigation instead of reading `page.content()`, and uses the resolved
 * string as the scraped content.
 *
 * @param page - The page that was navigated to the target URL.
 * @param browser - The browser instance that owns `page`.
 * @param response - The value returned by `page.goto`; typed as nullable,
 * so implementations must handle `null`.
 * @returns Promise that resolves to the string to use as the scraped content.
 */
export type PlaywrightEvaluate = (
page: Page,
browser: Browser,
response: Response | null
) => Promise<string>;
/**
 * Optional configuration accepted by `PlaywrightWebBaseLoader`. Every field
 * may be omitted, in which case the loader falls back to its defaults.
 */
export type PlaywrightWebBaseLoaderOptions = {
// Spread into `chromium.launch(...)` after the loader's `headless: true`
// default, so values here take precedence.
launchOptions?: LaunchOptions;
// Spread into `page.goto(url, ...)` after the loader's default timeout and
// `waitUntil`, so values here take precedence.
gotoOptions?: PlaywrightGotoOptions;
// Custom extraction callback; when set, its result replaces `page.content()`.
evaluate?: PlaywrightEvaluate;
};
/**
 * @deprecated - Import from "@langchain/community/document_loaders/web/playwright" instead. This entrypoint will be removed in 0.3.0.
 *
 * Class representing a document loader for scraping web pages using
 * Playwright. Extends the BaseDocumentLoader class and implements the
 * DocumentLoader interface.
 */
export class PlaywrightWebBaseLoader
  extends BaseDocumentLoader
  implements DocumentLoader
{
  /** Options given at construction time, or `undefined` when none were. */
  options: PlaywrightWebBaseLoaderOptions | undefined;

  /**
   * @param webPath - URL of the page to scrape; also recorded as the
   * `source` metadata of the resulting document.
   * @param options - Optional launch, navigation and evaluation settings.
   */
  constructor(
    public webPath: string,
    options?: PlaywrightWebBaseLoaderOptions
  ) {
    super();
    // Normalise a `null` passed by untyped callers to `undefined`.
    this.options = options ?? undefined;
  }

  /**
   * Launches Chromium, navigates to `url` and returns the page content.
   * Uses `options.evaluate` to produce the content when provided, otherwise
   * falls back to `page.content()`.
   *
   * The browser is closed whether or not scraping succeeds, so a failed
   * navigation or a throwing `evaluate` callback does not leave a browser
   * process behind.
   *
   * @param url - Address of the page to load.
   * @param options - Optional launch, navigation and evaluation settings.
   * Launch options are merged over `headless: true`; goto options are merged
   * over `timeout: 180000` and `waitUntil: "domcontentloaded"`.
   * @returns Promise that resolves to the scraped content.
   * @throws Whatever the Playwright calls or the `evaluate` callback throw,
   * and an Error from `imports()` when Playwright cannot be loaded.
   */
  static async _scrape(
    url: string,
    options?: PlaywrightWebBaseLoaderOptions
  ): Promise<string> {
    const { chromium } = await PlaywrightWebBaseLoader.imports();

    const browser = await chromium.launch({
      headless: true,
      ...options?.launchOptions,
    });

    try {
      const page = await browser.newPage();

      const response = await page.goto(url, {
        timeout: 180000,
        waitUntil: "domcontentloaded",
        ...options?.gotoOptions,
      });

      // Awaited inside the try block so `finally` runs only after the
      // content has actually been read from the still-open browser.
      const bodyHTML = options?.evaluate
        ? await options.evaluate(page, browser, response)
        : await page.content();

      return bodyHTML;
    } finally {
      // Always release the browser, including on navigation timeouts and
      // errors thrown by the user-supplied evaluate callback.
      await browser.close();
    }
  }

  /**
   * Method that calls the _scrape method to perform the scraping of the web
   * page specified by the webPath property, using the options given to the
   * constructor.
   * @returns Promise that resolves to the scraped HTML content of the web page.
   */
  async scrape(): Promise<string> {
    return PlaywrightWebBaseLoader._scrape(this.webPath, this.options);
  }

  /**
   * Method that calls the scrape method and wraps the scraped content in a
   * single Document whose metadata records `webPath` as its `source`.
   * @returns Promise that resolves to an array containing that one Document.
   */
  async load(): Promise<Document[]> {
    const text = await this.scrape();

    const metadata = { source: this.webPath };

    return [new Document({ pageContent: text, metadata })];
  }

  /**
   * Static method that lazily imports the Playwright module, so that
   * Playwright is only loaded when scraping is actually performed.
   * @returns Promise that resolves to an object containing Playwright's
   * `chromium` browser type.
   * @throws Error with installation instructions when the import fails; the
   * underlying error is logged to the console first.
   */
  static async imports(): Promise<{
    chromium: typeof import("playwright").chromium;
  }> {
    try {
      const { chromium } = await import("playwright");

      return { chromium };
    } catch (e) {
      console.error(e);
      throw new Error(
        "Please install playwright as a dependency with, e.g. `yarn add playwright`"
      );
    }
  }
}