-
Notifications
You must be signed in to change notification settings - Fork 2.2k
/
imsdb.ts
35 lines (32 loc) · 1.33 KB
/
imsdb.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import { Document } from "@langchain/core/documents";
import { CheerioWebBaseLoader } from "./cheerio.js";
import { logVersion020MigrationWarning } from "../../util/entrypoint_deprecation.js";
// Top-level call, so it is evaluated when this module is loaded.
// It hands the migration-warning helper the legacy entrypoint name and the
// package name; presumably the helper tells users to import this loader from
// that package instead — confirm in ../../util/entrypoint_deprecation.js.
// NOTE(review): the #__PURE__ annotation is conventionally a hint that lets
// bundlers drop the call when unused — confirm that is the intent here.
/* #__PURE__ */ logVersion020MigrationWarning({
oldEntrypointName: "document_loaders/web/imsdb",
newPackageName: "@langchain/community",
});
/**
 * Document loader for pages of the IMSDB (Internet Movie Script Database)
 * website. Built on CheerioWebBaseLoader, from which it inherits page
 * fetching and parsing.
 */
export class IMSDBLoader extends CheerioWebBaseLoader {
  /**
   * @param webPath Address of the IMSDB page to load; forwarded unchanged
   * to the base loader and later recorded as the document's source.
   */
  constructor(public webPath: string) {
    super(webPath);
  }

  /**
   * Fetches the page through the inherited scrape() method, picks the
   * table cell whose class attribute is 'scrtext', and wraps its trimmed
   * text in a single Document. The document's metadata carries the page
   * address under the `source` key.
   * @returns A one-element array holding the resulting Document.
   */
  public async load(): Promise<Document[]> {
    const page = await this.scrape();
    const scriptCell = page("td[class='scrtext']");
    const pageContent = scriptCell.text().trim();

    return [
      new Document({
        pageContent,
        metadata: { source: this.webPath },
      }),
    ];
  }
}