// gitbook.ts
import type { CheerioAPI } from "cheerio";
import { Document } from "@langchain/core/documents";
import { CheerioWebBaseLoader } from "./cheerio.js";

/**
* Interface representing the parameters for configuring the
* GitbookLoader. It has an optional property shouldLoadAllPaths, which
* indicates whether all paths should be loaded.
*/
interface GitbookLoaderParams {
shouldLoadAllPaths?: boolean;
}

/**
* Class representing a document loader specifically designed for loading
* documents from Gitbook. It extends the CheerioWebBaseLoader.
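*
* @example
* A minimal usage sketch (illustrative only; the import path below assumes the
* usual "@langchain/community" package layout and may differ in your project):
* ```typescript
* import { GitbookLoader } from "@langchain/community/document_loaders/web/gitbook";
*
* // Load a single Gitbook page.
* const loader = new GitbookLoader("https://docs.gitbook.com");
* const docs = await loader.load();
*
* // Load every page listed in the site's sitemap.xml.
* const sitemapLoader = new GitbookLoader("https://docs.gitbook.com", {
*   shouldLoadAllPaths: true,
* });
* const allDocs = await sitemapLoader.load();
* ```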
*/
export class GitbookLoader extends CheerioWebBaseLoader {
shouldLoadAllPaths = false;
private readonly baseUrl: string;
constructor(public webPath: string, params: GitbookLoaderParams = {}) {
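// When all paths are requested, point the underlying loader at the site's
// sitemap.xml so that every page URL can be discovered.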
const path =
params.shouldLoadAllPaths === true ? `${webPath}/sitemap.xml` : webPath;
super(path);
this.baseUrl = webPath;
this.webPath = path;
this.shouldLoadAllPaths =
params.shouldLoadAllPaths ?? this.shouldLoadAllPaths;
}

/**
* Method that scrapes the web document using Cheerio and loads the
* content based on the value of shouldLoadAllPaths. If shouldLoadAllPaths
* is true, it calls the loadAllPaths() method to load all paths.
* Otherwise, it calls the loadPath() method to load a single path.
* @returns Promise resolving to an array of Document instances.
*/
public async load(): Promise<Document[]> {
const $ = await this.scrape();
if (this.shouldLoadAllPaths === true) {
return this.loadAllPaths($);
}
return this.loadPath($);
}

/**
* Private method that loads the content of a single path from the Gitbook
* web document. It extracts the page content by selecting all elements
* inside the "main" element, filters out empty text nodes, and joins the
* remaining text nodes with line breaks. It extracts the title by
* selecting the first "h1" element inside the "main" element. It creates
* a Document instance with the extracted page content and metadata
* containing the source URL and title.
* @param $ CheerioAPI instance representing the loaded web document.
* @param url Optional string representing the URL of the web document.
* @returns Array of Document instances.
*/
private loadPath($: CheerioAPI, url?: string): Document[] {
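// Keep only the text nodes found under <main>, drop empty strings, and join
// the remaining text with newlines to form the page content.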
const pageContent = $("main *")
.contents()
.toArray()
.map((element) =>
element.type === "text" ? $(element).text().trim() : null
)
.filter((text) => text)
.join("\n");
const title = $("main h1").first().text().trim();
return [
new Document({
pageContent,
metadata: { source: url ?? this.webPath, title },
}),
];
}

/**
* Private method that loads the content of all paths from the Gitbook web
* document. It extracts the URLs of all paths from the "loc" elements in
* the sitemap.xml. It iterates over each URL, scrapes the web document
* using the _scrape() method, and calls the loadPath() method to load the
* content of each path. It collects all the loaded documents and returns
* them as an array.
* @param $ CheerioAPI instance representing the loaded web document.
* @returns Promise resolving to an array of Document instances.
*/
private async loadAllPaths($: CheerioAPI): Promise<Document[]> {
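// Each <loc> element in the sitemap holds the URL of one page.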
const urls = $("loc")
.toArray()
.map((element) => $(element).text());
const documents: Document[] = [];
for (const url of urls) {
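// Sitemap entries are usually absolute URLs; if one is relative, prefix it
// with the original base URL before fetching.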
const buildUrl = url.includes(this.baseUrl) ? url : this.baseUrl + url;
console.log(`Fetching text from ${buildUrl}`);
const html = await GitbookLoader._scrape(
buildUrl,
this.caller,
this.timeout
);
documents.push(...this.loadPath(html, buildUrl));
}
console.log(`Fetched ${documents.length} documents.`);
return documents;
}
}