-
Notifications
You must be signed in to change notification settings - Fork 2.2k
/
hn.ts
84 lines (79 loc) · 3.14 KB
/
hn.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import type { CheerioAPI } from "cheerio";
import { Document } from "@langchain/core/documents";
import { CheerioWebBaseLoader } from "./cheerio.js";
import { logVersion020MigrationWarning } from "../../util/entrypoint_deprecation.js";
// Runs once when this module is evaluated: hands the migration-warning helper
// the name of this (old) entrypoint and the package that replaces it.
// NOTE(review): the helper is defined in ../../util/entrypoint_deprecation.js
// and is not visible here; presumably it logs a deprecation notice — confirm.
// NOTE(review): the `#__PURE__` marker is presumably there so bundlers may
// drop the call when this module is otherwise unused — confirm.
/* #__PURE__ */ logVersion020MigrationWarning({
oldEntrypointName: "document_loaders/web/hn",
newPackageName: "@langchain/community",
});
/**
 * A loader for Hacker News pages, built on top of CheerioWebBaseLoader.
 *
 * Item pages produce one Document per comment row; every other page is
 * treated as a listing and produces one Document per listed entry.
 *
 * Rows and spans are matched by class name (e.g. `tr.athing`) rather than by
 * exact `class` attribute value, so elements that carry additional classes
 * are still picked up.
 */
export class HNLoader extends CheerioWebBaseLoader {
  /**
   * @param webPath URL of the page to load. It is forwarded to the base
   * loader and also stored on the instance, where it is used to pick the
   * parsing strategy and as the `source` metadata of every Document.
   */
  constructor(public webPath: string) {
    super(webPath);
  }

  /**
   * Scrapes the page and converts it into Documents. If `webPath` contains
   * the substring "item", the page is parsed as a comment thread via
   * loadComments(); otherwise it is parsed as a listing via loadResults().
   *
   * NOTE: the check is a plain substring test over the whole URL, so any URL
   * that contains "item" anywhere (path or query) is handled as an item page.
   * @returns A Promise that resolves to an array of Document instances.
   */
  public async load(): Promise<Document[]> {
    const $ = await this.scrape();
    if (this.webPath.includes("item")) {
      return this.loadComments($);
    }
    return this.loadResults($);
  }

  /**
   * Builds one Document per comment row (`tr` elements carrying both the
   * "athing" and "comtr" classes). The trimmed text of the row becomes the
   * page content; the metadata holds the source URL and the `title`
   * attribute of the row with id "pagespace" (undefined when that row or
   * attribute is absent).
   * @param $ A CheerioAPI instance for the scraped page.
   * @returns An array of Document instances, in page order.
   */
  private loadComments($: CheerioAPI): Document[] {
    const title = $("tr[id='pagespace']").attr("title");
    return $("tr.athing.comtr")
      .toArray()
      .map(
        (comment) =>
          new Document({
            pageContent: $(comment).text().trim(),
            // A fresh metadata object per Document so they can be mutated
            // independently by callers.
            metadata: { source: this.webPath, title },
          })
      );
  }

  /**
   * Builds one Document per listing row (`tr` elements carrying the "athing"
   * class but not "comtr", so comment rows are never reported as results).
   * The trimmed text of the row's "titleline" span is used both as the page
   * content and as the `title` metadata; `link` is the href of the first
   * anchor inside that span and `ranking` is the text of the "rank" span.
   * @param $ A CheerioAPI instance for the scraped page.
   * @returns An array of Document instances, in page order.
   */
  private loadResults($: CheerioAPI): Document[] {
    return $("tr.athing:not(.comtr)")
      .toArray()
      .map((item) => {
        const row = $(item);
        const ranking = row.find("span.rank").text();
        // .attr() reads from the first matched anchor only.
        const link = row.find("span.titleline a").attr("href");
        const title = row.find("span.titleline").text().trim();
        return new Document({
          pageContent: title,
          metadata: {
            source: this.webPath,
            title,
            link,
            ranking,
          },
        });
      });
  }
}