-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract-article.js
128 lines (106 loc) · 2.77 KB
/
extract-article.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import { extract, addTransformations } from "@extractus/article-extractor";
import { convert } from "html-to-text";
import cheerio from "cheerio";
import fsPromises from "fs/promises";
import fs from "fs";
/**
 * Convert an HTML fragment to plain text using html-to-text's `convert`.
 *
 * Headings (h1-h3) are configured with `uppercase: false` and anchors with
 * `ignoreHref: true`; `wordwrap` is set to 0.
 *
 * @param {string} html - HTML markup to convert.
 * @returns {string} The value returned by `convert` for the given markup.
 */
function convertHtmlToText(html) {
  // All heading levels share the same formatter options.
  const headingRules = ["h1", "h2", "h3"].map((selector) => ({
    selector,
    options: { uppercase: false },
  }));
  const anchorRule = { selector: "a", options: { ignoreHref: true } };

  return convert(html, {
    wordwrap: 0,
    selectors: [...headingRules, anchorRule],
  });
}
// Set once the Wikipedia clean-up transformation has been registered, so that
// repeated extractArticle() calls do not register duplicates.
let isWikipediaTransformationRegistered = false;

/**
 * Register, at most once, a transformation that strips non-prose elements
 * (figures, images, reference markers, infoboxes, edit links) from Wikipedia
 * pages before extraction.
 */
function registerWikipediaTransformation() {
  if (isWikipediaTransformationRegistered) {
    return;
  }

  const selectorsToRemove = [
    "figure",
    "img",
    "figcaption",
    "sup.reference",
    "sup.noprint",
    "div.thumb",
    "table.infobox",
    "ol.references",
    ".mw-editsection",
  ];

  addTransformations({
    // Dots are escaped so that only a literal "wikipedia.org" host matches.
    patterns: [/(\w+\.)?wikipedia\.org/],
    pre: (document) => {
      selectorsToRemove.forEach((selector) => {
        document.querySelectorAll(selector).forEach((elem) => {
          elem.remove();
        });
      });
      return document;
    },
    post: (document) => document,
  });

  isWikipediaTransformationRegistered = true;
}

/**
 * Split extracted article HTML into chapters, one per chapter heading.
 *
 * All <div> wrappers are unwrapped first. Chapters are delimited by <h1> when
 * the article contains more than one <h1>, otherwise by <h2>. When the first
 * element of the body is not such a heading, a heading holding the article
 * title is prepended so the leading content forms a chapter of its own.
 *
 * @param {string} articleHtml - Extracted article markup.
 * @param {string} articleTitle - Title used for the synthetic first heading.
 * @returns {{ title: string, html: string }[]} Chapter titles with their HTML
 *   (the heading plus its following siblings up to the next chapter heading).
 */
function splitHtmlIntoChapters(articleHtml, articleTitle) {
  const $ = cheerio.load(articleHtml);

  $("div").each((_, divTag) => {
    $(divTag).replaceWith($(divTag).contents());
  });

  const chapterHeadingSelector = $("h1").length > 1 ? "h1" : "h2";
  // Debug trace of the heading level chosen as chapter delimiter.
  console.log({ chapterHeadingSelector });

  const firstTagIsHeading = $("body")
    .children()
    .first()
    .is(chapterHeadingSelector);
  if (!firstTagIsHeading) {
    // Set the title through .text() so characters such as "<" or "&" are
    // escaped instead of being parsed as markup.
    const titleHeading = $(
      `<${chapterHeadingSelector}></${chapterHeadingSelector}>`
    ).text(articleTitle);
    $("body").prepend(titleHeading);
  }

  return $(chapterHeadingSelector)
    .toArray()
    .map((headingTag) => {
      const contentTags = $(headingTag)
        .nextUntil(chapterHeadingSelector)
        .addBack();
      return {
        title: $(headingTag).text().trim(),
        html: $.html(contentTags).trim(),
      };
    });
}

/**
 * Fetch the article at `url` and split it into plain-text chapters.
 *
 * Side effects: writes the extracted HTML to `./tmp-html.html` and each
 * chapter's HTML to `./tmp-html-chapter-<index>.html` in the current working
 * directory, and logs the chosen heading selector and the resulting chapters.
 *
 * @param {string} url - Address of the article to extract.
 * @returns {Promise<{ title: string, text: string }[]>} One entry per chapter,
 *   in document order.
 * @throws {Error} When the extractor yields no article content for `url`.
 */
export async function extractArticle(url) {
  registerWikipediaTransformation();

  const article = await extract(url);
  if (article == null || typeof article.content !== "string") {
    throw new Error(`Could not extract article content from ${url}`);
  }

  const articleHtml = article.content;
  // A missing title becomes an empty heading rather than the text "undefined".
  const articleTitle = typeof article.title === "string" ? article.title : "";

  const tmpHtmlPath = "./tmp-html.html";
  await fsPromises.writeFile(tmpHtmlPath, articleHtml);

  const chapterSections = splitHtmlIntoChapters(articleHtml, articleTitle);

  // Write the per-chapter dumps without blocking the event loop.
  await Promise.all(
    chapterSections.map((section, headingIndex) =>
      fsPromises.writeFile(
        `./tmp-html-chapter-${headingIndex}.html`,
        section.html
      )
    )
  );

  const chapters = chapterSections.map((section) => ({
    title: section.title,
    text: convertHtmlToText(section.html),
  }));

  console.log(chapters);
  return chapters;
}