-
Notifications
You must be signed in to change notification settings - Fork 0
/
util.js
96 lines (76 loc) · 2.55 KB
/
util.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
let cheerio = require("cheerio");
let request = require("request");
/**
 * Fetch a page with `request` and expose the outcome as a Promise.
 *
 * @param {string} url - Address handed straight to `request`.
 * @param {Function} [callback] - Accepted for interface compatibility; this
 *   function never invokes it.
 * @returns {Promise<{response: Object, data: string}>} Resolves with the
 *   response object and the body that `request` passed to its callback;
 *   rejects with the error `request` reported.
 */
function scrape(url, callback) {
  return new Promise((resolve, reject) => {
    request(url, (error, response, html) => {
      // Success path first; anything with an error falls through to reject.
      if (!error) {
        resolve({ response: response, data: html });
        return;
      }
      reject(error);
    });
  });
}
/**
 * Collect the candidate links from an article page's body text.
 *
 * Only anchors that are direct children of a `<p>` inside `#mw-content-text`
 * and that sit outside any open parenthesis are kept. Anchors carrying the
 * `extiw` class are skipped.
 *
 * @param {string} html - Markup of the article page, passed to `cheerio.load`.
 * @returns {Promise<Array>} Resolves with the matching anchor nodes in
 *   document order (an empty array when nothing matches, including when the
 *   article has no paragraphs). Rejects with the string
 *   "Article doesn't exist!" when the page contains a `.noarticletext` element.
 */
function valid_links(html) {
  return new Promise(function(resolve, reject) {
    const dom = cheerio.load(html);
    const article = dom("#mw-content-text");
    const anchors = [];

    if (article.find(".noarticletext").length) {
      return reject("Article doesn't exist!");
    }

    // Strip boxes and tables so that only the running text is scanned.
    article.find(".infobox, .metadata, .tright, >table").remove();
    const paragraphs = article.find("p").get();

    // Links are only valid if they are not within a pair of parens. It turns out
    // that `html.replace(/(\([^)]+\))/g)` is a bit aggressive, so instead this
    // counts the number of each open/close paren and keeps track of how many
    // are currently open for each paragraph. If the count is 0, we know that
    // the link is in a valid position.
    paragraphs.forEach(function(p) {
      let bracket_count = 0;

      p.childNodes.forEach(function(node) {
        if (bracket_count === 0 && node.type === "tag" && node.name === "a") {
          // links to other sites don't count
          if (!node.attribs.class || node.attribs.class.split(" ").indexOf("extiw") === -1) {
            anchors.push(node);
          }
        }

        if (node.type === "text") {
          const left_parens = node.data.match(/\(/g);
          const right_parens = node.data.match(/\)/g);
          const left_paren_count = left_parens ? left_parens.length : 0;
          const right_paren_count = right_parens ? right_parens.length : 0;
          bracket_count = bracket_count + left_paren_count - right_paren_count;
        }
      });
    });

    // Settle exactly once, after every paragraph has been scanned. Resolving
    // from inside the loop left the promise pending forever when the article
    // had no paragraphs at all.
    resolve(anchors);
  });
}
// Concatenate the data of every text node at or beneath `node`, in order.
function collect_text(node) {
  if (node.type === "text") {
    return node.data || "";
  }
  return (node.children || []).map(collect_text).join("");
}

// Pick a display title for an anchor: its leading text node when there is one,
// otherwise nested text, then the `title` attribute, then `fallback`.
function anchor_title(anchor, fallback) {
  const children = anchor.children || [];
  const first = children[0];
  if (first && typeof first.data === "string") {
    return first.data;
  }
  const nested = children.map(collect_text).join("");
  return nested || anchor.attribs.title || fallback;
}

/**
 * Choose the next article to visit from a list of candidate anchors.
 *
 * Only the first anchor is considered. Its `href` has a leading `/wiki/`
 * removed and the remainder is used as the article identifier.
 *
 * @param {Array} anchors - Anchor nodes exposing `attribs.href` and `children`.
 * @param {Array<string>} previsited_anchors - Identifiers already visited, in
 *   the same form as the returned `url` (no `/wiki/` prefix).
 * @returns {Promise<{title: string, url: string}>} Resolves with the link text
 *   and identifier. Rejects with "Dead end :(" when `anchors` is empty and
 *   with "Stuck in a loop - there's no Philosophy to be found here" when the
 *   identifier is already in `previsited_anchors`. An anchor without an
 *   `href` still rejects with the TypeError raised while reading it.
 */
function get_next_topic(anchors, previsited_anchors) {
  return new Promise(function (resolve, reject) {
    if (anchors.length === 0) {
      reject("Dead end :(");
      return;
    }

    const next_anchor = anchors[0];
    const url = next_anchor.attribs.href.replace(/^\/wiki\//, "");

    if (previsited_anchors.indexOf(url) !== -1) {
      reject("Stuck in a loop - there's no Philosophy to be found here");
      return;
    }

    resolve({
      // Link text is not always a bare text node (e.g. italicised names), so
      // fall back instead of reporting an undefined title.
      title: anchor_title(next_anchor, url),
      url: url
    });
  });
}
module.exports = { get_next_topic, scrape, valid_links }