-
Notifications
You must be signed in to change notification settings - Fork 1
/
web_scraper.js
67 lines (54 loc) · 2.16 KB
/
web_scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
// settings for web scraper
/**
 * Page function for the browser-based web scraper.
 *
 * Extracts the title, date, opener and body text of an article page and
 * returns them only when the body mentions the keyword 'украин'
 * (matched case-insensitively).
 *
 * Requires jQuery to be available as `context.jQuery` (enable the
 * "Inject jQuery" option) and reads the global `document` of the loaded page.
 *
 * @param {object} context - Scraper context; this function reads
 *   `context.jQuery`, `context.request.url` and `context.log.info`.
 * @returns {Promise<object|undefined>} The extracted record
 *   (`url`, `pageTitle`, `pageDate`, `pageOpener`, `fullText`), which is
 *   stored to the resulting dataset, or `undefined` when the article text
 *   does not contain the keyword.
 */
async function pageFunction(context) {
  const $ = context.jQuery;
  const { url } = context.request;

  const pageTitle = $('div.article__title').first().text().trim();
  const pageDate = $('div.article__info-date').text().trim();
  const pageOpener = $('h1.article__second-title').first().text().trim();

  // The article body can be split across several `.article__text` elements;
  // concatenate their text in the order the query returns them.
  const fullText = Array.from(
    document.querySelectorAll('.article__text'),
    (element) => element.innerText,
  ).join('');

  // Print some information to the actor log (the full text is deliberately
  // not logged: it is large and already part of the returned record).
  context.log.info(`URL: ${url} TITLE: ${pageTitle} DATE: ${pageDate}`);

  // Only articles mentioning the keyword are kept.
  if (!fullText.toLowerCase().includes('украин')) {
    return undefined;
  }

  return {
    url,
    pageTitle,
    pageDate,
    pageOpener,
    fullText,
  };
}
// settings for cheerio scraper
/**
 * Page function for the Cheerio-based scraper.
 *
 * Extracts the title, date, opener and body text of an article page and
 * returns them only when the body mentions the keyword 'украин'
 * (matched case-insensitively).
 *
 * @param {object} context - Scraper context; this function reads
 *   `context.$` (the Cheerio object used to query the page),
 *   `context.request.url` and `context.log.info`.
 * @returns {Promise<object|undefined>} The extracted record
 *   (`url`, `pageTitle`, `pageDate`, `pageOpener`, `fullText`), which is
 *   stored to the resulting dataset, or `undefined` when the article text
 *   does not contain the keyword.
 */
async function pageFunction(context) {
  const { $, request, log } = context;
  // The "request" property contains information about the loaded web page.
  const { url } = request;

  // All header fields are trimmed so stored values carry no layout whitespace.
  const pageTitle = $('div.article__title').first().text().trim();
  const pageDate = $('div.article__info-date').text().trim();
  const pageOpener = $('h1.article__second-title').first().text().trim();
  const fullText = $('.article__text').text();

  // Log a short summary only; the full text is large and is already part of
  // the returned record.
  log.info('Page scraped', { url, pageTitle, pageDate });

  // Only articles mentioning the keyword are kept.
  if (!fullText.toLowerCase().includes('украин')) {
    return undefined;
  }

  return {
    url,
    pageTitle,
    pageDate,
    pageOpener,
    fullText,
  };
}