[NewsDownloader] Added an HTML filter through a CSS selector #6228

Merged · 11 commits · Jun 4, 2020
55 changes: 51 additions & 4 deletions plugins/newsdownloader.koplugin/epubdownloadbackend.lua
@@ -24,6 +24,54 @@ local max_redirects = 5; --prevent infinite redirects
local TIMEOUT_CODE = "timeout" -- from socket.lua
local MAXTIME_CODE = "maxtime reached" -- from sink_table_with_maxtime

-- Filter the HTML content through a CSS selector, keeping only the first
-- matching element; fall back to a list of common article-content selectors.
local function filter(text, element)
    local htmlparser = require("htmlparser")
    local root = htmlparser.parse(text, 10000)
    local filtered = nil
    local selectors = {
        "main",
        "article",
        "div#main",
        "#main-article",
        ".main-content",
        "#body",
        "#content",
        ".content",
        "div#article",
        "div.article",
        "div.post",
        "div.post-outer",
        ".l-root",
        ".content-container",
        ".StandardArticleBody_body",
        "div#article-inner",
        "div#newsstorytext",
        "div.general",
    }
    if element then
        -- A user-configured selector takes precedence over the defaults.
        table.insert(selectors, 1, element)
    end
    for _, sel in ipairs(selectors) do
        local elements = root:select(sel)
        if elements then
            for _, e in ipairs(elements) do
                filtered = e:getcontent()
                if filtered then
                    break
                end
            end
            if filtered then
                break
            end
        end
    end
    if not filtered then
        -- Nothing matched: return the page unchanged.
        return text
    end
    return "<!DOCTYPE html><html><head></head><body>" .. filtered .. "</body></html>"
end
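
A minimal sketch of how the filter behaves (illustrative only: filter() is local to epubdownloadbackend.lua, so in practice it is reached through createEpub; the HTML and selectors below are made up):

    local page = [[<html><body>
      <nav>Menu</nav>
      <article><p>Story text.</p></article>
    </body></html>]]

    -- Default list: "article" matches, so only its content is kept,
    -- rewrapped in a bare skeleton document:
    -- "<!DOCTYPE html><html><head></head><body><p>Story text.</p></body></html>"
    local trimmed = filter(page, nil)

    -- A user-supplied selector is tried before the built-in list:
    local nav_only = filter(page, "nav")   -- body contains "Menu"

    -- No selector matches at all: the page comes back unchanged.
    local unchanged = filter("<p>bare</p>", "#does-not-exist")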

-- Sink that stores into a table, aborting if maxtime has elapsed
local function sink_table_with_maxtime(t, maxtime)
-- Start counting as soon as this sink is created
@@ -181,15 +229,14 @@ local ext_to_mimetype = {
ttf = "application/truetype",
woff = "application/font-woff",
}

-- Create an epub file (with possibly images)
function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message)
function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message, filter_enable, filter_element)
logger.dbg("EpubDownloadBackend:createEpub(", epub_path, ")")
-- Use Trapper to display progress and ask questions through the UI.
-- We need to have been Trapper.wrap()'ed for UI to be used, otherwise
-- Trapper:info() and Trapper:confirm() will just use logger.
local UI = require("ui/trapper")

-- We may need to build absolute urls for non-absolute links and images urls
local base_url = socket_url.parse(url)

@@ -201,7 +248,7 @@ function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, me
-- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,
-- should it change if content is updated (as now, including the wikipedia revisionId),
-- or should it stay the same even if revid changes (content of the same book updated).

if filter_enable then html = filter(html, filter_element) end
local images = {}
local seen_images = {}
local imagenum = 1
9 changes: 8 additions & 1 deletion plugins/newsdownloader.koplugin/feed_config.lua
@@ -21,12 +21,19 @@ return {--do NOT change this line
-- 'include_images=false' - means ignore any images, only download the text (faster download, smaller file sizes)
-- default value is 'false' (if no 'include_images' entry)

-- 'enable_filter=true' - means the page is filtered through a CSS selector, limiting the downloaded article to just that element (does not apply if download_full_article=false)
-- 'enable_filter=false' - means no such filtering; the whole document is kept
-- default value is 'true' (if no 'enable_filter' entry)

-- 'filter_element="name_of_css.element.class"' - means filter on the given CSS selector; a suitable selector can be picked easily with a modern web browser's developer tools
-- default value is empty, in which case a built-in list of common content selectors is tried; that list is also used as a fallback if this value is set but does not match
-- (an example entry using these options follows the feed list below)

-- comment out line ("--" at line start) to stop downloading source


-- LIST YOUR FEEDS HERE:

{ "http://feeds.reuters.com/Reuters/worldNews?format=xml", limit = 2, download_full_article=true},
{ "http://feeds.reuters.com/Reuters/worldNews?format=xml", limit = 2, download_full_article=true, include_images=true, enable_filter=true},

{ "https://www.pcworld.com/index.rss", limit = 7 , download_full_article=false},

22 changes: 12 additions & 10 deletions plugins/newsdownloader.koplugin/main.lua
@@ -199,10 +199,12 @@ function NewsDownloader:loadConfigAndProcessFeeds()
local limit = feed.limit
local download_full_article = feed.download_full_article == nil or feed.download_full_article
local include_images = not never_download_images and feed.include_images
local enable_filter = feed.enable_filter or feed.enable_filter == nil -- filtering defaults to on when the key is absent
local filter_element = feed.filter_element -- nil falls back to the built-in selector list
if url and limit then
local feed_message = T(_("Processing %1/%2:\n%3"), idx, total_feed_entries, BD.url(url))
UI:info(feed_message)
NewsDownloader:processFeedSource(url, tonumber(limit), unsupported_feeds_urls, download_full_article, include_images, feed_message)
NewsDownloader:processFeedSource(url, tonumber(limit), unsupported_feeds_urls, download_full_article, include_images, feed_message, enable_filter, filter_element)
else
logger.warn('NewsDownloader: invalid feed config entry', feed)
end
@@ -230,7 +232,7 @@ function NewsDownloader:loadConfigAndProcessFeedsWithUI()
end)
end

function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, download_full_article, include_images, message)
function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, download_full_article, include_images, message, enable_filter, filter_element)

local ok, response = pcall(function()
return DownloadBackend:getResponseAsString(url)
@@ -250,11 +252,11 @@ function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, do

if is_atom then
ok = pcall(function()
return self:processAtom(feeds, limit, download_full_article, include_images, message)
return self:processAtom(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
end)
elseif is_rss then
ok = pcall(function()
return self:processRSS(feeds, limit, download_full_article, include_images, message)
return self:processRSS(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
end)
end
if not ok or (not is_rss and not is_atom) then
@@ -280,7 +282,7 @@ function NewsDownloader:deserializeXMLString(xml_str)
return xmlhandler.root
end

function NewsDownloader:processAtom(feeds, limit, download_full_article, include_images, message)
function NewsDownloader:processAtom(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
local feed_output_dir = string.format("%s%s/",
news_download_dir_path,
util.getSafeFilename(getFeedTitle(feeds.feed.title)))
@@ -294,14 +296,14 @@ function NewsDownloader:processAtom(feeds, limit, download_full_article, include
end
local article_message = T(_("%1\n\nFetching article %2/%3:"), message, index, limit == 0 and #feeds.rss.channel.item or limit)
if download_full_article then
self:downloadFeed(feed, feed_output_dir, include_images, article_message)
self:downloadFeed(feed, feed_output_dir, include_images, article_message, enable_filter, filter_element)
else
self:createFromDescription(feed, feed.content[1], feed_output_dir, include_images, article_message)
end
end
end

function NewsDownloader:processRSS(feeds, limit, download_full_article, include_images, message)
function NewsDownloader:processRSS(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
local feed_output_dir = ("%s%s/"):format(
news_download_dir_path, util.getSafeFilename(util.htmlEntitiesToUtf8(feeds.rss.channel.title)))
if not lfs.attributes(feed_output_dir, "mode") then
@@ -314,7 +316,7 @@ function NewsDownloader:processRSS(feeds, limit, download_full_article, include_
end
local article_message = T(_("%1\n\nFetching article %2/%3:"), message, index, limit == 0 and #feeds.rss.channel.item or limit)
if download_full_article then
self:downloadFeed(feed, feed_output_dir, include_images, article_message)
self:downloadFeed(feed, feed_output_dir, include_images, article_message, enable_filter, filter_element)
else
self:createFromDescription(feed, feed.description, feed_output_dir, include_images, article_message)
end
Expand All @@ -341,7 +343,7 @@ local function getTitleWithDate(feed)
return title
end

function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, message)
function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, message, enable_filter, filter_element)
local title_with_date = getTitleWithDate(feed)
local news_file_path = ("%s%s%s"):format(feed_output_dir,
title_with_date,
@@ -355,7 +357,7 @@ function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, mess
local article_message = T(_("%1\n%2"), message, title_with_date)
local link = getFeedLink(feed.link)
local html = DownloadBackend:loadPage(link)
DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message)
DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message, enable_filter, filter_element)
end
end
