[NewsDownloader] Added an HTML filter through a CSS selector #6228

Merged · 11 commits · Jun 4, 2020
55 changes: 51 additions & 4 deletions plugins/newsdownloader.koplugin/epubdownloadbackend.lua
@@ -24,6 +24,54 @@ local max_redirects = 5; --prevent infinite redirects
local TIMEOUT_CODE = "timeout" -- from socket.lua
local MAXTIME_CODE = "maxtime reached" -- from sink_table_with_maxtime

-- Filter the HTML content through a CSS selector, keeping only the first
-- matching element; fall back to a list of common article-content selectors.
local function filter(text, element)
    local htmlparser = require("htmlparser")
    local root = htmlparser.parse(text, 10000)
    local filtered = nil
    local selectors = {
        "main",
        "article",
        "div#main",
        "#main-article",
        ".main-content",
        "#body",
        "#content",
        ".content",
        "div#article",
        "div.article",
        "div.post",
        "div.post-outer",
        ".l-root",
        ".content-container",
        ".StandardArticleBody_body",
        "div#article-inner",
        "div#newsstorytext",
        "div.general",
    }
    if element then
        -- A user-configured selector takes precedence over the defaults.
        table.insert(selectors, 1, element)
    end
    for _, sel in ipairs(selectors) do
        local elements = root:select(sel)
        if elements then
            for _, e in ipairs(elements) do
                filtered = e:getcontent()
                if filtered then
                    break
                end
            end
            if filtered then
                break
            end
        end
    end
    if not filtered then
        -- Nothing matched: return the page unchanged.
        return text
    end
    return "<!DOCTYPE html><html><head></head><body>" .. filtered .. "</body></html>"
end
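
A minimal sketch of how the filter behaves (illustrative only: filter() is local to epubdownloadbackend.lua, so in practice it is reached through createEpub; the HTML and selectors below are made up):

    local page = [[<html><body>
      <nav>Menu</nav>
      <article><p>Story text.</p></article>
    </body></html>]]

    -- Default list: "article" matches, so only its content is kept,
    -- rewrapped in a bare skeleton document:
    -- "<!DOCTYPE html><html><head></head><body><p>Story text.</p></body></html>"
    local trimmed = filter(page, nil)

    -- A user-supplied selector is tried before the built-in list:
    local nav_only = filter(page, "nav")   -- body contains "Menu"

    -- No selector matches at all: the page comes back unchanged.
    local unchanged = filter("<p>bare</p>", "#does-not-exist")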

-- Sink that stores into a table, aborting if maxtime has elapsed
local function sink_table_with_maxtime(t, maxtime)
-- Start counting as soon as this sink is created
@@ -181,15 +229,14 @@ local ext_to_mimetype = {
ttf = "application/truetype",
woff = "application/font-woff",
}

-- Create an epub file (with possibly images)
function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message)
function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, message, filter_enable, filter_element)
logger.dbg("EpubDownloadBackend:createEpub(", epub_path, ")")
-- Use Trapper to display progress and ask questions through the UI.
-- We need to have been Trapper.wrap()'ed for UI to be used, otherwise
-- Trapper:info() and Trapper:confirm() will just use logger.
local UI = require("ui/trapper")

-- We may need to build absolute urls for non-absolute links and images urls
local base_url = socket_url.parse(url)

@@ -201,7 +248,7 @@ function EpubDownloadBackend:createEpub(epub_path, html, url, include_images, me
-- Not sure if this bookid may ever be used by indexing software/calibre, but if it is,
-- should it change if content is updated (as now, including the wikipedia revisionId),
-- or should it stay the same even if revid changes (content of the same book updated).

if filter_enable then html = filter(html, filter_element) end
local images = {}
local seen_images = {}
local imagenum = 1
9 changes: 8 additions & 1 deletion plugins/newsdownloader.koplugin/feed_config.lua
@@ -21,12 +21,19 @@ return {--do NOT change this line
-- 'include_images=false' - means ignore any images, only download the text (faster download, smaller file sizes)
-- default value is 'false' (if no 'include_images' entry)

-- 'enable_filter=true' - means the page is filtered through a CSS selector, limiting the downloaded article to just that element (does not apply if download_full_article=false)
-- 'enable_filter=false' - means no such filtering; the whole document is kept
-- default value is 'true' (if no 'enable_filter' entry)

-- 'filter_element="name_of_css.element.class"' - means filter on the given CSS selector; a suitable selector can be picked easily with a modern web browser's developer tools
-- default value is empty, in which case a built-in list of common content selectors is tried; that list is also used as a fallback if this value is set but does not match
-- (an example entry using these options follows the feed list below)

-- comment out line ("--" at line start) to stop downloading source


-- LIST YOUR FEEDS HERE:

{ "http://feeds.reuters.com/Reuters/worldNews?format=xml", limit = 2, download_full_article=true},
{ "http://feeds.reuters.com/Reuters/worldNews?format=xml", limit = 2, download_full_article=true, include_images=true, enable_filter=true},

{ "https://www.pcworld.com/index.rss", limit = 7 , download_full_article=false},

22 changes: 12 additions & 10 deletions plugins/newsdownloader.koplugin/main.lua
@@ -199,10 +199,12 @@ function NewsDownloader:loadConfigAndProcessFeeds()
local limit = feed.limit
local download_full_article = feed.download_full_article == nil or feed.download_full_article
local include_images = not never_download_images and feed.include_images
local enable_filter = feed.enable_filter or feed.enable_filter == nil -- filtering defaults to on when the key is absent
local filter_element = feed.filter_element -- nil falls back to the built-in selector list
if url and limit then
local feed_message = T(_("Processing %1/%2:\n%3"), idx, total_feed_entries, BD.url(url))
UI:info(feed_message)
NewsDownloader:processFeedSource(url, tonumber(limit), unsupported_feeds_urls, download_full_article, include_images, feed_message)
NewsDownloader:processFeedSource(url, tonumber(limit), unsupported_feeds_urls, download_full_article, include_images, feed_message, enable_filter, filter_element)
else
logger.warn('NewsDownloader: invalid feed config entry', feed)
end
@@ -230,7 +232,7 @@ function NewsDownloader:loadConfigAndProcessFeedsWithUI()
end)
end

function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, download_full_article, include_images, message)
function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, download_full_article, include_images, message, enable_filter, filter_element)

local ok, response = pcall(function()
return DownloadBackend:getResponseAsString(url)
@@ -250,11 +252,11 @@ function NewsDownloader:processFeedSource(url, limit, unsupported_feeds_urls, do

if is_atom then
ok = pcall(function()
return self:processAtom(feeds, limit, download_full_article, include_images, message)
return self:processAtom(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
end)
elseif is_rss then
ok = pcall(function()
return self:processRSS(feeds, limit, download_full_article, include_images, message)
return self:processRSS(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
end)
end
if not ok or (not is_rss and not is_atom) then
@@ -280,7 +282,7 @@ function NewsDownloader:deserializeXMLString(xml_str)
return xmlhandler.root
end

function NewsDownloader:processAtom(feeds, limit, download_full_article, include_images, message)
function NewsDownloader:processAtom(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
local feed_output_dir = string.format("%s%s/",
news_download_dir_path,
util.getSafeFilename(getFeedTitle(feeds.feed.title)))
@@ -294,14 +296,14 @@ function NewsDownloader:processAtom(feeds, limit, download_full_article, include
end
local article_message = T(_("%1\n\nFetching article %2/%3:"), message, index, limit == 0 and #feeds.rss.channel.item or limit)
if download_full_article then
self:downloadFeed(feed, feed_output_dir, include_images, article_message)
self:downloadFeed(feed, feed_output_dir, include_images, article_message, enable_filter, filter_element)
else
self:createFromDescription(feed, feed.content[1], feed_output_dir, include_images, article_message)
end
end
end

function NewsDownloader:processRSS(feeds, limit, download_full_article, include_images, message)
function NewsDownloader:processRSS(feeds, limit, download_full_article, include_images, message, enable_filter, filter_element)
local feed_output_dir = ("%s%s/"):format(
news_download_dir_path, util.getSafeFilename(util.htmlEntitiesToUtf8(feeds.rss.channel.title)))
if not lfs.attributes(feed_output_dir, "mode") then
@@ -314,7 +316,7 @@ function NewsDownloader:processRSS(feeds, limit, download_full_article, include_
end
local article_message = T(_("%1\n\nFetching article %2/%3:"), message, index, limit == 0 and #feeds.rss.channel.item or limit)
if download_full_article then
self:downloadFeed(feed, feed_output_dir, include_images, article_message)
self:downloadFeed(feed, feed_output_dir, include_images, article_message, enable_filter, filter_element)
else
self:createFromDescription(feed, feed.description, feed_output_dir, include_images, article_message)
end
Expand All @@ -341,7 +343,7 @@ local function getTitleWithDate(feed)
return title
end

function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, message)
function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, message, enable_filter, filter_element)
local title_with_date = getTitleWithDate(feed)
local news_file_path = ("%s%s%s"):format(feed_output_dir,
title_with_date,
@@ -355,7 +357,7 @@ function NewsDownloader:downloadFeed(feed, feed_output_dir, include_images, mess
local article_message = T(_("%1\n%2"), message, title_with_date)
local link = getFeedLink(feed.link)
local html = DownloadBackend:loadPage(link)
DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message)
DownloadBackend:createEpub(news_file_path, html, link, include_images, article_message, enable_filter, filter_element)
end
end
