Skip to content

Commit

Permalink
Included nodes with text in html of article.parse()
Browse files Browse the repository at this point in the history
  • Loading branch information
jecarr committed May 10, 2021
1 parent 1764420 commit 7887cd3
Showing 1 changed file with 12 additions and 6 deletions.
18 changes: 12 additions & 6 deletions newspaper/outputformatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,11 @@ def get_formatted(self, top_node, extra_nodes=[]):
self.replace_with_text()
self.remove_empty_tags()
self.remove_trailing_media_div()
text = self.convert_to_text(extra_nodes)
text, html = self.convert_to_text(extra_nodes, html)
# print(self.parser.nodeToString(self.get_top_node()))
return (text, html)

def convert_to_text(self, extra_nodes):
def convert_to_text(self, extra_nodes, html_to_update):
# The current list of texts to be used for a final combined, joined text
txts = []

Expand All @@ -88,15 +88,21 @@ def _update_text_list(txt):
# For each additional node we have...
for extra in extra_nodes:
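# if its text is not in the final text and it does not have a high link density...
# NOTE(review): the rewritten condition below tests `extra.text in candidate_text` (no `not`), which contradicts this comment and the line it replaces — confirm the `not` was not dropped by accident when the `is not None` guard was added.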
if extra.text not in candidate_text and not self.extractor.is_highlink_density(extra):
if extra.text is not None and extra.text in candidate_text \
and not self.extractor.is_highlink_density(extra):
# Parse any hyperlinks and include in final text
self.parser.stripTags(extra, 'a')
_update_text_list(extra.text)
# This node's text was just added to the output, so when article HTML is being kept, append the node's cleaned HTML as well
if self.config.keep_article_html:
html_to_update += self.convert_to_html(extra)
# Return final string based on txts list
return '\n\n'.join(txts)
return '\n\n'.join(txts), html_to_update

def convert_to_html(self):
cleaned_node = self.parser.clean_article_html(self.get_top_node())
def convert_to_html(self, node=None):
if node is None:
node = self.get_top_node()
cleaned_node = self.parser.clean_article_html(node)
return self.parser.nodeToString(cleaned_node)

def add_newline_to_br(self):
Expand Down

0 comments on commit 7887cd3

Please sign in to comment.