Permalink
Browse files

Return confidence level when retrieving summary

  • Loading branch information...
1 parent 7aac0f0 commit cdd30f625eaedbaf47e11385666199245f31a309 Jerry Charumilind committed Jul 5, 2011
Showing with 14 additions and 1 deletion.
  1. +14 −1 readability/readability.py
View
@@ -59,6 +59,17 @@ def text_length(i):
class Unparseable(ValueError):
pass
+class Summary:
+ '''
+ The type of object returned by Document.summary(). It pairs the
+ summary markup with the confidence level we have in that summary.
+ If the confidence is low (<35), our summary may not be valid, though
+ we did our best.
+
+ Attributes:
+ confidence -- score for the summary. Document.summary() passes the
+ best candidate's 'content_score', or 0 when neither ruthless nor
+ lenient parsing worked and it falls back to the raw html.
+ html -- the cleaned article produced by Document.summary().
+ '''
+
+ def __init__(self, confidence, html):
+ # Store both values unchanged; no validation or conversion is done.
+ self.confidence = confidence
+ self.html = html
+
class Document:
TEXT_LENGTH_THRESHOLD = 25
RETRY_LENGTH = 250
@@ -111,6 +122,7 @@ def summary(self):
best_candidate = self.select_best_candidate(candidates)
if best_candidate:
+ confidence = best_candidate['content_score']
article = self.get_article(candidates, best_candidate)
else:
if ruthless:
@@ -121,6 +133,7 @@ def summary(self):
continue
else:
logging.debug("Ruthless and lenient parsing did not work. Returning raw html")
+ confidence = 0;
article = self.html.find('body') or self.html
cleaned_article = self.sanitize(article, candidates)
@@ -129,7 +142,7 @@ def summary(self):
ruthless = False
continue # try again
else:
- return cleaned_article
+ return Summary(confidence, cleaned_article)
except StandardError, e:
#logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
logging.exception('error getting summary: ' )

0 comments on commit cdd30f6

Please sign in to comment.