|
|
@@ -59,6 +59,17 @@ def text_length(i): |
|
|
class Unparseable(ValueError):
    '''
    Error type for this module, derived from ValueError so callers that
    already catch ValueError keep working.

    NOTE(review): presumably raised when a document cannot be parsed; the
    raise sites are not visible in this part of the file - confirm.
    '''
|
|
|
|
|
|
class Summary(object):
    '''
    The type of object returned by Document.summary(). This includes the
    confidence level we have in our summary. If this is low (<35), our summary
    may not be valid, though we did our best.

    Attributes:
        confidence: score attached to the summary. Document.summary() passes
            the best candidate's 'content_score', or 0 when it falls back to
            returning the raw html.
        html: the cleaned article that Document.summary() produced.
    '''

    # Scores strictly below this value are the "low" confidence case described
    # above; kept as a class attribute so callers need not hard-code 35.
    LOW_CONFIDENCE = 35

    def __init__(self, confidence, html):
        self.confidence = confidence
        self.html = html

    def is_low_confidence(self):
        '''Return True when the confidence is below LOW_CONFIDENCE.'''
        return self.confidence < self.LOW_CONFIDENCE

    def __repr__(self):
        return '%s(confidence=%r, html=%r)' % (
            type(self).__name__, self.confidence, self.html)
|
|
|
+
|
|
|
class Document:
|
|
|
TEXT_LENGTH_THRESHOLD = 25
|
|
|
RETRY_LENGTH = 250
|
|
|
@@ -111,6 +122,7 @@ def summary(self): |
|
|
|
|
|
best_candidate = self.select_best_candidate(candidates)
|
|
|
if best_candidate:
|
|
|
+ confidence = best_candidate['content_score']
|
|
|
article = self.get_article(candidates, best_candidate)
|
|
|
else:
|
|
|
if ruthless:
|
|
|
@@ -121,6 +133,7 @@ def summary(self): |
|
|
continue
|
|
|
else:
|
|
|
logging.debug("Ruthless and lenient parsing did not work. Returning raw html")
|
|
|
+ confidence = 0;
|
|
|
article = self.html.find('body') or self.html
|
|
|
|
|
|
cleaned_article = self.sanitize(article, candidates)
|
|
|
@@ -129,7 +142,7 @@ def summary(self): |
|
|
ruthless = False
|
|
|
continue # try again
|
|
|
else:
|
|
|
- return cleaned_article
|
|
|
+ return Summary(confidence, cleaned_article)
|
|
|
except StandardError, e:
|
|
|
#logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
|
|
|
logging.exception('error getting summary: ' )
|
|
|
|
0 comments on commit
cdd30f6