From 8a0612f8d6eab47c1bc40a253dd67cf14be4f1a2 Mon Sep 17 00:00:00 2001 From: Andrew Cantino Date: Sat, 20 Feb 2010 14:47:49 -0800 Subject: [PATCH] re-ported from version 1.5.0 of readability.js --- README | 4 + lib/readability.rb | 264 ++++++++++++++++++++-- lib/readability_old.rb | 74 ++++++ spec/fixtures/cant_read.html | 426 +++++++++++++++++++++++++++++++++++ spec/readability_spec.rb | 148 +++++++++++- 5 files changed, 891 insertions(+), 25 deletions(-) create mode 100644 lib/readability_old.rb create mode 100644 spec/fixtures/cant_read.html diff --git a/README b/README index 8a97645..7483a8e 100644 --- a/README +++ b/README @@ -1,5 +1,9 @@ +This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0 + This is a ruby port of arc90's readability project http://lab.arc90.com/experiments/readability/ Given a html document, it pulls out the main body text and cleans it up. + +Ruby port by starrhorne and iterationlabs diff --git a/lib/readability.rb b/lib/readability.rb index e137d9d..c3d4679 100644 --- a/lib/readability.rb +++ b/lib/readability.rb @@ -3,48 +3,264 @@ module Readability class Document + TEXT_LENGTH_THRESHOLD = 25 + RETRY_LENGTH = 250 + + attr_accessor :options, :html def initialize(input, options = {}) + @input = input @options = options - @html = Nokogiri::HTML(input, nil, 'UTF-8') + make_html + end + + def make_html + @html = Nokogiri::HTML(@input, nil, 'UTF-8') end + REGEXES = { + :unlikelyCandidatesRe => /combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor/i, + :okMaybeItsACandidateRe => /and|article|body|column|main/i, + :positiveRe => /article|body|content|entry|hentry|page|pagination|post|text/i, + :negativeRe => /combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget/i, + :divToPElementsRe => /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i, + :replaceBrsRe => /(]*>[ \n\r\t]*){2,}/i, + :replaceFontsRe => /<(\/?)font[^>]*>/i, + :trimRe => /^\s+|\s+$/, + :normalizeRe => /\s{2,}/, + :killBreaksRe => /((\s| ?)*){1,}/, + :videoRe => /http:\/\/(www\.)?(youtube|vimeo)\.com/i + } + + def content(remove_unlikely_candidates = true) + @html.css("script, style").each { |i| i.remove } + + remove_unlikely_candidates! if remove_unlikely_candidates + transform_misused_divs_into_paragraphs! + candidates = score_paragraphs(options[:min_text_length] || TEXT_LENGTH_THRESHOLD) + best_candidate = select_best_candidate(candidates) + article = get_article(candidates, best_candidate) + + cleaned_article = sanitize(article, candidates, options) + if remove_unlikely_candidates && article.text.strip.length < (options[:retry_length] || RETRY_LENGTH) + make_html + content(false) + else + cleaned_article + end + end + + def get_article(candidates, best_candidate) + # Now that we have the top candidate, look through its siblings for content that might also be related. + # Things like preambles, content split by ads that we removed, etc. + + sibling_score_threshold = [10, best_candidate[:content_score] * 0.2].max + output = Nokogiri::XML::Node.new('div', @html) + best_candidate[:elem].parent.children.each do |sibling| + append = false + append = true if sibling == best_candidate[:elem] + append = true if candidates[sibling] && candidates[sibling][:content_score] >= sibling_score_threshold + + if sibling.name.downcase == "p" + link_density = get_link_density(sibling) + node_content = sibling.text + node_length = node_content.length + + if node_length > 80 && link_density < 0.25 + append = true + elsif node_length < 80 && link_density == 0 && node_content =~ /\.( |$)/ + append = true + end + end + + if append + sibling.name = "div" unless %w[div p].include?(sibling.name.downcase) + output << sibling + end + end + + output + end + + def select_best_candidate(candidates) + sorted_candidates = candidates.values.sort { |a, b| b[:content_score] <=> a[:content_score] } + + debug("Top 5 canidates:") + sorted_candidates[0...5].each do |candidate| + debug("Candidate #{candidate[:elem].name}##{candidate[:elem][:id]}.#{candidate[:elem][:class]} with score #{candidate[:content_score]}") + end + + best_candidate = sorted_candidates.first || { :elem => @html.css("body").first, :content_score => 0 } + debug("Best candidate #{best_candidate[:elem].name}##{best_candidate[:elem][:id]}.#{best_candidate[:elem][:class]} with score #{best_candidate[:content_score]}") + + best_candidate + end + + def get_link_density(elem) + link_length = elem.css("a").map {|i| i.text}.join("").length + text_length = elem.text.length + link_length / text_length.to_f + end + + def score_paragraphs(min_text_length) + candidates = {} + @html.css("p,td").each do |elem| + parent_node = elem.parent + grand_parent_node = parent_node.respond_to?(:parent) ? parent_node.parent : nil + inner_text = elem.text + + # If this paragraph is less than 25 characters, don't even count it. + next if inner_text.length < min_text_length + + candidates[parent_node] ||= score_node(parent_node) + candidates[grand_parent_node] ||= score_node(grand_parent_node) if grand_parent_node - def content + content_score = 1 + content_score += inner_text.split(',').length + content_score += [(inner_text.length / 100).to_i, 3].min - # Get all parent elements containing a

tag - @parents = @html.css("p").map { |p| p.parent }.compact.uniq + candidates[parent_node][:content_score] += content_score + candidates[grand_parent_node][:content_score] += content_score / 2.0 if grand_parent_node + end - sanitize(@parents.map { |p| [p, score(p)] }.max { |a, b| a[1] <=> b[1] }[0]) + # Scale the final candidates score based on link density. Good content should have a + # relatively small link density (5% or less) and be mostly unaffected by this operation. + candidates.each do |elem, candidate| + candidate[:content_score] = candidate[:content_score] * (1 - get_link_density(elem)) + end + candidates end - def score(parent) - s = 0 + def class_weight(e) + weight = 0 + if e[:class] && e[:class] != "" + if e[:class] =~ REGEXES[:negativeRe] + weight -= 25 + end - # Adjust score based on parent's "class" attribute - s -= 50 if parent[:class] =~ /(comment|meta|footer|footnote)/i - s += 25 if parent[:class] =~ /((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\s|$))/i + if e[:class] =~ REGEXES[:positiveRe] + weight += 25 + end + end - # Adjust score based on parent id - s -= 50 if parent[:id] =~ /(comment|meta|footer|footnote)/i - s += 25 if parent[:id] =~ /^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/i + if e[:id] && e[:id] != "" + if e[:id] =~ REGEXES[:negativeRe] + weight -= 25 + end - # Adjust score based on # of

elements inside parent - s += parent.css("p").size + if e[:id] =~ REGEXES[:positiveRe] + weight += 25 + end + end - # Adjust score based on # of commas inside parent - s += parent.text.count "," + weight + end + + def score_node(elem) + content_score = class_weight(elem) + case elem.name.downcase + when "div": + content_score += 5 + when "blockquote": + content_score += 3 + when "form": + content_score -= 3 + when "th": + content_score -= 5 + end + { :content_score => content_score, :elem => elem } + end - s + def debug(str) + puts str if options[:debug] end - def sanitize(node) + def remove_unlikely_candidates! + @html.css("*").each do |elem| + str = "#{elem[:class]}#{elem[:id]}" + if str =~ REGEXES[:unlikelyCandidatesRe] && str !~ REGEXES[:okMaybeItsACandidateRe] && elem.name.downcase != 'body' + debug("Removing unlikely candidate - #{str}") + elem.remove + end + end + end - # Get rid of divs full of non-text items - node.css("div").each do |el| - counts = %w[p img li a embed].inject({}) { |m, kind| m[kind] = el.css(kind).length; m } - el.remove if (el.text.count(",") < 10) && (counts["p"] == 0 || counts["embed"] > 0 || counts["a"] > counts["p"] || counts["li"] > counts["p"] || counts["img"] > counts["p"]) + def transform_misused_divs_into_paragraphs! + @html.css("*").each do |elem| + if elem.name.downcase == "div" + # transform

s that do not contain other block elements into

s + if elem.inner_html !~ REGEXES[:divToPElementsRe] + debug("Altering div(##{elem[:id]}.#{elem[:class]}) to p"); + elem.name = "p" + end + else + # wrap text nodes in p tags +# elem.children.each do |child| +# if child.text? +## debug("wrapping text node with a p") +# child.swap("

#{child.text}

") +# end +# end + end + end + end + + def sanitize(node, candidates, options = {}) + node.css("h1, h2, h3, h4, h5, h6").each do |header| + header.remove if class_weight(header) < 0 || get_link_density(header) > 0.33 + end + + node.css("form, object, iframe, embed").each do |elem| + elem.remove + end + + # Conditionally clean s,
    s, and
    s + node.css("table, ul, div").each do |el| + weight = class_weight(el) + content_score = candidates[el] ? candidates[el][:content_score] : 0 + name = el.name.downcase + + if weight + content_score < 0 + el.remove + debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because score + content score was less than zero.") + elsif el.text.count(",") < 10 + counts = %w[p img li a embed input].inject({}) { |m, kind| m[kind] = el.css(kind).length; m } + counts["li"] -= 100 + + content_length = el.text.length + link_density = get_link_density(el) + to_remove = false + reason = "" + + if counts["img"] > counts["p"] + reason = "too many images" + to_remove = true + elsif counts["li"] > counts["p"] && name != "ul" && name != "ol" + reason = "more
  • s than

    s" + to_remove = true + elsif counts["input"] > (counts["p"] / 3).to_i + reason = "less than 3x

    s than s" + to_remove = true + elsif content_length < (options[:min_text_length] || TEXT_LENGTH_THRESHOLD) && (counts["img"] == 0 || counts["img"] > 2) + reason = "too short a content length without a single image" + to_remove = true + elsif weight < 25 && link_density > 0.2 + reason = "too many links for its weight (#{weight})" + to_remove = true + elsif weight >= 25 && link_density > 0.5 + reason = "too many links for its weight (#{weight})" + to_remove = true + elsif (counts["embed"] == 1 && content_length < 75) || counts["embed"] > 1 + reason = "s with too short a content length, or too many s" + to_remove = true + end + + if to_remove + debug("Conditionally cleaned #{name}##{el[:id]}.#{el[:class]} with weight #{weight} and content score #{content_score} because it has #{reason}.") + el.remove + end + end end # We'll sanitize all elements using a whitelist @@ -59,7 +275,7 @@ def sanitize(node) if whitelist[el.node_name] el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } - # Otherwise, replace the element with its contents + # Otherwise, replace the element with its contents else el.swap(el.text) end diff --git a/lib/readability_old.rb b/lib/readability_old.rb new file mode 100644 index 0000000..e137d9d --- /dev/null +++ b/lib/readability_old.rb @@ -0,0 +1,74 @@ +require 'rubygems' +require 'nokogiri' + +module Readability + class Document + + def initialize(input, options = {}) + @options = options + @html = Nokogiri::HTML(input, nil, 'UTF-8') + end + + + def content + + # Get all parent elements containing a

    tag + @parents = @html.css("p").map { |p| p.parent }.compact.uniq + + sanitize(@parents.map { |p| [p, score(p)] }.max { |a, b| a[1] <=> b[1] }[0]) + + end + + def score(parent) + s = 0 + + # Adjust score based on parent's "class" attribute + s -= 50 if parent[:class] =~ /(comment|meta|footer|footnote)/i + s += 25 if parent[:class] =~ /((^|\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\s|$))/i + + # Adjust score based on parent id + s -= 50 if parent[:id] =~ /(comment|meta|footer|footnote)/i + s += 25 if parent[:id] =~ /^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/i + + # Adjust score based on # of

    elements inside parent + s += parent.css("p").size + + # Adjust score based on # of commas inside parent + s += parent.text.count "," + + s + end + + def sanitize(node) + + # Get rid of divs full of non-text items + node.css("div").each do |el| + counts = %w[p img li a embed].inject({}) { |m, kind| m[kind] = el.css(kind).length; m } + el.remove if (el.text.count(",") < 10) && (counts["p"] == 0 || counts["embed"] > 0 || counts["a"] > counts["p"] || counts["li"] > counts["p"] || counts["img"] > counts["p"]) + end + + # We'll sanitize all elements using a whitelist + whitelist = @options[:tags] || %w[div p] + + # Use a hash for speed (don't want to make a million calls to include?) + whitelist = Hash[ whitelist.zip([true] * whitelist.size) ] + + ([node] + node.css("*")).each do |el| + + # If element is in whitelist, delete all its attributes + if whitelist[el.node_name] + el.attributes.each { |a, x| el.delete(a) unless @options[:attributes] && @options[:attributes].include?(a.to_s) } + + # Otherwise, replace the element with its contents + else + el.swap(el.text) + end + + end + + # Get rid of duplicate whitespace + node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/ /, " ") + end + + end +end diff --git a/spec/fixtures/cant_read.html b/spec/fixtures/cant_read.html new file mode 100644 index 0000000..3f4075d --- /dev/null +++ b/spec/fixtures/cant_read.html @@ -0,0 +1,426 @@ + + + + + + + +BERKELEY BREATHED - Vice Magazine + + + + + + + + + + + + + + + + + + + + + + + + + +

    + + + + + + + + + + + + + + +
    + +
    +
    + +
    +
    +
    + + + + +
    + + + + + + +
+ + + + + + + +
+ + +
+ +

NEWSLETTER

+

+

DOS & DON'TS

+ + + + +
+ +He likes to hide behind a mask of anonymity, but we have it on good faith that this right here is the man behind painfully boringaustrian laptopmusicguy. blogspot.com. Comments/Enlarge | +See all + +


+ + + + +
+ +The winter hat indoors is only a little worse than sunglasses but BAPE? When did models start dressing like suburban wiggers who use Wii nunchucks and say “Get crunked”?
+ + + + Comments/Enlarge | +See all + +


+ + + + +
+ + + + + + +

+

ALSO BY JESSE PEARSON

+ + + + +
HELLO, WHITE PEOPLE!
Prussian Blue Look to the Future
THE ANTI-GOTH
Quix*o*tic Make Gloom Lovely Again
THE VICE GUIDE TO TURKISH...
Turkey has had a pretty good track record...
SID SINGS
And the Commodore 64 Massive Represent

+
See all articles by this contributor


+ + + + +
+ + + + +
+
+ + +
+
+
+ +
+
+
+ + + + + + +
+ + + +
+ +
+
+ + + +

Published December, 2009

BERKELEY BREATHED


INTERVIEW BY JESSE PEARSON, PHOTO BY JODY BOYMAN
+


For those of us who grew up as weird kids in the 1980s, the work of Berkeley Breathed was as important as those twin eternal pillars of weird-kid-dom: Monty Python and Mad magazine. In a word: seminal. In two words: fucking seminal.
+
+ Breathed’s comic strip
Bloom County ran from 1980 until 1988. It crossed the goofiness of talking penguins and drug addict cats with the topicality of stuff like nuclear anxiety and the evils of consumerism. (Remember when Opus would compulsively buy Ronco products because of infomercials?) It even won a Pulitzer Prize in 1987, though at the time we were more impressed with the Billy & the Boingers flexi-disc than some boring grown-up award.
+

A five-volume compilation of every Bloom County strip is being released now, and it’s a trip to go back and reread all the stuff that we practically had memorized over 20 years ago. It’s still as smart and hilarious as we remember it being, though it’s amazing to realize how much of the satire went over our heads. Did 12-year-old us even know who Bella Abzug was? (Actually we kinda still don’t know who that was.)

After Bloom County, Breathed did two more strips, Outland and Opus, and wrote a bunch of acclaimed children’s books, the most recent of which is called Flawed Dogs: The Shocking Raid on Westminster. It’s about a dog who goes through a series of pretty severe trials and tribulations and it made us cry. Breathed is an outspoken animal-rights activist and that often comes through in his work, especially in this novel, with its nightmarish descriptions of dogfighting and animal testing. But it’s not all glum, chum. The humor in it is as distinctive as ever and feels to us like the voice of an old pal—or maybe it’s the ghost of our childhoods. Scary!

+Anyway, Bill the Cat for president! Ack!

Vice: I’m curious as to how the writing process for Bloom County worked. Did you always know where you were heading, or was there an element of discovery as you wrote? Did you think in terms of seasons, sort of like television writing?
+ Berkeley Breathed:
Your question presumes a reality so distant from the experience that any questions about process are meaningless—but perfectly reasonable. The problem is that you’re asking a guy who didn’t think of any individual strip or story line longer than it takes to read this sentence. I drew in a manic, sweat-flinging state of deadline panic EVERY week. Not most weeks. EVERY week. For ten years. I drew what occurred to me as I stared at the same blank strips I’d been watching for six days, and only because the plane that would deliver them to my syndicate editor was due to take off at 5:30 AM, about seven hours from that moment.

Ouch.
+
This is not how a comic strip should be drawn. This is not how ANY deadline should be handled by any reasonable, conscientious, grown-up professional. But as I wasn’t, they weren’t. The flip side of that confessional coin is that Bloom County would not have been what it was—whatever it was—if I’d been that thing I just described. It was art and writing born of chaos. It was the poison the madness needed. The new book—with all the chaos intact and not edited out, as it was in books before—shows that rather intriguingly.

Can you talk a little about how you developed the looks for the main characters? What were some of the inspirations in terms of the art of the Bloom County universe—not just for the characters but also for the settings?
+
As many have read and few have doubted, Doonesbury was the stylistic key that all of us turned to in those days—college cartoonists, I mean. Jules Feiffer played a similar role for Garry Trudeau. I doubt Garry would have left the word balloons behind if it hadn’t been for Jules. I virtually didn’t have any other artistic influences, as I wasn’t familiar with other comic strips. I’m still not today. They simply were never in my sights.

Why not?
+
Comic strips didn’t tell stories well, as slow and chopped as they are. And stories—narrative, plot, character—are what still make me sweaty with creative passion.

You want to know why Bloom County was set in a rural, small-town environment? To Kill a Mockingbird. Maycomb, Alabama, was where I naturally dropped all of my imagination when it needed a setting. A therapist might help explain why, but there it is. I will say that Opus is really Scout from Mockingbird in many ways. He’s a motherless innocent, adrift and wandering about in an adult world of confusion, betrayal, and incivility. We experience it through both their eyes.

+But don’t think for a second that this occurred to me when I sketched a penguin for a throwaway gag in 1982. I show it in the new collection: Opus was meant to be dispensed with after his initial appearance. Go figger.

And when was the last time you met a penguin in real life?
+
I walked with them in Antarctica in the early 80s. Swam with them in the Galapagos in 1989. I sensed—and I might be projecting here—that they knew who I was.

Were you more of a Milo or a Binkley when you were a kid? I definitely felt very Binkley most of the time.
+
Absolutely split the difference. I think most cartoonists with an ensemble set of characters split their personality up in contrasting elements and then apply it to their characters. Not always, but to a large extent you can’t help it.

+So I was filled with self-doubt as Binkley was. But the little scheming media manipulator of Milo? Well, guilty.

As a ten-year-old kid reading your comics, a lot of the political humor went right over my head. I remember having to ask a grown-up who Jeane Kirkpatrick was because she kept popping up in Bloom County. It’s interesting that a comic could encompass a range of characters and references that has Jesse Helms on one side and the Giant Purple Snorklewacker on the other.
+
I drew what seemed amusing to me. That was the extent of my thoughtfulness when it came to designing the Bloom County world. As with most cartoonists, a comic strip is an unsavory peek into the head of its maker. Having said that, I have no inkling as to the inside of Jim Davis’s head from a reading of Garfield. It was the classic corporate invention—drawn by a staff—which made it fun to skewer. It was there to sell shit.

Speaking of that, did you ever hear about any reaction from Jim Davis regarding your statement that Bill the Cat started as a parody of Garfield?
+
Trust me, Davis could care less about being mocked. It wasn’t respect that he worked hard for.

I think that a lot of kids in the 80s sort of started with Garfield when they were really young and then graduated to Bloom County. Do you have many memories of encounters with fans of Bloom County?
+
In the heyday, I would do signings at comic-book stores, which I’d never seen before—nor the fans of such. It was a bit of a shocker. This is pre-Comic Con. I was stunned because I could never have been in one of those crowds myself. It wasn’t in my DNA. So I had to adapt to a fan base of people that I had yet to understand. I simply didn’t come from their world.

The influence I was having on the younger kids was rather sobering. Anyone who produces stories and popular art remembers when they suddenly realized that there were actual faces to the readers of one’s work and that they, in many cases, took it far more seriously than I did. I remember hearing about Harrison Ford out and out dismissing his movies’ fans as being nut jobs. He’s in the wrong business. You can sense it in his performances now. He’d rather be drunk and somewhere else. A pity. We who are lucky enough to provoke the imaginations of the public owe it to ourselves and to them to embrace the whole enchilada. It took me some years to appreciate this.

Have you ever had a crazy fan?
+
Yes indeedy, I’ve had crazy fans. Rabid. Committable. Bloom County seemed to attract mental cases like flies to horseshit. One poor adult woman kept sending me hours of videotapes of herself talking to me, but calling me by a different name. Her family finally contacted me and apologized after she stripped in one of them. It’s both sad and deeply scary that in these days, folks can find your address in two clicks. It wasn’t like this before. We live behind gates now.







See all articles by this contributor

+ + + + + + + +
+< PREV + + + + + +
+
+ +
+

Comments

Anonymous, on Feb 1, 2010 wrote:
Someone needs a dandelion break
Anonymous, on Jan 8, 2010 wrote:
I’m not camping on his lawn anytime soon, but would consider letting Berkely sleep on my lawn if was eve in town for the Kentucky Derby. And as much as Bloom County means to me, his animal rights stuff (starting with the original "Flawed Dogs" book and even the Greenpeace ’toons) is what’s getting him into heaven.
Anonymous, on Dec 26, 2009 wrote:
comics suck.
Anonymous, on Dec 24, 2009 wrote:
I disagree, I think he chose the right medium. His stuff was cool, and is still regarded as such 30 years after it was first published.
Anonymous, on Dec 24, 2009 wrote:
Dear 12-year-old budding artist,
+
+Piss off.
+ +
lukehavergal, on Dec 24, 2009 wrote:
curtis is the radiohead of comics.
+
+
+noirfair.wordpress.com
Anonymous, on Dec 24, 2009 wrote:
Trudeau owns you.
Anonymous, on Dec 24, 2009 wrote:
ahhh....now I realize why even as a 12-year-old budding artist I thought his strips sucked, graphically speaking. my folks were fans but I could never really engage due to the general crappiness and last-minute shoddiness of the art.
+
+Given that visual appeal is - let’s say conservatively - half the battle of selling comic art, pooping out on that aspect of a serial publication ( evinced by his comments re deadlines) seems like a clear indication that one might have picked the wrong medium.
+
+As contemporaries like Bill Griffith or Crumb were producing drug-hazed brilliance, this guy was making feeble attempts to join words and visuals fluidly in a commercial art context like a soggy reverse image of Hunter S. Thompson without the brilliance of Steadman to back him up.
+
+Give me some allegorical-yet-competently-drawn pablum like Walt Kelly’s Pogo, Lil’ Abner ( yeah, that shit was seminal and topical) or even Calvin and Hobbes, and keep the praise for your pitbulls, penguin guy.
Anonymous, on Dec 21, 2009 wrote:
what the fuck is wrong with you people? THIS is the longest interview you’ve read in a long time? what is it like 2,000 words? go read the david simon interview if you’ve got a year or two to kill. christ.
Anonymous, on Dec 18, 2009 wrote:
i bought "billy and the boingers" for my dad, for father’s day. he never read it and i stole it back when i was 14. flexi disc intact
Anonymous, on Dec 17, 2009 wrote:
That was the shit. I wish Bill the Cat was real.
Anonymous, on Dec 16, 2009 wrote:
long interview but really interesting! i’m a big berkeley fan!
+ +
komodo, on Dec 14, 2009 wrote:
fuck garfield. breathed for the win!
Anonymous, on Dec 14, 2009 wrote:
Longest interview I’ve read entirely in a long time. Nice work!

+
+ + + +
+
+
+POST A COMMENT [SIGN IN]
Hi, in case you haven't heard, you can now sign up to become a "member" of Viceland.com, which entitles you to all sorts of amazing benefits like pictures and a nickname. Click here to make your own profile. You can still comment if you don't, but you gotta do it all 'nonymously.

+ + + + + + + + + +
Name:
Comment:
+
+
+
+
+
+
+
+ +

+ + + + + + + + + + + + +Web Analytics + + + + + + + + \ No newline at end of file diff --git a/spec/readability_spec.rb b/spec/readability_spec.rb index 0fdce0e..3204bac 100644 --- a/spec/readability_spec.rb +++ b/spec/readability_spec.rb @@ -1,4 +1,150 @@ require File.expand_path(File.join(File.dirname(__FILE__), "spec_helper")) describe Readability do -end \ No newline at end of file + before do + @simple_html_fixture = <<-HTML + + + title! + + +
+

a comment

+
real content
+
something in a table
+
+ + + HTML + end + + describe "transformMisusedDivsIntoParagraphs" do + before do + @doc = Readability::Document.new(@simple_html_fixture) + @doc.transform_misused_divs_into_paragraphs! + end + + it "should transform divs containing no block elements into

s" do + @doc.html.css("#body").first.name.should == "p" + end + + it "should not transform divs that contain block elements" do + @doc.html.css("#contains_blockquote").first.name.should == "div" + end + end + + describe "score_node" do + before do + @doc = Readability::Document.new(<<-HTML) + + +

+

some content

+
+ +

some other content

+ + + + HTML + @elem1 = @doc.html.css("#elem1").first + @elem2 = @doc.html.css("#elem2").first + end + + it "should like
s more than s" do + @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score] + end + + it "should like classes like text more than classes like comment" do + @elem2.name = "div" + @doc.score_node(@elem1)[:content_score].should == @doc.score_node(@elem2)[:content_score] + @elem1['class'] = "text" + @elem2['class'] = "comment" + @doc.score_node(@elem1)[:content_score].should > @doc.score_node(@elem2)[:content_score] + end + end + + describe "removeUnlikelyCandidates" do + before do + @doc = Readability::Document.new(@simple_html_fixture) + @doc.remove_unlikely_candidates! + end + + it "should remove things that have class comment" do + @doc.html.inner_html.should_not =~ /a comment/ + end + + it "should not remove body tags" do + @doc.html.inner_html.should =~ /<\/body>/ + end + + it "should not remove things with class comment and id body" do + @doc.html.inner_html.should =~ /real content/ + end + end + + describe "score_paragraphs" do + before(:each) do + @doc = Readability::Document.new(<<-HTML) + + + title! + + +
+
a comment

+
+

some text

+
+
+

some more text

+
+ + + HTML + @candidates = @doc.score_paragraphs(0) + end + + it "should score elements in the document" do + @candidates.values.length.should == 3 + end + + it "should prefer the body in this particular example" do + @candidates.values.sort { |a, b| + b[:content_score] <=> a[:content_score] + }.first[:elem][:id].should == "body" + end + end + + describe "the cant_read.html fixture" do + it "should work on the cant_read.html fixture with some allowed tags" do + allowed_tags = %w[div span table tr td p i strong u h1 h2 h3 h4 pre code br a] + allowed_attributes = %w[href] + html = File.read(File.dirname(__FILE__) + "/fixtures/cant_read.html") + Readability::Document.new(html, :tags => allowed_tags, :attributes => allowed_attributes).content.should match(/Can you talk a little about how you developed the looks for the/) + end + end + + describe "general functionality" do + before do + @doc = Readability::Document.new("title!

Some content

", + :min_text_length => 0, :retry_length => 1) + end + + it "should return the main page content" do + @doc.content.should match("Some content") + end + end + + describe "ignoring sidebars" do + before do + @doc = Readability::Document.new("title!

Some content

", + :min_text_length => 0, :retry_length => 1) + end + + it "should not return the sidebar" do + @doc.content.should_not match("sidebar") + end + end +end