From 7992fe94e2002b870754f0aa3e1aa598a50efb83 Mon Sep 17 00:00:00 2001 From: Yuji Nakayama Date: Wed, 11 Mar 2015 23:39:57 +0900 Subject: [PATCH] Introduce SummaryProcessor --- CHANGELOG.md | 3 + README.md | 28 ++- lib/qiita/markdown.rb | 3 + lib/qiita/markdown/filters/simplify.rb | 47 ++++ lib/qiita/markdown/filters/truncate.rb | 81 ++++++ lib/qiita/markdown/summary_processor.rb | 23 ++ spec/qiita/markdown/summary_processor_spec.rb | 235 ++++++++++++++++++ 7 files changed, 419 insertions(+), 1 deletion(-) create mode 100644 lib/qiita/markdown/filters/simplify.rb create mode 100644 lib/qiita/markdown/filters/truncate.rb create mode 100644 lib/qiita/markdown/summary_processor.rb create mode 100644 spec/qiita/markdown/summary_processor_spec.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c5a1d8..c452677 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +## Unreleased +- Introduce another processor Qiita::Markdown::SummaryProcessor, which is for rendering a summary of a markdown document. + ## 0.2.2 - Fix a bug that raised error on rendering `` tag with href for unknown fragment inside of `` tag (e.g. `Link`) diff --git a/README.md b/README.md index bc520bd..bcf3f0f 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Qiita-specified markdown processor. * Task list * Footnotes -## Usage +## Basic Usage Qiita::Markdown::Processor provides markdown rendering logic. ```ruby @@ -62,3 +62,29 @@ processor.call(text) processor = Qiita::Markdown::Processor.new(asset_root: "http://example.com/assets") processor.call(text) ``` + +## Rendering Summary +There's another processor Qiita::Markdown::SummaryProcessor, +which is for rendering a summary of a markdown document. +It simplifies a document by removing complex markups +and also truncates it to a specific length without breaking the document structure. + +Note that this processor does not produce the `:codes` output in contrast to the Processor.
+ +### Context + +SummaryProcessor accepts the following context in addition to the Processor's context: + +```ruby +{ + truncate: { + length: 100, # A document will be truncated if it exceeds this character count. (Integer) + omission: '…' # A string added to the end of the document when it's truncated. (String, nil) + } +} +``` + +```ruby +processor = Qiita::Markdown::SummaryProcessor.new(truncate: { length: 80 }) +processor.call(text) +``` diff --git a/lib/qiita/markdown.rb b/lib/qiita/markdown.rb index b3c5b56..45186e0 100644 --- a/lib/qiita/markdown.rb +++ b/lib/qiita/markdown.rb @@ -13,7 +13,10 @@ require "qiita/markdown/filters/mention" require "qiita/markdown/filters/redcarpet" require "qiita/markdown/filters/sanitize" +require "qiita/markdown/filters/simplify" require "qiita/markdown/filters/syntax_highlight" require "qiita/markdown/filters/toc" +require "qiita/markdown/filters/truncate" require "qiita/markdown/processor" +require "qiita/markdown/summary_processor" require "qiita/markdown/version" diff --git a/lib/qiita/markdown/filters/simplify.rb b/lib/qiita/markdown/filters/simplify.rb new file mode 100644 index 0000000..ea9c262 --- /dev/null +++ b/lib/qiita/markdown/filters/simplify.rb @@ -0,0 +1,47 @@ +module Qiita + module Markdown + module Filters + # A filter for simplifying document structure by removing complex markups + # (mainly block elements) and complex contents. + # + # The logic of this filter is similar to the `Sanitize` filter, but this + # does not use the `sanitize` gem internally for the following reasons: + # + # * Each filter should do only its own responsibility, and this filter is + # _not_ for sanitization. + # + # * The `sanitize` gem automatically adds extra transformers even if we + # want to clean up only some elements, and they would be run in the + # `Sanitize` filter later.
+      #   https://github.com/rgrove/sanitize/blob/v3.1.2/lib/sanitize.rb#L77-L100
+      class Simplify < HTML::Pipeline::Filter
+        SIMPLE_ELEMENTS = %w(a b code em i ins q s samp span strike strong sub sup var)
+
+        COMPLEX_CONTENT_ELEMENTS = %w(table)
+
+        def call
+          remove_complex_contents
+          clean_complex_markups
+          doc
+        end
+
+        private
+
+        # Remove complex elements along with their contents entirely.
+        def remove_complex_contents
+          selector = COMPLEX_CONTENT_ELEMENTS.join(',')
+          doc.search(selector).each(&:remove)
+        end
+
+        # Remove complex markups while keeping their contents.
+        def clean_complex_markups
+          doc.traverse do |node|
+            next unless node.element?
+            next if SIMPLE_ELEMENTS.include?(node.name)
+            node.replace(node.children)
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/lib/qiita/markdown/filters/truncate.rb b/lib/qiita/markdown/filters/truncate.rb
new file mode 100644
index 0000000..1ac274a
--- /dev/null
+++ b/lib/qiita/markdown/filters/truncate.rb
@@ -0,0 +1,99 @@
+module Qiita
+  module Markdown
+    module Filters
+      # A filter for truncating a document without breaking the document
+      # structure.
+      #
+      # You can pass `:length` and `:omission` options to the :truncate context.
+      #
+      # @example
+      #   Truncate.new(doc, truncate: { length: 50, omission: '... (continued)' })
+      class Truncate < HTML::Pipeline::Filter
+        DEFAULT_OPTIONS = {
+          length: 100,
+          omission: '…'.freeze
+        }.freeze
+
+        # Walk the document, counting the characters of its text nodes, and
+        # remove every node that is visited after the length limit has been
+        # exceeded.
+        #
+        # @return the truncated `doc`
+        def call
+          @current_length = 0
+          @previous_char_was_blank = false
+
+          traverse(doc) do |node|
+            if exceeded?
+              node.remove
+            elsif node.text?
+              process_text_node(node)
+            end
+          end
+
+          doc
+        end
+
+        private
+
+        # Traverse the given node recursively in the depth-first order.
+        # Note that we cannot use Nokogiri::XML::Node#traverse
+        # since it traverses the node's descendants _before_ the node itself.
+        # https://github.com/sparklemotion/nokogiri/blob/v1.6.6.2/lib/nokogiri/xml/node.rb#L571-L574
+        def traverse(node, &block)
+          block.call(node)
+
+          node.children.each do |child_node|
+            traverse(child_node, &block)
+          end
+        end
+
+        # Whether more characters than `max_length` have been counted so far.
+        def exceeded?
+          @current_length > max_length
+        end
+
+        # Count the characters of the given text node and, once the limit is
+        # exceeded, cut the node's content and append the omission. A run of
+        # consecutive blank characters counts as one character, even when it
+        # continues across text nodes.
+        def process_text_node(node)
+          node.content.each_char.with_index do |char, index|
+            current_char_is_blank = char.strip.empty?
+
+            if !@previous_char_was_blank || !current_char_is_blank
+              @current_length += 1
+            end
+
+            @previous_char_was_blank = current_char_is_blank
+
+            if exceeded?
+              # Leave room for the omission, but never let the end index go
+              # negative: a negative end makes String#slice count from the end
+              # of the string, which kept most of the node whenever the limit
+              # was hit within its first `omission.size` characters.
+              kept_size = [index - omission.size, 0].max
+              node.content = node.content.slice(0...kept_size) + omission
+              break
+            end
+          end
+        end
+
+        # Maximum character count, from the `:length` option.
+        def max_length
+          options[:length]
+        end
+
+        # Text appended on truncation; empty when `:omission` is nil.
+        def omission
+          options[:omission] || ''.freeze
+        end
+
+        # The `:truncate` context merged over DEFAULT_OPTIONS.
+        def options
+          @options ||= DEFAULT_OPTIONS.merge(context[:truncate] || {})
+        end
+      end
+    end
+  end
+end
diff --git a/lib/qiita/markdown/summary_processor.rb b/lib/qiita/markdown/summary_processor.rb
new file mode 100644
index 0000000..db68c68
--- /dev/null
+++ b/lib/qiita/markdown/summary_processor.rb
@@ -0,0 +1,23 @@
+module Qiita
+  module Markdown
+    # A processor for rendering a summary of markdown document. This simplifies
+    # a document by removing complex markups and also truncates it to a
+    # specific length without breaking the document structure.
+    class SummaryProcessor < Processor
+      DEFAULT_FILTERS = [
+        Filters::Redcarpet,
+        Filters::Simplify,
+        HTML::Pipeline::EmojiFilter,
+        Filters::Mention,
+        Filters::Sanitize,
+        Filters::Truncate
+      ]
+
+      # @note Modify filters if you want.
+      # @return [Array] a per-instance copy; edits never touch DEFAULT_FILTERS
+      def filters
+        @filters ||= DEFAULT_FILTERS.dup
+      end
+    end
+  end
+end
diff --git a/spec/qiita/markdown/summary_processor_spec.rb b/spec/qiita/markdown/summary_processor_spec.rb
new file mode 100644
index 0000000..403fc76
--- /dev/null
+++ b/spec/qiita/markdown/summary_processor_spec.rb
@@ -0,0 +1,235 @@
+require 'active_support/core_ext/string/strip' + +describe Qiita::Markdown::SummaryProcessor do + describe '#call' do + subject(:html) do + result[:output].to_s + end + + let(:context) do + {} + end + + let(:markdown) do + fail NotImplementedError + end + + let(:result) do + described_class.new(context).call(markdown) + end + + context 'with valid condition' do + let(:markdown) do + <<-EOS.strip_heredoc + example + EOS + end + + it 'returns a Hash with HTML output and other metadata but no codes' do + expect(result[:mentioned_usernames]).to be_an Array + expect(result[:output]).to be_a Nokogiri::HTML::DocumentFragment + expect(result).not_to have_key(:codes) + end + end + + context 'with HTML-characters' do + let(:markdown) do + '<>&' + end + + it 'sanitizes them' do + should eq <<-EOS.strip_heredoc + <>& + EOS + end + end + + context 'with code' do + let(:markdown) do + <<-EOS.strip_heredoc + ```ruby + puts 'hello world' + ``` + EOS + end + + it 'returns simple code element' do + should eq <<-EOS.strip_heredoc + puts 'hello world' + + EOS + end + end + + context 'with emoji' do + let(:markdown) do + ':+1:' + end + + it 'replaces it with img element' do + should include('img') + end + end + + context 'with image' do + let(:markdown) do + <<-EOS.strip_heredoc + ![Qiita](http://qiita.com/icons/favicons/public/apple-touch-icon.png) + EOS + end + + it 'removes it' do + expect(html.strip).to be_empty + end + end + + context 'with line breaks' do + let(:markdown) do + <<-EOS.strip_heredoc + foo + bar + EOS + end + + it 'removes them' do + should eq <<-EOS.strip_heredoc + foo + bar + EOS + end + end + + context 'with paragraphs' do + let(:markdown) do +
<<-EOS.strip_heredoc + Lorem ipsum dolor sit amet. + + Consectetur adipisicing elit. + EOS + end + + it 'flattens them' do + should eq <<-EOS.strip_heredoc + Lorem ipsum dolor sit amet. + + Consectetur adipisicing elit. + EOS + end + end + + context 'with normal list items' do + let(:markdown) do + <<-EOS.strip_heredoc + - foo + - bar + EOS + end + + it 'flattens them' do + should eq <<-EOS.strip_heredoc + + foo + bar + + EOS + end + end + + context 'with task list items' do + let(:markdown) do + <<-EOS.strip_heredoc + - [ ] foo + - [x] bar + EOS + end + + it 'flattens them without converting to checkboxes' do + should eq <<-EOS.strip_heredoc + + [ ] foo + [x] bar + + EOS + end + end + + context 'with table' do + let(:markdown) do + <<-EOS.strip_heredoc + | a | b | c | + |---|---|---| + | a | b | c | + EOS + end + + it 'removes it entirely' do + expect(html.strip).to be_empty + end + end + + context 'with a simple long document' do + before do + context[:truncate] = { length: 10 } + end + + let(:markdown) do + <<-EOS.strip_heredoc + Lorem ipsum dolor sit amet. 
+ EOS + end + + it 'truncates it to the specified length' do + should eq 'Lorem ips…' + end + end + + context 'with a long document consisting of nested elements' do + before do + context[:truncate] = { length: 10 } + end + + let(:markdown) do + <<-EOS.strip_heredoc + _[Qiita](http://qiita.com/) is **a technical knowledge sharing and collaboration platform for programmers**._ + EOS + end + + it 'truncates it while honoring the document structure' do + should eq 'Qiita is ' + end + end + + context 'with a long document including consecutive whitespaces' do + before do + context[:truncate] = { length: 10 } + end + + let(:markdown) do + <<-EOS.strip_heredoc + **12** 4 [ 6](http://qiita.com/)_7 + 9_ 123 + EOS + end + + it 'truncates it while counting the consecutive whilespaces as one' do + should eq "12 4 67\n9…" + end + end + + context 'with truncate: { omission: nil } context' do + before do + context[:truncate] = { length: 10, omission: nil } + end + + let(:markdown) do + <<-EOS.strip_heredoc + Lorem ipsum dolor sit amet. + EOS + end + + it 'does not add extra omission text' do + should eq 'Lorem ipsu' + end + end + end +end