Skip to content
Browse files

fetcher, builder: filtering code moved to pre- and postprocessing modules
  • Loading branch information...
1 parent 35b6a42 commit 807960c06cd5dc24f56574403cd378bf74574a86 @invisiblellama committed Jul 17, 2009
View
2 README.rdoc
@@ -116,8 +116,6 @@ Parser options:
-m, --meta NAME:VALUE Set publication information metadata NAME to VALUE.
Valid metadata names are: [creator date description
language publisher relation rights subject title]
- -F, --no-fixup Do not attempt to make document meet XHTML 1.0 Strict.
- Default is to try and fix things that are broken.
-e, --encoding NAME Set source document encoding. Default is to autodetect.
Post-processing options:
View
4 lib/repub/app.rb
@@ -6,8 +6,8 @@
require 'repub/app/options'
require 'repub/app/profile'
require 'repub/app/filter'
-require 'repub/app/prefilters'
-require 'repub/app/postfilters'
+require 'repub/app/pre_filters'
+require 'repub/app/post_filters'
require 'repub/app/fetcher'
require 'repub/app/parser'
require 'repub/app/builder'
View
238 lib/repub/app/builder.rb
@@ -81,15 +81,15 @@ def build(parser)
def copy_and_process_assets
# Copy html
- @parser.cache.assets[:documents].each do |doc|
- log.debug "-- Processing document #{doc}"
+ @parser.cache.assets[:documents].each do |file|
+ log.debug "-- Processing document #{file}"
# Copy asset from cache
- FileUtils.cp(File.join(@parser.cache.path, doc), '.')
+ FileUtils.cp(File.join(@parser.cache.path, file), '.')
# Do post-processing
- postprocess_file(doc)
- postprocess_doc(doc)
- @opf << doc
- @document_path = File.expand_path(doc)
+ apply_file_filters(file)
+ apply_document_filters(file)
+ @opf << file
+ @document_path = File.expand_path(file)
end
# Copy css
@@ -121,117 +121,129 @@ def copy_and_process_assets
@opf << file
end if @options[:add]
end
-
- def postprocess_file(asset)
- source = IO.read(asset)
-
- # Do rx substitutions
- @options[:rx].each do |rx|
- rx.strip!
- delimiter = rx[0, 1]
- rx = rx.gsub(/\\#{delimiter}/, "\n")
- ra = rx.split(/#{delimiter}/).reject {|e| e.empty? }.each {|e| e.gsub!(/\n/, "#{delimiter}")}
- raise ParserException, "Invalid regular expression" if ra.empty? || ra[0].nil? || ra.size > 2
- pattern = ra[0]
- replacement = ra[1] || ''
- log.info "Replacing pattern /#{pattern.gsub(/#{delimiter}/, "\\#{delimiter}")}/ with \"#{replacement}\""
- source.gsub!(Regexp.new(pattern), replacement)
- end if @options[:rx]
- # Remove xml preamble if any
- preamble_rx = /^\s*<\?xml\s+[^>]+>\s*/mi
- if source =~ preamble_rx
- log.debug "-- Removing xml preamble"
- source.sub!(preamble_rx, '')
- end
-
- # Replace doctype
- doctype_rx = /^\s*<!DOCTYPE\s+[^>]+>\s*/mi
- if source =~ doctype_rx
- source.sub!(doctype_rx, '')
- end
- log.debug "-- Replacing doctype"
- source = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + source
-
- # Save processed file
- File.open(asset, 'w') do |f|
- f.write(source)
- end
+ def apply_file_filters(file)
+ s = PostFilters::FileFilters.apply_filters(IO.read(file), @options)
+ File.open(file, 'w') { |f| f.write(s) }
end
- def postprocess_doc(asset)
- doc = Nokogiri::HTML.parse(IO.read(asset), nil, 'UTF-8')
-
- # Set Content-Type charset to UTF-8
- doc.xpath('//head/meta[@http-equiv="Content-Type"]').each do |el|
- el['content'] = 'text/html; charset=utf-8'
- end
-
- # Process styles
- if @options[:css] && !@options[:css].empty?
- # Remove all stylesheet links
- doc.xpath('//head/link[@rel="stylesheet"]').remove
- if @options[:css] == '-'
- # Also remove all inline styles
- doc.xpath('//head/style').remove
- log.info "Removing all stylesheet links and style elements"
- else
- # Add custom stylesheet link
- link = Nokogiri::XML::Node.new('link', doc)
- link['rel'] = 'stylesheet'
- link['type'] = 'text/css'
- link['href'] = File.basename(@options[:css])
- # Add as the last child so it has precedence over (possible) inline styles before
- doc.at('//head').add_child(link)
- log.info "Replacing CSS refs with \"#{link['href']}\""
- end
- end
-
- # Insert elements after/before selector
- @options[:after].each do |e|
- selector = e.keys.first
- fragment = e[selector]
- element = doc.xpath(selector).first
- if element
- log.info "Inserting fragment \"#{fragment.to_html}\" after \"#{selector}\""
- fragment.children.to_a.reverse.each {|node| element.add_next_sibling(node) }
- end
- end if @options[:after]
- @options[:before].each do |e|
- selector = e.keys.first
- fragment = e[selector]
- element = doc.xpath(selector).first
- if element
- log.info "Inserting fragment \"#{fragment}\" before \"#{selector}\""
- fragment.children.to_a.each {|node| element.add_previous_sibling(node) }
- end
- end if @options[:before]
-
- # Remove elements
- @options[:remove].each do |selector|
- log.info "Removing elements \"#{selector}\""
- doc.search(selector).remove
- end if @options[:remove]
-
- # XXX
- # doc.xpath('//body/a').each do |a|
- # wrapper = Nokogiri::XML::Node.new('p', doc)
- # a.add_next_sibling(wrapper)
- # wrapper << a
- # end
-
- # Save processed doc
- File.open(asset, 'w') do |f|
- if @options[:fixup] || true
- # HACK: Nokogiri seems to ignore the fact that xmlns and other attrs aleady present
- # in html node and adds them anyway. Just remove them here to avoid duplicates.
- doc.root.attributes.each {|name, value| doc.root.remove_attribute(name) }
- doc.write_xhtml_to(f, :encoding => 'UTF-8')
- else
- doc.write_html_to(f, :encoding => 'UTF-8')
- end
+ def apply_document_filters(file)
+ doc = Nokogiri::HTML.parse(IO.read(file), nil, 'UTF-8')
+ doc = PostFilters::DocumentFilters.apply_filters(doc, @options)
+ File.open(file, 'w') do |f|
+ # HACK: Nokogiri seems to ignore the fact that xmlns and other attrs are already present
+ # in the html node and adds them anyway. Just remove them here to avoid duplicates.
+ doc.root.attributes.each {|name, value| doc.root.remove_attribute(name) }
+ doc.write_xhtml_to(f, :encoding => 'UTF-8')
end
end
+
+ # def postprocess_file(asset)
+ # source = IO.read(asset)
+ #
+ # # Do rx substitutions
+ # @options[:rx].each do |rx|
+ # rx.strip!
+ # delimiter = rx[0, 1]
+ # rx = rx.gsub(/\\#{delimiter}/, "\n")
+ # ra = rx.split(/#{delimiter}/).reject {|e| e.empty? }.each {|e| e.gsub!(/\n/, "#{delimiter}")}
+ # raise ParserException, "Invalid regular expression" if ra.empty? || ra[0].nil? || ra.size > 2
+ # pattern = ra[0]
+ # replacement = ra[1] || ''
+ # log.info "Replacing pattern /#{pattern.gsub(/#{delimiter}/, "\\#{delimiter}")}/ with \"#{replacement}\""
+ # source.gsub!(Regexp.new(pattern), replacement)
+ # end if @options[:rx]
+ #
+ # # Remove xml preamble if any
+ # preamble_rx = /^\s*<\?xml\s+[^>]+>\s*/mi
+ # if source =~ preamble_rx
+ # log.debug "-- Removing xml preamble"
+ # source.sub!(preamble_rx, '')
+ # end
+ #
+ # # Replace doctype
+ # doctype_rx = /^\s*<!DOCTYPE\s+[^>]+>\s*/mi
+ # if source =~ doctype_rx
+ # source.sub!(doctype_rx, '')
+ # end
+ # log.debug "-- Replacing doctype"
+ # source = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + source
+ #
+ # # Save processed file
+ # File.open(asset, 'w') do |f|
+ # f.write(source)
+ # end
+ # end
+
+ # def postprocess_doc(asset)
+ # doc = Nokogiri::HTML.parse(IO.read(asset), nil, 'UTF-8')
+ #
+ # # Set Content-Type charset to UTF-8
+ # doc.xpath('//head/meta[@http-equiv="Content-Type"]').each do |el|
+ # el['content'] = 'text/html; charset=utf-8'
+ # end
+ #
+ # # Process styles
+ # if @options[:css] && !@options[:css].empty?
+ # # Remove all stylesheet links
+ # doc.xpath('//head/link[@rel="stylesheet"]').remove
+ # if @options[:css] == '-'
+ # # Also remove all inline styles
+ # doc.xpath('//head/style').remove
+ # log.info "Removing all stylesheet links and style elements"
+ # else
+ # # Add custom stylesheet link
+ # link = Nokogiri::XML::Node.new('link', doc)
+ # link['rel'] = 'stylesheet'
+ # link['type'] = 'text/css'
+ # link['href'] = File.basename(@options[:css])
+ # # Add as the last child so it has precedence over (possible) inline styles before
+ # doc.at('//head').add_child(link)
+ # log.info "Replacing CSS refs with \"#{link['href']}\""
+ # end
+ # end
+ #
+ # # Insert elements after/before selector
+ # @options[:after].each do |e|
+ # selector = e.keys.first
+ # fragment = e[selector]
+ # element = doc.xpath(selector).first
+ # if element
+ # log.info "Inserting fragment \"#{fragment.to_html}\" after \"#{selector}\""
+ # fragment.children.to_a.reverse.each {|node| element.add_next_sibling(node) }
+ # end
+ # end if @options[:after]
+ # @options[:before].each do |e|
+ # selector = e.keys.first
+ # fragment = e[selector]
+ # element = doc.xpath(selector).first
+ # if element
+ # log.info "Inserting fragment \"#{fragment}\" before \"#{selector}\""
+ # fragment.children.to_a.each {|node| element.add_previous_sibling(node) }
+ # end
+ # end if @options[:before]
+ #
+ # # Remove elements
+ # @options[:remove].each do |selector|
+ # log.info "Removing elements \"#{selector}\""
+ # doc.search(selector).remove
+ # end if @options[:remove]
+ #
+ # # XXX
+ # # doc.xpath('//body/a').each do |a|
+ # # wrapper = Nokogiri::XML::Node.new('p', doc)
+ # # a.add_next_sibling(wrapper)
+ # # wrapper << a
+ # # end
+ #
+ # # Save processed doc
+ # File.open(asset, 'w') do |f|
+ # # HACK: Nokogiri seems to ignore the fact that xmlns and other attrs are already present
+ # # in the html node and adds them anyway. Just remove them here to avoid duplicates.
+ # doc.root.attributes.each {|name, value| doc.root.remove_attribute(name) }
+ # doc.write_xhtml_to(f, :encoding => 'UTF-8')
+ # end
+ # end
end
end
View
65 lib/repub/app/fetcher.rb
@@ -4,7 +4,7 @@
require 'iconv'
require 'rubygems'
-# Temporary disable warnings from chardet
+# Disable warnings from chardet
old_verbose = $VERBOSE
$VERBOSE = false
require 'UniversalDetector'
@@ -17,7 +17,7 @@ module Fetcher
class FetcherException < RuntimeError; end
def fetch
- Fetcher.new(options).fetch
+ FetcherSupport.new(options).fetch
end
AssetTypes = {
@@ -26,7 +26,7 @@ def fetch
:images => %w[jpg jpeg png gif svg]
}
- class Fetcher
+ class FetcherSupport
include Logger
Downloaders = {
@@ -63,74 +63,21 @@ def fetch
raise FetcherException, "Fetch failed."
end
unless cache.cached?
- preprocess
- #fix_filenames(cache)
- #fix_encoding(cache, @options[:encoding])
+ preprocess cache
end
end
end
private
- def preprocess
+ def preprocess(cache)
cache.assets[:documents].each do |file|
log.info "Preprocessing #{file}"
s = PreFilters.apply_filters(IO.read(file), @options)
File.open(file, 'w') { |f| f.write(s) }
end
end
- # HACK HACK HACK
- # ADE seems to have problems following TOC in content files with .htm extension
- # Renaming these files to .html and fix references inside them
- #
- def fix_filenames(cache)
- # # TODO: fix non-alphanum characters in doc filenames
- # documents = []
- # cache.assets[:documents].each do |file_name|
- # if file_name =~ /\.htm$/i
- # proper_name = file_name.gsub($&, '.html')
- # FileUtils.mv(file_name, proper_name)
- # s = IO.read(proper_name)
- # raise FetcherException, "empty document" unless s
- # s.gsub!(file_name, proper_name)
- # File.open(proper_name, 'w') { |f| f.write(s) }
- # documents << proper_name
- # else
- # documents << file_name
- # end
- # end
- # cache.assets[:documents] = documents
-
- # XXX
- cache.assets[:documents].each do |file_name|
- s = IO.read(file_name)
- m = s.scan(/\s+(?:id|name)\s*?=\s*?['"](\d+[^'"]*)['"]/im)
- unless m.empty?
- m.each do |i|
- s.gsub!(i[0], "a#{i[0]}")
- end
- File.open(file_name, 'w') { |f| f.write(s) }
- end
- end
-
- end
-
- def fix_encoding(cache, encoding = nil)
- cache.assets[:documents].each do |file_name|
- unless encoding
- log.info "Detecting encoding for #{file_name}"
- s = IO.read(file_name)
- raise FetcherException, "empty document" unless s
- encoding = UniversalDetector.chardet(s)['encoding']
- end
- if encoding.downcase != 'utf-8'
- log.info "Source encoding appears to be #{encoding}, converting to UTF-8"
- s = Iconv.conv('utf-8', encoding, s)
- File.open(file_name, 'w') { |f| f.write(s) }
- end
- end
- end
-
+
def which(cmd)
if !RUBY_PLATFORM.match('mswin')
cmd = `/usr/bin/which #{cmd}`.strip
View
1 lib/repub/app/filter.rb
@@ -12,6 +12,7 @@ def self.included(base)
attr_reader :options
end
base.extend(ClassMethods)
+ base.extend(Logger)
end
def options
View
6 lib/repub/app/options.rb
@@ -17,7 +17,6 @@ def parse_options(args)
:browser => false,
:css => nil,
:encoding => nil,
- :fixup => true,
:helper => 'wget',
:metadata => {},
:output_path => Dir.getwd,
@@ -119,11 +118,6 @@ def parse_options(args)
options[:metadata][name.to_sym] = value
end
- opts.on("-F", "--no-fixup",
- "Do not attempt to make document meet XHTML 1.0 Strict.",
- "Default is to try and fix things that are broken. "
- ) { |value| options[:fixup] = false }
-
opts.on("-e", "--encoding NAME", String,
"Set source document encoding. Default is to autodetect."
) { |value| options[:encoding] = value }
View
1 lib/repub/app/parser.rb
@@ -31,7 +31,6 @@ class Parser
def initialize(options)
@selectors = options[:selectors] || Selectors
- @fixup = options[:fixup]
end
# Parse downloaded asset cache
View
135 lib/repub/app/post_filters.rb
@@ -0,0 +1,135 @@
+require 'repub/app/filter'
+
+module Repub
+ class App
+ class PostFilters
+
+ class FileFilters
+ include Filter
+
+ # Do rx substitutions
+ #
+ filter :do_rxes do |s|
+ options[:rx].each do |rx|
+ rx.strip!
+ delimiter = rx[0, 1]
+ rx = rx.gsub(/\\#{delimiter}/, "\n")
+ ra = rx.split(/#{delimiter}/).reject {|e| e.empty? }.each {|e| e.gsub!(/\n/, "#{delimiter}")}
+ raise ParserException, "Invalid regular expression" if ra.empty? || ra[0].nil? || ra.size > 2
+ pattern = ra[0]
+ replacement = ra[1] || ''
+ log.info "Replacing pattern /#{pattern.gsub(/#{delimiter}/, "\\#{delimiter}")}/ with \"#{replacement}\""
+ s.gsub!(Regexp.new(pattern), replacement)
+ end if options[:rx]
+ s
+ end
+
+ # Remove xml preamble if any
+ #
+ filter :fix_xml_preamble do |s|
+ preamble_rx = /^\s*<\?xml\s+[^>]+>\s*/mi
+ if s =~ preamble_rx
+ log.debug "-- Removing xml preamble"
+ s.sub!(preamble_rx, '')
+ end
+ s
+ end
+
+ # Replace doctype
+ #
+ filter :fix_doctype do |s|
+ doctype_rx = /^\s*<!DOCTYPE\s+[^>]+>\s*/mi
+ if s =~ doctype_rx
+ s.sub!(doctype_rx, '')
+ end
+ log.debug "-- Replacing doctype"
+ s = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + s
+ s
+ end
+ end
+
+ class DocumentFilters
+ include Filter
+
+ # Set Content-Type charset to UTF-8
+ #
+ filter :fix_content_type do |doc|
+ doc.xpath('//head/meta[@http-equiv="Content-Type"]').each do |el|
+ el['content'] = 'text/html; charset=utf-8'
+ end
+ doc
+ end
+
+ # Process styles
+ #
+ filter :fix_styles do |doc|
+ if options[:css] && !options[:css].empty?
+ # Remove all stylesheet links
+ doc.xpath('//head/link[@rel="stylesheet"]').remove
+ if options[:css] == '-'
+ # Also remove all inline styles
+ doc.xpath('//head/style').remove
+ log.info "Removing all stylesheet links and style elements"
+ else
+ # Add custom stylesheet link
+ link = Nokogiri::XML::Node.new('link', doc)
+ link['rel'] = 'stylesheet'
+ link['type'] = 'text/css'
+ link['href'] = File.basename(@options[:css])
+ # Add as the last child so it has precedence over (possible) inline styles before
+ doc.at('//head').add_child(link)
+ log.info "Replacing CSS refs with \"#{link['href']}\""
+ end
+ end
+ doc
+ end
+
+ # Insert elements after/before selector
+ #
+ filter :do_inserts do |doc|
+ options[:after].each do |e|
+ selector = e.keys.first
+ fragment = e[selector]
+ element = doc.xpath(selector).first
+ if element
+ log.info "Inserting fragment \"#{fragment.to_html}\" after \"#{selector}\""
+ fragment.children.to_a.reverse.each {|node| element.add_next_sibling(node) }
+ end
+ end if options[:after]
+ options[:before].each do |e|
+ selector = e.keys.first
+ fragment = e[selector]
+ element = doc.xpath(selector).first
+ if element
+ log.info "Inserting fragment \"#{fragment}\" before \"#{selector}\""
+ fragment.children.to_a.each {|node| element.add_previous_sibling(node) }
+ end
+ end if options[:before]
+ doc
+ end
+
+ # Remove elements
+ #
+ filter :do_removes do |doc|
+ options[:remove].each do |selector|
+ log.info "Removing elements \"#{selector}\""
+ doc.search(selector).remove
+ end if options[:remove]
+ doc
+ end
+
+ # TODO: XHTML requires <a> to have an enclosing element (e.g. wrap bare anchors under <body> in <p>)
+ # filter :wrap_anchors do |doc|
+ # log.info "Wrapping anchors"
+ # doc.xpath('//body/a').each do |a|
+ # wrapper = Nokogiri::XML::Node.new('p', doc)
+ # a.add_next_sibling(wrapper)
+ # wrapper << a
+ # end
+ # doc
+ # end
+ end
+
+ end
+ end
+end
View
0 lib/repub/app/postfilters.rb
No changes.
View
18 lib/repub/app/prefilters.rb → lib/repub/app/pre_filters.rb
@@ -2,37 +2,37 @@
module Repub
class App
- class PreFilter
+ class PreFilters
include Filter
# Detect and convert source encoding
# Standard requires it to be UTF-8
#
- filter :fix_encoding do |content|
+ filter :fix_encoding do |s|
encoding = options[:encoding]
unless encoding
log.info "Detecting encoding"
- encoding = UniversalDetector.chardet(content)['encoding']
+ encoding = UniversalDetector.chardet(s)['encoding']
end
if encoding.downcase != 'utf-8'
log.info "Source encoding appears to be #{encoding}, converting to UTF-8"
- content = Iconv.conv('utf-8', encoding, content)
+ s = Iconv.conv('utf-8', encoding, s)
end
- content
+ s
end
# Find and fix all elements with id or name attributes beginning with digit
# ADE wont follow links referencing such ids
#
- filter :fix_ids do |content|
- match = content.scan(/\s+(?:id|name)\s*?=\s*?['"](\d+[^'"]*)['"]/im)
+ filter :fix_ids do |s|
+ match = s.scan(/\s+(?:id|name)\s*?=\s*?['"](\d+[^'"]*)['"]/im)
unless match.empty?
log.debug "-- Fixing broken element IDs"
match.each do |m|
- content.gsub!(m[0], "x#{m[0]}")
+ s.gsub!(m[0], "x#{m[0]}")
end
end
- content
+ s
end
end
View
2 lib/repub/app/profile.rb
@@ -5,7 +5,7 @@ module Repub
class App
module Profile
- PROFILE_KEYS = %w[css encoding fixup helper metadata remove rx selectors].map {|k| k.to_sym}
+ PROFILE_KEYS = %w[css encoding helper metadata remove rx selectors].map {|k| k.to_sym}
def load_profile(name = nil)
name ||= 'default'
View
28 test/test_filter.rb
@@ -0,0 +1,28 @@
+require "test/unit"
+
+require 'repub'
+require 'repub/app'
+
+class TestFilter < Test::Unit::TestCase
+ include Repub::App::Filter
+
+ filter :filter_1 do |s|
+ log.info 'in filter_1'
+ s.upcase
+ end
+
+ filter :filter_2 do |s|
+ log.info 'in filter_2'
+ "++ #{s} --"
+ end
+
+ filter :filter_3 do |s|
+ log.info 'in filter_3'
+ s.gsub(/\s/, '|')
+ end
+
+ def test_case_name
+ res = TestFilter.apply_filters('klaatu barada nikto')
+ assert_equal('++|KLAATU|BARADA|NIKTO|--', res)
+ end
+end
View
42 untitled.rb
@@ -1,42 +0,0 @@
-#!/usr/bin/env ruby
-
-module Filter
-
- def self.included(base)
- (class << base; self; end).instance_eval do
- define_method(:filter) do |name, &block|
- @filters ||= []
- @filters << {:name => name, :proc => Proc.new(&block) }
- end
- attr_reader :filters
- attr_reader :options
- end
- base.extend(ClassMethods)
- end
-
- def options
- self.class.options
- end
-
- module ClassMethods
- def apply_filters(input, options = nil)
- @options = options
- @filters.inject(input) { |input, filter| filter[:proc].call(input) }
- end
- end
-end
-
-class FilterTest
- include Filter
-
- filter :filter_1 do |s|
- p options
- s.upcase
- end
-
- filter :filter_2 do |s|
- "++ #{s} --"
- end
-end
-
-p FilterTest.apply_filters('hi there')

0 comments on commit 807960c

Please sign in to comment.
Something went wrong with that request. Please try again.