From 8de76f960a599b3d71f350603304978af23e96b6 Mon Sep 17 00:00:00 2001 From: Dmitri Goutnik Date: Sun, 28 Jun 2009 14:44:38 +0400 Subject: [PATCH] parser and builder fixes for Nokogiri --- bin/repub | 15 +++++++------- lib/repub/app.rb | 1 - lib/repub/app/builder.rb | 33 +++++++++++++++++-------------- lib/repub/app/options.rb | 2 -- lib/repub/app/parser.rb | 42 +++++++++++++++++++++------------------- 5 files changed, 49 insertions(+), 44 deletions(-) diff --git a/bin/repub b/bin/repub index 0ef6d71..8b8b48c 100755 --- a/bin/repub +++ b/bin/repub @@ -6,20 +6,21 @@ require File.expand_path( require 'repub/app' # THE ADVENTURES OF SHERLOCK HOLMES -# repub -x 'title:body/h1' -x 'toc:body//table' 'toc_item://tr' -X 'body/pre,body//hr,body/h1,body/h2' http://www.gutenberg.org/dirs/etext99/advsh12h.htm +# repub -x 'title:div[@class='book']//h1' -x 'toc://table' -x 'toc_item://tr' -X '//pre' -X '//hr' -X '//body/h1' -X '//body/h2' http://www.gutenberg.org/dirs/etext99/advsh12h.htm # # ALICE'S ADVENTURES IN WONDERLAND -# repub -x 'title:body/h1' -x 'toc:body//table' -x 'toc_item://tr' -X 'body/pre,body//hr,body/h4' http://www.gutenberg.org/files/11/11-h/11-h.htm +# repub -x 'title:body/h1' -x 'toc://table' -x 'toc_item://tr' -X '//pre' -X '//hr' -X '//body/h4' http://www.gutenberg.org/files/11/11-h/11-h.htm # # The Gelug-Kagyu Tradition of Mahamudra -# http://www.berzinarchives.com/web/x/prn/p.html_680632258.html +# repub http://www.berzinarchives.com/web/x/prn/p.html_680632258.html # # Брюс Стерлинг. Схизматрица -# repub -x 'title://h2' -x 'toc:table' -x 'toc_item://a' -X 'div,table,//hr' http://lib.ru/STERLINGB/shizmatrica.txt_with-big-pictures.html +# repub -x 'title://h2' -x 'toc://table' -x 'toc_item://a' -X 'div' -X 'table' -X '//hr' http://lib.ru/STERLINGB/shizmatrica.txt_with-big-pictures.html # +# Айзек Азимов. 
Космические течения +# repub -B -v -x 'title://h2' -x 'toc://table' -x 'toc_item://a' -X 'div' -X 'table' -X '//hr' http://lib.ru/FOUNDATION/currspac.txt_with-big-pictures.html +# # Git User's Manual -# repub -x 'title://h1' -x 'toc://div.toc/dl' -x 'toc_item:/dt' http://www.kernel.org/pub/software/scm/git/docs/user-manual.html -# with hierarchy -# repub -x 'title://h1' -x 'toc://div.toc'/dl -x 'toc_item:/dt' -x 'toc_section:/dl' http://www.kernel.org/pub/software/scm/git/docs/user-manual.html +# repub -x 'title://h1' -x 'toc://div[@class="toc"]/dl' -x 'toc_item:dt' -x 'toc_section:following-sibling::*[1]/dl' http://www.kernel.org/pub/software/scm/git/docs/user-manual.html Repub::App.instance.run ARGV diff --git a/lib/repub/app.rb b/lib/repub/app.rb index dc2ec2b..b457f14 100644 --- a/lib/repub/app.rb +++ b/lib/repub/app.rb @@ -26,7 +26,6 @@ def self.data_path def run(args) parse_options(args) - exit log.level = options[:verbosity] log.info "Making ePub from #{options[:url]}" diff --git a/lib/repub/app/builder.rb b/lib/repub/app/builder.rb index afd30ae..ab08d6f 100644 --- a/lib/repub/app/builder.rb +++ b/lib/repub/app/builder.rb @@ -104,14 +104,10 @@ def postprocess_file(asset) end def postprocess_doc(asset) - - # XXX !!! TODO - #doc = Hpricot(open(asset), :xhtml_strict => @options[:fixup]) - # - - # Substitute custom stylesheet + doc = Nokogiri::HTML.parse(open(asset), nil, 'UTF-8') + # Substitute custom CSS if (@options[:css] && !@options[:css].empty?) - doc.search('//link[@rel="stylesheet"]') do |link| + doc.xpath('//link[@rel="stylesheet"]') do |link| link[:href] = File.basename(@options[:css]) log.debug "-- Replacing CSS refs with #{link[:href]}" end @@ -119,17 +115,26 @@ def postprocess_doc(asset) # Remove elements if @options[:remove] && !@options[:remove].empty? 
@options[:remove].each do |selector| - log.info "Removing element(s) matching selector \"#{selector}\"" + log.info "Removing elements matching selector \"#{selector}\"" + #p doc.search(selector).size + #p doc.search(selector) doc.search(selector).remove end end - # Translate a name -> id - doc.search('//a[@name]') do |a| - a[:id] = a[:name] - end - # Overwrite asset with fixed version + # XXX + # doc.search('//a[@name and not(@id)]') do |a| + # a[:id] = a[:name] + # end + # Save processed version File.open(asset, 'w') do |f| - f << doc.to_html + if @options[:fixup] + # HACK: Nokogiri seems to ignore the fact that xmlns and other attrs are already present and adds them anyway + # So we just remove them here to avoid duplicates + doc.root.attributes.each {|name, value| doc.root.remove_attribute(name) } + doc.write_xhtml_to(f, :encoding => 'UTF-8') + else + doc.write_html_to(f, :encoding => 'UTF-8') + end end end diff --git a/lib/repub/app/options.rb b/lib/repub/app/options.rb index a66d159..5d90dfc 100644 --- a/lib/repub/app/options.rb +++ b/lib/repub/app/options.rb @@ -158,14 +158,12 @@ def parse_options(args) parser.parse! args rescue OptionParser::ParseError => ex log.fatal "ERROR: #{ex.to_s}. See '#{App.name} --help'." - exit 1 end options[:url] = args.last if options[:url].nil? || options[:url].empty? help parser log.fatal "ERROR: Please specify an URL." 
- exit 1 end end diff --git a/lib/repub/app/parser.rb b/lib/repub/app/parser.rb index 95ffdef..73fab45 100644 --- a/lib/repub/app/parser.rb +++ b/lib/repub/app/parser.rb @@ -15,9 +15,9 @@ def parse(cache) # Selectors = { :title => '//h1', - :toc => '//div.toc/ul', - :toc_item => '/li', - :toc_section => '/ul' + :toc => '//div[@class="toc"]/ul', + :toc_item => './li', + :toc_section => './ul' } class Parser @@ -43,7 +43,7 @@ def parse(cache) @cache = cache @asset = @cache.assets[:documents][0] log.debug "-- Parsing #{@asset}" - @doc = Nokogiri::HTML(open(File.join(@cache.path, @asset))) + @doc = Nokogiri::HTML.parse(open(File.join(@cache.path, @asset)), nil, 'UTF-8') @uid = @cache.name parse_title @@ -64,7 +64,7 @@ def parse_title if el.children.empty? title_text = el.inner_text else - title_text = el.children.map{|c| c.inner_text }.join(' ') + title_text = el.children.map{|c| c.inner_text }.join(' ') end @title = title_text.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip log.info "Found title \"#{@title}\"" @@ -102,7 +102,7 @@ def src def parse_toc log.debug "-- Looking for TOC with #{@selectors[:toc]}" - el = @doc.at(@selectors[:toc]) + el = @doc.xpath(@selectors[:toc]).first if el @toc = parse_toc_section(el) log.info "Found TOC with #{@toc.size} top-level items" @@ -115,24 +115,26 @@ def parse_toc def parse_toc_section(section) toc = [] log.debug "-- Looking for TOC items with #{@selectors[:toc_item]}" - section.search(@selectors[:toc_item]).each do |item| + section.xpath(@selectors[:toc_item]).each do |item| a = item.name == 'a' ? item : item.at('a') + next if !a href = a[:href] - next if !a || !href - title = item.inner_text.gsub(/\s+/, ' ').strip + next if !href + if item.children.empty? 
+ title = item.inner_text + else + title = item.children.map{|c| c.inner_text }.join(' ') + end + title = title.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip log.debug "-- Found item: #{title}" - subitems = nil - subsections = item.search(@selectors[:toc_section]) - #p "++ #{item.search(@selectors[:toc_section])}" - #p "== #{subsections.size}" if subsections - p "== #{item}" if subsections - p "== #{@selectors[:toc_section]} #{subsections}" if subsections - #p subsections.size if subsections - subsections.each do |subsection| - log.debug "-- Found section with #{@selectors[:toc_section]} >>>" + subsection = item.xpath(@selectors[:toc_section]).first + #p subsection + if subsection + log.debug "-- Found section with #{@selectors[:toc_section]}" + log.debug "-- >" subitems = parse_toc_section(subsection) - log.debug '-- <<<' - end if subsections + log.debug '-- .' + end toc << TocItem.new(title, href, subitems, @asset) end toc