Skip to content

Commit

Permalink
parser and builder fixes for Nokogiri
Browse files Browse the repository at this point in the history
  • Loading branch information
Dmitri Goutnik committed Jun 28, 2009
1 parent ee034f4 commit 8de76f9
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 44 deletions.
15 changes: 8 additions & 7 deletions bin/repub
Expand Up @@ -6,20 +6,21 @@ require File.expand_path(
require 'repub/app'

# THE ADVENTURES OF SHERLOCK HOLMES
# repub -x 'title:body/h1' -x 'toc:body//table' 'toc_item://tr' -X 'body/pre,body//hr,body/h1,body/h2' http://www.gutenberg.org/dirs/etext99/advsh12h.htm
# repub -x 'title:div[@class='book']//h1' -x 'toc://table' -x 'toc_item://tr' -X '//pre' -X '//hr' -X '//body/h1' -X '//body/h2' http://www.gutenberg.org/dirs/etext99/advsh12h.htm
#
# ALICE'S ADVENTURES IN WONDERLAND
# repub -x 'title:body/h1' -x 'toc:body//table' -x 'toc_item://tr' -X 'body/pre,body//hr,body/h4' http://www.gutenberg.org/files/11/11-h/11-h.htm
# repub -x 'title:body/h1' -x 'toc://table' -x 'toc_item://tr' -X '//pre' -X '//hr' -X '//body/h4' http://www.gutenberg.org/files/11/11-h/11-h.htm
#
# The Gelug-Kagyu Tradition of Mahamudra
# http://www.berzinarchives.com/web/x/prn/p.html_680632258.html
# repub http://www.berzinarchives.com/web/x/prn/p.html_680632258.html
#
# Брюс Стерлинг. Схизматрица
# repub -x 'title://h2' -x 'toc:table' -x 'toc_item://a' -X 'div,table,//hr' http://lib.ru/STERLINGB/shizmatrica.txt_with-big-pictures.html
# repub -x 'title://h2' -x 'toc://table' -x 'toc_item://a' -X 'div' -X 'table' -X '//hr' http://lib.ru/STERLINGB/shizmatrica.txt_with-big-pictures.html
#
# Айзек Азимов. Космические течения
# repub -B -v -x 'title://h2' -x 'toc://table' -x 'toc_item://a' -X 'div' -X 'table' -X '//hr' http://lib.ru/FOUNDATION/currspac.txt_with-big-pictures.html
#
# Git User's Manual
# repub -x 'title://h1' -x 'toc://div.toc/dl' -x 'toc_item:/dt' http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
# with hierarchy
# repub -x 'title://h1' -x 'toc://div.toc'/dl -x 'toc_item:/dt' -x 'toc_section:/dl' http://www.kernel.org/pub/software/scm/git/docs/user-manual.html
# repub -x 'title://h1' -x 'toc://div[@class="toc"]/dl' -x 'toc_item:dt' -x 'toc_section:following-sibling::*[1]/dl' http://www.kernel.org/pub/software/scm/git/docs/user-manual.html

Repub::App.instance.run ARGV
1 change: 0 additions & 1 deletion lib/repub/app.rb
Expand Up @@ -26,7 +26,6 @@ def self.data_path

def run(args)
parse_options(args)
exit

log.level = options[:verbosity]
log.info "Making ePub from #{options[:url]}"
Expand Down
33 changes: 19 additions & 14 deletions lib/repub/app/builder.rb
Expand Up @@ -104,32 +104,37 @@ def postprocess_file(asset)
end

def postprocess_doc(asset)

# XXX !!! TODO
#doc = Hpricot(open(asset), :xhtml_strict => @options[:fixup])
#

# Substitute custom stylesheet
doc = Nokogiri::HTML.parse(open(asset), nil, 'UTF-8')
# Substitute custom CSS
if (@options[:css] && !@options[:css].empty?)
doc.search('//link[@rel="stylesheet"]') do |link|
doc.xpath('//link[@rel="stylesheet"]') do |link|
link[:href] = File.basename(@options[:css])
log.debug "-- Replacing CSS refs with #{link[:href]}"
end
end
# Remove elements
if @options[:remove] && !@options[:remove].empty?
@options[:remove].each do |selector|
log.info "Removing element(s) matching selector \"#{selector}\""
log.info "Removing elements matching selector \"#{selector}\""
#p doc.search(selector).size
#p doc.search(selector)
doc.search(selector).remove
end
end
# Translate a name -> id
doc.search('//a[@name]') do |a|
a[:id] = a[:name]
end
# Overwrite asset with fixed version
# XXX
# doc.search('//a[@name and not(@id)]') do |a|
# a[:id] = a[:name]
# end
# Save processed version
File.open(asset, 'w') do |f|
f << doc.to_html
if @options[:fixup]
# HACK: Nokogiri seems to ignore the fact that xmlns and other attrs are already present and adds them anyway
# So we just remove them here to avoid duplicates
doc.root.attributes.each {|name, value| doc.root.remove_attribute(name) }
doc.write_xhtml_to(f, :encoding => 'UTF-8')
else
doc.write_html_to(f, :encoding => 'UTF-8')
end
end
end

Expand Down
2 changes: 0 additions & 2 deletions lib/repub/app/options.rb
Expand Up @@ -158,14 +158,12 @@ def parse_options(args)
parser.parse! args
rescue OptionParser::ParseError => ex
log.fatal "ERROR: #{ex.to_s}. See '#{App.name} --help'."
exit 1
end

options[:url] = args.last
if options[:url].nil? || options[:url].empty?
help parser
log.fatal "ERROR: Please specify an URL."
exit 1
end
end

Expand Down
42 changes: 22 additions & 20 deletions lib/repub/app/parser.rb
Expand Up @@ -15,9 +15,9 @@ def parse(cache)
#
Selectors = {
:title => '//h1',
:toc => '//div.toc/ul',
:toc_item => '/li',
:toc_section => '/ul'
:toc => '//div[@class="toc"]/ul',
:toc_item => './li',
:toc_section => './ul'
}

class Parser
Expand All @@ -43,7 +43,7 @@ def parse(cache)
@cache = cache
@asset = @cache.assets[:documents][0]
log.debug "-- Parsing #{@asset}"
@doc = Nokogiri::HTML(open(File.join(@cache.path, @asset)))
@doc = Nokogiri::HTML.parse(open(File.join(@cache.path, @asset)), nil, 'UTF-8')

@uid = @cache.name
parse_title
Expand All @@ -64,7 +64,7 @@ def parse_title
if el.children.empty?
title_text = el.inner_text
else
title_text = el.children.map{|c| c.inner_text }.join(' ')
title_text = el.children.map{|c| c.inner_text }.join(' ')
end
@title = title_text.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
log.info "Found title \"#{@title}\""
Expand Down Expand Up @@ -102,7 +102,7 @@ def src

def parse_toc
log.debug "-- Looking for TOC with #{@selectors[:toc]}"
el = @doc.at(@selectors[:toc])
el = @doc.xpath(@selectors[:toc]).first
if el
@toc = parse_toc_section(el)
log.info "Found TOC with #{@toc.size} top-level items"
Expand All @@ -115,24 +115,26 @@ def parse_toc
def parse_toc_section(section)
toc = []
log.debug "-- Looking for TOC items with #{@selectors[:toc_item]}"
section.search(@selectors[:toc_item]).each do |item|
section.xpath(@selectors[:toc_item]).each do |item|
a = item.name == 'a' ? item : item.at('a')
next if !a
href = a[:href]
next if !a || !href
title = item.inner_text.gsub(/\s+/, ' ').strip
next if !href
if item.children.empty?
title = item.inner_text
else
title = item.children.map{|c| c.inner_text }.join(' ')
end
title = title.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
log.debug "-- Found item: #{title}"
subitems = nil
subsections = item.search(@selectors[:toc_section])
#p "++ #{item.search(@selectors[:toc_section])}"
#p "== #{subsections.size}" if subsections
p "== #{item}" if subsections
p "== #{@selectors[:toc_section]} #{subsections}" if subsections
#p subsections.size if subsections
subsections.each do |subsection|
log.debug "-- Found section with #{@selectors[:toc_section]} >>>"
subsection = item.xpath(@selectors[:toc_section]).first
#p subsection
if subsection
log.debug "-- Found section with #{@selectors[:toc_section]}"
log.debug "-- >"
subitems = parse_toc_section(subsection)
log.debug '-- <<<'
end if subsections
log.debug '-- .'
end
toc << TocItem.new(title, href, subitems, @asset)
end
toc
Expand Down

0 comments on commit 8de76f9

Please sign in to comment.