Permalink
Browse files

parser: toc parsing

  • Loading branch information...
1 parent 46bbd69 commit 40a2e766b921e4a837b3a2782fe1c73b04e040e4 @invisiblellama committed Jul 6, 2009
View
@@ -30,4 +30,3 @@ depend_on 'nokogiri'
depend_on 'builder'
depend_on 'chardet'
depend_on 'launchy'
-depend_on 'mime-types'
View
@@ -50,15 +50,15 @@ def build(parser)
@ncx = Epub::NCX.new(@parser.uid)
@opf << @ncx
@ncx.title = @opf.metadata.title
- @ncx.nav_map << @parser.toc
+ @ncx.nav_map.points = @parser.toc
# Set up output filename and path
@output_path = File.expand_path(@options[:output_path].if_blank('.'))
if File.exist?(@output_path) && File.directory?(@output_path)
@output_path = File.join(@output_path, @opf.metadata.title.gsub(/\s/, '_'))
end
@output_path = @output_path + '.epub'
- log.debug "-- Setting output path to #{@output_path}"
+ log.debug "-- Output path is #{@output_path}"
# Build EPUB
tmpdir = Dir.mktmpdir(App::name)
View
@@ -27,7 +27,6 @@ class Parser
attr_reader :cache
attr_reader :uid
attr_reader :title
- attr_reader :title_html
attr_reader :toc
def initialize(options)
@@ -42,22 +41,20 @@ def parse(cache)
cache.assets[:documents].size > 1
@cache = cache
- @asset = @cache.assets[:documents][0]
- log.debug "-- Parsing #{@asset}"
- @doc = Nokogiri::HTML.parse(IO.read(File.join(@cache.path, @asset)), nil, 'UTF-8')
+ @document = @cache.assets[:documents][0]
+ log.debug "-- Parsing #{@document}"
+ @doc = Nokogiri::HTML.parse(IO.read(File.join(@cache.path, @document)), nil, 'UTF-8')
@uid = @cache.name
parse_title
- parse_title_html
+ #parse_title_html
parse_toc
self
end
private
- UNTITLED = 'Untitled'
-
def parse_title
log.debug "-- Looking for title with #{@selectors[:title]}"
el = @doc.at(@selectors[:title])
@@ -70,98 +67,67 @@ def parse_title
@title = title_text.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
log.info "Found title \"#{@title}\""
else
- @title = UNTITLED
+ @title = 'Untitled'
log.warn "** Could not find document title, using '#{@title}'"
end
end
- def parse_title_html
- log.debug "-- Looking for html title with #{@selectors[:title]}"
- el = @doc.at(@selectors[:title])
- @title_html = el ? el.inner_html.gsub(/[\r\n]/, '') : UNTITLED
- end
-
- # Helper container for TOC items
- #
- class TocItem < Struct.new(
- :title,
- :uri,
- :fragment_id
- )
+ class TocItem < Repub::Epub::NCX::NavPoint
- def initialize(title, uri_with_fragment_id, subitems, asset)
- self.title = title
- self.uri, self.fragment_id = uri_with_fragment_id.split(/#/)
- self.uri = asset if self.uri.empty?
- @subitems = subitems || []
- end
-
- attr_reader :subitems
-
- def src
- "#{uri}##{fragment_id}"
+ def initialize(title, uri_with_fragment_id, subitems, document)
+ uri, fragment_id = uri_with_fragment_id.split(/#/)
+ uri = document if uri.empty?
+ super(title, "#{uri}##{fragment_id}", subitems)
end
+
end
# Parses the document's table of contents into @toc.
# Looks up the TOC root with @selectors[:toc]; when it is found, @toc becomes the list of
# top-level TocItem objects, otherwise @toc stays [] and a warning is logged.
def parse_toc
+ @toc = []
+ depth = 0
+
# Recursive walker: returns the TocItems found under +section+ via @selectors[:toc_item],
# calling itself for each nested section. +depth+ is only used to indent the debug log.
+ l = lambda do |section|
+ toc_items = []
+ depth += 1
+ section.xpath(@selectors[:toc_item]).each do |item|
+ # Get item's anchor and href
+ a = item.name == 'a' ? item : item.at('a')
+ next if !a
+ href = a['href']
+ next if !href
+
+ # Is this a leaf item or node? Title parsing depends on that.
+ subsection = item.xpath(@selectors[:toc_section]).first
+ if subsection
+ # Item has subsection, use anchor text for title
+ title = a.inner_text
+ else
+ # Leaf item, glue inner_text from all children
+ title = item.children.map{|c| c.inner_text }.join(' ')
+ end
+ title = title.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
+ log.debug "-- #{" " * depth}#{title}"
+
+ # Parse subsection
+ subitems = l.call(subsection) if subsection
+
+ toc_items << TocItem.new(title, href, subitems, @document)
+ end
+ depth -= 1
+ toc_items
+ end
+
log.debug "-- Looking for TOC with #{@selectors[:toc]}"
- el = @doc.xpath(@selectors[:toc]).first
- if el
- @toc = parse_toc_section(el)
+ toc_element = @doc.xpath(@selectors[:toc]).first
+
+ if toc_element
+ log.debug "-- Found TOC, parsing items with #{@selectors[:toc_item]} and sections with #{@selectors[:toc_section]}"
+ @toc = l.call(toc_element)
log.info "Found TOC with #{@toc.size} top-level items"
else
- @toc = []
log.warn "** Could not find document table of contents"
end
end
-
- def parse_toc_section(section)
- toc = []
- log.debug "-- Looking for TOC items with #{@selectors[:toc_item]}"
- section.xpath(@selectors[:toc_item]).each do |item|
- # Get item's anchor and href
- a = item.name == 'a' ? item : item.at('a')
- next if !a
- href = a['href']
- next if !href
- # Is this a leaf item or node ?
- subsection = item.xpath(@selectors[:toc_section]).first
- if subsection
- # Item has subsection, use anchor text for title
- title = a.inner_text
- else
- # Leaf item, glue inner_text from all children
- title = item.children.map{|c| c.inner_text }.join(' ')
- end
- title = title.gsub(/[\r\n]/, '').gsub(/\s+/, ' ').strip
- log.debug "-- Found item: #{title}"
- # Parse sub-section
- if subsection
- log.debug "-- Found section with #{@selectors[:toc_section]}"
- log.debug "-- >"
- subitems = parse_toc_section(subsection)
- log.debug '-- .'
- end
- toc << TocItem.new(title, href, subitems, @asset)
- end
- toc
- end
-
- # Monkey-patch NavMap to allow it to be set from Parser's TOC items collection
- #
- class << Repub::Epub::NCX::NavMap
- def <<(toc)
- l = lambda do |toc_items|
- toc_items.each do |toc_item|
- point = Repub::Epub::NCX::NavPoint.new(toc_item.title, toc_item.src)
- points << point
- l.call(toc_item.subitems) unless toc_item.subitems.empty?
- end
- end
- l.call(toc)
- end
- end
-
end
end
@@ -6,6 +6,7 @@ module Epub
# Mixin for stuff that can be added to the ePub package
#
module Containable
+ attr_accessor :id
attr_accessor :file_path
attr_accessor :media_type
@@ -20,8 +21,9 @@ def document?
class Item
include Containable
- def initialize(file_path, media_type = nil)
+ def initialize(file_path, id = nil, media_type = nil)
@file_path = file_path.strip
+ @id = id
@media_type = media_type || case @file_path.downcase
when /.*\.html?$/
'application/xhtml+xml'
View
@@ -1,13 +0,0 @@
-module Repub
- module Epub
-
- class Mimetype
- def self.save(path = 'mimetype')
- File.open(path, 'w') do |f|
- f << 'application/epub+zip'
- end
- end
- end
-
- end
-end
View
@@ -9,6 +9,7 @@ class NCX
def initialize(uid, file_path = 'toc.ncx')
@file_path = file_path
+ @id = 'ncx'
@media_type = 'application/x-dtbncx+xml'
@head = Head.new(uid)
@doc_title = DocTitle.new('Untitled')
@@ -27,7 +28,7 @@ def title=(text)
def to_xml
out = ''
- builder = Builder::XmlMarkup.new(:target => out)
+ builder = Builder::XmlMarkup.new(:target => out, :indent => 4)
builder.instruct!
builder.declare! :DOCTYPE, :ncx, :PUBLIC, "-//NISO//DTD ncx 2005-1//EN", "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd"
builder.ncx :xmlns => "http://www.daisy.org/z3986/2005/ncx/", :version => "2005-1" do
@@ -78,28 +79,34 @@ class NavPoint < Struct.new(
:src
)
# Creates a navigation point.
# title and src are handed to the Struct initializer; points is an optional list of child
# nav points (an empty list when omitted). play_order starts at 0 and is assigned later
# by calc_depth_and_play_order.
- def initialize(title, src)
- super
- #@@last_play_order = 0
+ def initialize(title, src, points = nil)
+ super(title, src)
@play_order = 0
- @points = []
+ @points = points || []
end
attr_accessor :play_order
- attr_reader :points
+ attr_accessor :points
# Writes this point to +builder+ as a navPoint element: a navLabel holding the title,
# a content element pointing at src, then every child point nested recursively.
def to_xml(builder)
- builder.navPoint :id => @play_order.to_s, :playOrder => @play_order do
+ builder.navPoint :id => point_id(@play_order), :playOrder => @play_order do
builder.navLabel do
builder.text self.title
end
builder.content :src => self.src
@points.each { |point| point.to_xml(builder) }
end
end
+
+ private
+
# Formats the id attribute for a nav point from its play order, e.g. 3 -> "navPoint-3".
# NOTE(review): presumably prefixed because a bare number is not a valid XML ID - confirm.
+ def point_id(play_order)
+ "navPoint-#{play_order}"
+ end
end
class NavMap < NavPoint
+
def initialize
super(nil, nil)
@depth = 1
@@ -112,12 +119,11 @@ def calc_depth_and_play_order
l = lambda do |points, depth|
@depth = depth if depth > @depth
points.each do |point|
- point.play_order = play_order += 1
+ point.play_order = (play_order += 1)
l.call(point.points, depth + 1) unless point.points.empty?
end
end
- @depth = 1
- l.call(@points, @depth)
+ l.call(@points, @depth = 1)
end
def to_xml(builder)
View
@@ -86,7 +86,7 @@ def <<(item)
def to_xml
out = ''
- builder = Builder::XmlMarkup.new(:target => out)
+ builder = Builder::XmlMarkup.new(:target => out, :indent => 4)
builder.instruct!
builder.package :xmlns => "http://www.idpf.org/2007/opf",
'unique-identifier' => "dcidid",
@@ -106,18 +106,22 @@ def save
private
# Formats a manifest item id from the item's zero-based position, e.g. 0 -> "item-1".
+ def item_id(index)
+ "item-#{index + 1}"
+ end
+
# Writes the manifest element: one item per entry in @items, carrying its positional id,
# file path (href) and media type.
# NOTE(review): ids are derived from the position in @items; the item's own id attribute
# is not consulted here - confirm that is intended.
def manifest_to_xml(builder)
builder.manifest do
@items.each_with_index do |item, index|
- builder.item :id => index.to_s, :href => item.file_path, 'media-type' => item.media_type
+ builder.item :id => item_id(index), :href => item.file_path, 'media-type' => item.media_type
end
end
end
# Writes the spine element: one itemref for each entry in @items whose document? is true.
# The idref is computed from the index into the full @items list, so it matches the id
# that manifest_to_xml gives the same item.
def spine_to_xml(builder)
builder.spine do
@items.each_with_index do |item, index|
- builder.itemref :idref => index.to_s if item.document?
+ builder.itemref :idref => item_id(index) if item.document?
end
end
end
View
@@ -36,10 +36,9 @@ def test_parser
assert_equal(3, parser.toc.size)
assert_equal('Chapter 1', parser.toc[0].title)
assert_equal('Chapter 3', parser.toc[2].title)
- assert_equal(2, parser.toc[0].subitems.size)
- assert_equal('Chapter 1.2', parser.toc[0].subitems[1].title)
- assert_equal(cache.assets[:documents][0], parser.toc[0].subitems[1].uri)
- assert_equal('c12', parser.toc[0].subitems[1].fragment_id)
+ assert_equal(2, parser.toc[0].points.size)
+ assert_equal('Chapter 1.2', parser.toc[0].points[1].title)
+ assert_equal("#{cache.assets[:documents][0]}#c12", parser.toc[0].points[1].src)
end
end

0 comments on commit 40a2e76

Please sign in to comment.