Permalink
Browse files

Epub 3.0: Extract chapters from navigation document

  • Loading branch information...
1 parent 495a8c7 commit 46dbcdc31fe11f5e697cb87a432b50a61b2fb93d @klacointe committed May 11, 2012
Showing with 128 additions and 8 deletions.
  1. +58 −6 lib/formats/epub.rb
  2. BIN test/fixtures/epubs/epub3_nested_nav.epub
  3. +70 −2 test/formats/epub_test.rb
View
@@ -7,7 +7,8 @@ class Peregrin::Epub
:opf => { 'opf' => 'http://www.idpf.org/2007/opf' },
:dc => { 'dc' => 'http://purl.org/dc/elements/1.1/' },
:ncx => { 'ncx' => 'http://www.daisy.org/z3986/2005/ncx/' },
- :svg => { 'svg' => 'http://www.w3.org/2000/svg' }
+ :svg => { 'svg' => 'http://www.w3.org/2000/svg' },
+ :nav => { 'nav' => 'http://www.w3.org/1999/xhtml'}
}
OCF_PATH = "META-INF/container.xml"
HTML5_TAGNAMES = %w[section nav article aside hgroup header footer figure figcaption] # FIXME: Which to divify? Which to leave as-is?
@@ -85,7 +86,7 @@ def load_from_path(epub_path)
docs = load_config_documents(zipfile)
extract_properties(docs[:opf])
extract_components(zipfile, docs[:opf], docs[:opf_root])
- extract_chapters(zipfile, docs[:ncx])
+ extract_chapters(zipfile, {:ncx => docs[:ncx], :nav => docs[:nav]})
extract_cover(zipfile, docs)
}
@book.read_resource_proc = lambda { |resource|
@@ -117,9 +118,10 @@ def load_config_documents(zipfile)
end
# Extract Epub version
- @book.version = docs[:opf].at_xpath('//opf:package', NAMESPACES[:opf])['version']
+ @book.version = docs[:opf].at_xpath('//opf:package', NAMESPACES[:opf])['version'].to_f
- # The NCX file
+ # The NCX file.
+ # Required only for EPUB < 3.0, but may be present in EPUB 3 for forward compatibility.
begin
spine = docs[:opf].at_xpath('//opf:spine', NAMESPACES[:opf])
ncx_id = spine['toc'] ? spine['toc'] : 'ncx'
@@ -132,7 +134,22 @@ def load_config_documents(zipfile)
ncx_content = zipfile.read(docs[:ncx_path])
docs[:ncx] = Nokogiri::XML::Document.parse(ncx_content)
rescue => e
- raise FailureLoadingNCX
+ # Only raise an exception for EPUBs with a version lower than 3.0
+ raise FailureLoadingNCX if @book.version < 3
+ end
+
+ # The NAV file. (Epub3 only)
+ if @book.version >= 3
+ begin
+ docs[:nav_path] = from_opf_root(
+ docs[:opf_root],
+ docs[:opf].at_xpath("//opf:manifest/opf:item[@properties='nav']", NAMESPACES[:opf])['href']
+ )
+ nav_content = zipfile.read(docs[:nav_path])
+ docs[:nav] = Nokogiri::XML::Document.parse(nav_content)
+ rescue => e
+ raise FailureLoadingNAV
+ end
end
docs
@@ -212,8 +229,16 @@ def extract_components(zipfile, opf_doc, opf_root)
}
end
+ def extract_chapters(zipfile, docs)
+ if @book.version >= 3 && !docs[:nav].nil?
+ extract_nav_chapters(zipfile, docs[:nav])
+ else
+ extract_ncx_chapters(zipfile, docs[:ncx])
+ end
+ end
- def extract_chapters(zipfile, ncx_doc)
+ # Epub < 3.0 only
+ def extract_ncx_chapters(zipfile, ncx_doc)
curse = lambda { |point|
chp = Peregrin::Chapter.new(
point.at_xpath('.//ncx:text', NAMESPACES[:ncx]).content,
@@ -232,6 +257,32 @@ def extract_chapters(zipfile, ncx_doc)
}
end
+ # Epub >= 3.0 only
+ def extract_nav_chapters(zipfile, nav_doc)
+ curse = lambda { |point, position|
+ chp = Peregrin::Chapter.new(
+ point.at_xpath('.//nav:a', NAMESPACES[:nav]).content,
+ position,
+ point.at_xpath('.//nav:a', NAMESPACES[:nav])['href']
+ )
+ ol = point.at_xpath('.//nav:ol', NAMESPACES[:nav])
+ ol.children.each { |pt|
+ next unless pt.element? && pt.name == "li"
+ position += 1
+ position, chapter = curse.call(pt, position)
+ chp.children.push chapter
+ } if ol
+ [position, chp]
+ }
+ position = 0
+ nav_doc.at_xpath("//nav:nav/nav:ol", NAMESPACES[:nav]).children.each { |pt|
+ next unless pt.element? && pt.name == "li"
+ position += 1
+ position, chapter = curse.call(pt, position)
+ @book.chapters.push chapter
+ }
+ end
+
def extract_cover(zipfile, docs)
@book.cover = nil
@@ -556,5 +607,6 @@ class NotAZipArchive < ValidationError; end
class FailureLoadingOCF < ValidationError; end
class FailureLoadingOPF < ValidationError; end
class FailureLoadingNCX < ValidationError; end
+ class FailureLoadingNAV < ValidationError; end
end
Binary file not shown.
@@ -100,12 +100,80 @@ def test_extracting_epub3_fixed_layout_properties
def test_extracting_version
epub = Peregrin::Epub.read("test/fixtures/epubs/epub3_fixed_layout.epub")
- assert_equal(epub.to_book.version, '3.0')
+ assert_equal(3.0, epub.to_book.version)
epub = Peregrin::Epub.read("test/fixtures/epubs/strunk.epub")
- assert_equal(epub.to_book.version, '2.0')
+ assert_equal(2.0, epub.to_book.version)
end
+ def test_extracting_chapters_from_ocx
+ epub = Peregrin::Epub.read("test/fixtures/epubs/strunk.epub")
+ assert_equal(9, epub.to_book.chapters.count)
+ assert_equal("Title", epub.to_book.chapters.first.title)
+ assert_equal("title.xml", epub.to_book.chapters.first.src)
+ assert_equal(1, epub.to_book.chapters.first.position)
+ assert_equal("Recommendations", epub.to_book.chapters.last.title)
+ assert_equal("similar.xml", epub.to_book.chapters.last.src)
+ assert_equal(27, epub.to_book.chapters.last.position)
+ end
+
+ def test_extracting_chapters_from_nav
+ epub = Peregrin::Epub.read("test/fixtures/epubs/epub3_fixed_layout.epub")
+ assert_equal(3, epub.to_book.chapters.count)
+ assert_equal("Images and Text", epub.to_book.chapters.first.title)
+ assert_equal("page01.xhtml", epub.to_book.chapters.first.src)
+ assert_equal(1, epub.to_book.chapters.first.position)
+ assert_equal("Dragons", epub.to_book.chapters.last.title)
+ assert_equal("page04.xhtml", epub.to_book.chapters.last.src)
+ assert_equal(3, epub.to_book.chapters.last.position)
+ end
+
+ def test_extracting_nested_chapters_from_nav
+ epub = Peregrin::Epub.read("test/fixtures/epubs/epub3_nested_nav.epub")
+ assert_equal(11, epub.to_book.chapters.count)
+ assert_equal(
+ ["EPUB 3.0 Specification",
+ "EPUB 3 Specifications - Table of Contents",
+ "Terminology",
+ "EPUB 3 Overview",
+ "EPUB Publications 3.0",
+ "EPUB Content Documents 3.0",
+ "EPUB Media Overlays 3.0",
+ "Acknowledgements and Contributors",
+ "References",
+ "EPUB Open Container Format (OCF) 3.0",
+ "EPUB 3 Changes from EPUB 2.0.1"],
+ epub.to_book.chapters.map(&:title)
+ )
+ assert_equal(
+ [1, 2, 3, 4, 30, 85, 184, 230, 231, 232, 265],
+ epub.to_book.chapters.map(&:position)
+ )
+ assert_equal(
+ ["1. Introduction",
+ "2. Features",
+ "3. Global Language Support",
+ "4. Accessibility"],
+ epub.to_book.chapters[3].children.map(&:title)
+ )
+ assert_equal(
+ [5, 8, 22, 29],
+ epub.to_book.chapters[3].children.map(&:position)
+ )
+ assert_equal(
+ ["3.1. Metadata",
+ "3.2. Content Documents",
+ "3.3. CSS",
+ "3.4. Fonts",
+ "3.5. Text-to-speech",
+ "3.6. Container"],
+ epub.to_book.chapters[3].children[2].children.map(&:title)
+ )
+ assert_equal(
+ [23, 24, 25, 26, 27, 28],
+ epub.to_book.chapters[3].children[2].children.map(&:position)
+ )
+ end
def test_read_epub_to_write_epub
epub = Peregrin::Epub.read("test/fixtures/epubs/strunk.epub")

0 comments on commit 46dbcdc

Please sign in to comment.