Skip to content

Commit

Permalink
update epub
Browse files Browse the repository at this point in the history
  • Loading branch information
interrogator committed Dec 12, 2019
1 parent d7d3f8a commit 8df869a
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 9 deletions.
45 changes: 36 additions & 9 deletions scripts/epub.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import json
import os
import sys
import urllib
Expand Down Expand Up @@ -80,12 +81,14 @@ def parseBook(self):
parser.EndElementHandler = self.endElement
parser.CharacterDataHandler = self.characters
parser.Parse(self.xml, 1)
print('NC', self.ncx)
return self.title, self.author, self.ncx


class NavPoint:
def __init__(self, id=None, playorder=None, level=0, content=None, text=None):
def __init__(self, id=None, playorder=None, level=0, content=None, text=None, classy=None):
self.id = id
self.cls = classy
self.content = content
self.playorder = playorder
self.level = level
Expand All @@ -103,7 +106,7 @@ def __init__(self, xmlcontent=None):
def startElement(self, name, attributes):
if name == "navPoint":
level = len(self.stack)
self.currentNP = NavPoint(attributes["id"], attributes["playOrder"], level)
self.currentNP = NavPoint(attributes["id"], attributes["playOrder"], level, classy=attributes.get("class"))
self.stack.append(self.currentNP)
self.toc.append(self.currentNP)
elif name == "content":
Expand Down Expand Up @@ -140,6 +143,7 @@ def make_meta_element(metadata):
meta += f"{k.replace('_', '-')}={v} "
return meta + "/>"


def post_process(text):
out = []
header = False
Expand All @@ -152,15 +156,20 @@ def post_process(text):
if header:
if "</meta" in line:
header = False
latest = out[-1]
out[-1] += line
else:
out.append(line)

return '\n'.join(out)


def convert(epub):
def convert(epub, metafile=None):

text_meta = dict()
if metafile:
with open(metafile, "r") as fo:
text_meta = json.loads(fo.read())

print("Processing %s ..." % epub)
# open zip
file = zipfile.ZipFile(epub, "r")
Expand All @@ -180,7 +189,10 @@ def convert(epub):
toc = TocParser(file.read(ops + ncx)).parseToc()

# make corpus directory
outdir = os.path.join('out', make_safe_name(title))
safe_title = make_safe_name(title)
if metafile and safe_title in text_meta:
meta.update(text_meta[safe_title])
outdir = os.path.join('out', safe_title)
os.makedirs(outdir)

# hold data in here
Expand All @@ -196,10 +208,22 @@ def convert(epub):

not_chapters = {"copyright", "cover", "contents", "editor's note", "editors' note", "editor’s note", title.lower()}

is_part = {"epub3_p", "epub_p"}
is_chapter = {"epub_c", "epub3_c", "-h-", "index_split_"}

has_no_chapters = True
for t in toc:
if not t.level:
continue
if t.content.startswith("ch") or any(i in t.content for i in is_chapter):
has_no_chapters = False
break

# iterate over components
for t in toc:
print("ID", t.level, t.cls, t.content, t.id)
# make folder for each part
if "epub_p" in t.content:
if t.content.startswith("part") or any(i in t.content for i in is_part) and "epub_prl" not in t.content:
part_number += 1
part_name = t.text.strip()
numfilled = str(part_number).zfill(3)
Expand All @@ -210,7 +234,7 @@ def convert(epub):
meta.update(dict(part_name=part_name, part_number=part_number))
part_paths.append(part_path)
# make file containing chapter
elif "epub_c" in t.content or "-h-" in t.content:
if t.content.startswith("ch") or any(i in t.content for i in is_chapter) or has_no_chapters or t.cls == "chapter":
chapter_name = t.text.strip().strip('.')
if chapter_name.lower() in not_chapters:
continue
Expand All @@ -230,13 +254,16 @@ def convert(epub):
text = html_parser.handle(html.decode("utf-8"))

text = post_process(text)
# print('TEXT', text[:1000])

with open(chapter_path, "w") as fo:
fo.write(meta_string + "\n")
fo.write(text + "\n")


if __name__ == "__main__":
    # Command-line entry point.
    #
    # Usage: epub.py [METAFILE|none] EPUB_GLOB
    #
    # The glob pattern selecting the epub files is always the LAST argument.
    # An optional leading argument names a JSON metadata file that is passed
    # through to convert(); the literal string "none" disables it.
    cli_args = sys.argv[1:]
    if not cli_args:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit("usage: epub.py [METAFILE|none] EPUB_GLOB")

    filenames = glob(cli_args[-1])

    # Only treat the first argument as a metadata file when it is distinct
    # from the glob pattern; with a single argument there is no metafile
    # (previously the pattern itself was handed to convert() as the metafile).
    metafile = cli_args[0] if len(cli_args) > 1 else None
    if metafile == "none":
        metafile = None

    # Convert each matching epub exactly once.
    for filename in filenames:
        convert(filename, metafile)
20 changes: 20 additions & 0 deletions scripts/rushdie.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@

{
"fury": {"year": "2001"},
"grimus": {"year": "1975"},
"haroun-and-the-sea-of-stories-puffin-books": {"year": "1990"},
"imaginary-homelands": {"year": "1992"},
"joseph-anton": {"year": "2012"},
"luka-and-the-fire-of-life": {"year": "2010"},
"midnights-children": {"year": "1981"},
"shalimar-the-clown": {"year": "2005"},
"shame": {"year": "1983"},
"step-across-this-line": {"year": "2002"},
"the-duniazát": {"year": "2015"},
"the-enchantress-of-florence": {"year": "2008"},
"the-golden-house": {"year": "2017"},
"the-ground-beneath-her-feet": {"year": "1999"},
"the-jaguar-smile": {"year": "1987"},
"the-moors-last-sigh": {"year": "1995"},
"two-years-eight-months-and-twenty-eight-nights": {"year": "2015"}
}

0 comments on commit 8df869a

Please sign in to comment.