Skip to content

Commit

Permalink
post process to fix headers
Browse files Browse the repository at this point in the history
  • Loading branch information
interrogator committed Dec 12, 2019
1 parent ce14bab commit d7d3f8a
Showing 1 changed file with 22 additions and 3 deletions.
25 changes: 22 additions & 3 deletions scripts/epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,25 @@ def make_meta_element(metadata):
meta += f"{k.replace('_', '-')}={v} "
return meta + "/>"

def post_process(text):
    """Collapse multi-line ``<meta header=true ...>`` blocks onto one line.

    The text is scanned line by line. A line containing
    ``<meta header=true`` starts a header; it is emitted with an extra
    leading newline so that a blank line precedes it in the result. Every
    following line, up to and including the first one containing ``</meta``,
    is concatenated directly onto that header line (no separator is
    inserted). All other lines are passed through unchanged.

    Args:
        text: The converted chapter text to clean up.

    Returns:
        The processed text, with output lines joined by ``"\\n"``.
    """
    out = []
    in_header = False
    for line in text.splitlines():
        if "<meta header=true" in line:
            out.append("\n" + line)
            # A header that also closes on this line is already complete;
            # only stay in header mode when the closing tag is still to come,
            # otherwise the following body lines would be glued onto it.
            in_header = "</meta" not in line
            continue
        if in_header:
            if "</meta" in line:
                in_header = False
            # Continuation (or closing) line of the header: merge it into
            # the header line that was appended last.
            out[-1] += line
        else:
            out.append(line)

    return '\n'.join(out)


def convert(epub):
print("Processing %s ..." % epub)
Expand All @@ -161,7 +180,7 @@ def convert(epub):
toc = TocParser(file.read(ops + ncx)).parseToc()

# make corpus directory
outdir = make_safe_name(title)
outdir = os.path.join('out', make_safe_name(title))
os.makedirs(outdir)

# hold data in here
Expand All @@ -180,7 +199,6 @@ def convert(epub):
# iterate over components
for t in toc:
# make folder for each part
print("T", t.content)
if "epub_p" in t.content:
part_number += 1
part_name = t.text.strip()
Expand Down Expand Up @@ -210,7 +228,8 @@ def convert(epub):
html = file.read(ops + t.content.split("#")[0])
# TODO: decide whether to split out the chapter title
text = html_parser.handle(html.decode("utf-8"))
print(text[:1000])

text = post_process(text)

with open(chapter_path, "w") as fo:
fo.write(meta_string + "\n")
Expand Down

0 comments on commit d7d3f8a

Please sign in to comment.