Skip to content

Commit

Permalink
current
Browse files Browse the repository at this point in the history
  • Loading branch information
interrogator committed Dec 12, 2019
1 parent cadd273 commit ce14bab
Showing 1 changed file with 13 additions and 11 deletions.
24 changes: 13 additions & 11 deletions scripts/epub.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,17 @@
import os
import sys
import urllib
import xml.parsers.expat
import zipfile
from glob import glob

import xml.parsers.expat
import html2text
from glob import glob


def make_safe_name(name):
    """Return a lowercase, filesystem-friendly slug derived from *name*.

    Spaces become hyphens, the text is lowercased, and every character that
    is not alphanumeric (per ``str.isalnum``), a hyphen, or an underscore is
    dropped.

    The slug is returned as-is rather than percent-encoded: it is used to
    build file and directory names, where ``%XX`` escapes for non-ASCII
    letters would only make the names unreadable.

    Args:
        name: Human-readable title (e.g. a chapter or part name).

    Returns:
        The sanitized slug; an empty string if nothing survives filtering.
    """
    # Lowercase once up front; filtering only removes characters, so the
    # result is still lowercase and needs no second pass.
    lowered = name.replace(" ", "-").lower()
    return "".join(ch for ch in lowered if ch.isalnum() or ch in {"-", "_"})


class ContainerParser:
Expand Down Expand Up @@ -169,15 +169,18 @@ def convert(epub):

html_parser = html2text.HTML2Text()
html_parser.body_width = 0 # no shitty wrapping
html_parser.ignore_images = True
html_parser.ignore_links = True
chapter_number = 0
part_number = 0
part_paths = []

not_chapters = {"copyright", "cover"}
not_chapters = {"copyright", "cover", "contents", "editor's note", "editors' note", "editor’s note", title.lower()}

# iterate over components
for t in toc:
# make folder for each part
print("T", t.content)
if "epub_p" in t.content:
part_number += 1
part_name = t.text.strip()
Expand All @@ -189,26 +192,25 @@ def convert(epub):
meta.update(dict(part_name=part_name, part_number=part_number))
part_paths.append(part_path)
# make file containing chapter
elif "epub_c" in t.content:
chapter_name = t.text.strip()
elif "epub_c" in t.content or "-h-" in t.content:
chapter_name = t.text.strip().strip('.')
if chapter_name.lower() in not_chapters:
continue
chapter_number += 1
numfilled = str(chapter_number).zfill(3)
safe_name = make_safe_name(chapter_name)
meta.update(dict(chapter_name=chapter_name, chapter_number=chapter_number))
meta_string = make_meta_element(meta)
chapter_path = f"{numfilled}-{safe_name}"
if part_paths:
part_path = part_paths[-1]
chapter_path = f"{numfilled}-{safe_name}"
chapter_path = os.path.join(part_path, chapter_path + ".txt")
chapter_path = os.path.join(part_paths[-1], chapter_path + ".txt")
else:
part_path = None
chapter_path = os.path.join(outdir, chapter_name)
chapter_path = os.path.join(outdir, chapter_path + ".txt")

html = file.read(ops + t.content.split("#")[0])
# todo: split out the chapter title, or no
text = html_parser.handle(html.decode("utf-8"))
print(text[:1000])

with open(chapter_path, "w") as fo:
fo.write(meta_string + "\n")
Expand Down

0 comments on commit ce14bab

Please sign in to comment.