Permalink
Cannot retrieve contributors at this time
245 lines (201 sloc)
7.82 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| __license__ = 'GPL v3' | |
| __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>' | |
| ''' | |
| Code for the conversion of ebook formats and the reading of metadata | |
| from various formats. | |
| ''' | |
| import os, re, numbers, sys | |
| from calibre import prints | |
| from calibre.ebooks.chardet import xml_to_unicode | |
class ConversionError(Exception):
    '''
    Exception carrying a message plus an ``only_msg`` flag.

    ``only_msg`` is stored on the instance unchanged; how it is interpreted
    is up to the code that catches the exception.
    '''

    def __init__(self, msg, only_msg=False):
        super().__init__(msg)
        self.only_msg = only_msg
class UnknownFormatError(Exception):
    '''Plain Exception subclass; it adds no behaviour of its own.'''
class DRMError(ValueError):
    '''ValueError subclass; it adds no behaviour of its own.'''
class ParserError(ValueError):
    '''ValueError subclass; it adds no behaviour of its own.'''
# File extensions, lower case and without the leading dot.
# NOTE(review): presumably the set of extensions the rest of the code base
# treats as book files -- confirm against the users of this list.
BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'text', 'htm', 'xhtm',
                   'html', 'htmlz', 'xhtml', 'pdf', 'pdb', 'updb', 'pdr', 'prc', 'mobi', 'azw', 'doc',
                   'epub', 'fb2', 'fbz', 'djv', 'djvu', 'lrx', 'cbr', 'cb7', 'cbz', 'cbc', 'oebzip',
                   'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'pmlz', 'mbp', 'tan', 'snb',
                   'xps', 'oxps', 'azw4', 'book', 'zbf', 'pobi', 'docx', 'docm', 'md',
                   'textile', 'markdown', 'ibook', 'ibooks', 'iba', 'azw3', 'ps', 'kepub', 'kfx', 'kpf']
def return_raster_image(path):
    '''
    Return the raw bytes of the file at ``path`` when it is a raster image.

    Returns None when the file is not readable, or when the image type
    detector reports either no type at all or ``'svg'``.
    '''
    from calibre.utils.imghdr import what
    if not os.access(path, os.R_OK):
        return None
    with open(path, 'rb') as src:
        data = src.read()
    if what(None, data) in (None, 'svg'):
        return None
    return data
def extract_cover_from_embedded_svg(html, base, log):
    '''
    Return the raster image referenced by a lone ``<svg:image>`` in ``html``.

    The markup qualifies only when it contains exactly one ``svg:svg``
    element whose single child is an SVG ``image`` element carrying a
    non-empty xlink ``href``. The href is resolved relative to ``base``
    (split on ``/``) and passed to return_raster_image(). In every other
    case None is returned. ``log`` is accepted but not used here.
    '''
    from calibre.ebooks.oeb.base import XPath, SVG, XLINK
    from calibre.utils.xml_parse import safe_xml_fromstring
    tree = safe_xml_fromstring(html)
    svg_nodes = XPath('//svg:svg')(tree)
    if len(svg_nodes) != 1 or len(svg_nodes[0]) != 1:
        return None
    image = svg_nodes[0][0]
    if image.tag != SVG('image'):
        return None
    href = image.get(XLINK('href'), None)
    if not href:
        return None
    return return_raster_image(os.path.join(base, *href.split('/')))
def extract_calibre_cover(raw, base, log):
    '''
    Try to pull a cover image out of a simple HTML cover page.

    Nothing is extracted (None is returned) when the document contains any
    heading, ``p``, ``span``, ``font`` or ``br`` tag. Otherwise two attempts
    are made, in order:

    1. the document has exactly one ``<img src=...>`` and its ``alt``
       attribute is ``cover`` (case-insensitive);
    2. the ``<body>`` has no non-whitespace text and exactly one
       ``<img src=...>``.

    Image paths are resolved relative to ``base`` and read through
    return_raster_image(). ``log`` is accepted but not used here.
    '''
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(raw)
    text_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span',
                 'font', 'br']
    matches = soup.find(name=text_tags)
    images = soup.findAll('img', src=True)
    if matches is not None:
        return None

    if len(images) == 1 and images[0].get('alt', '').lower() == 'cover':
        cover_path = os.path.join(base, *images[0]['src'].split('/'))
        data = return_raster_image(cover_path)
        if data is not None:
            return data

    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    body = soup.find('body')
    if body is None:
        return None
    body_text = ''.join(map(str, body.findAll(text=True)))
    if body_text.strip():
        # Body has text, abort
        return None
    body_images = body.findAll('img', src=True)
    if len(body_images) != 1:
        return None
    return return_raster_image(
        os.path.join(base, *body_images[0]['src'].split('/')))
def render_html_svg_workaround(path_to_html, log, width=590, height=750):
    '''
    Return image data for the HTML file at ``path_to_html``.

    The file is read and decoded with xml_to_unicode(). Cheap extraction is
    tried first: extract_cover_from_embedded_svg() (only when the SVG
    namespace string occurs in the markup), then extract_calibre_cover().
    Any exception raised by an extractor is deliberately ignored. If neither
    yields data, the result of render_html_data() is returned instead.
    '''
    from calibre.ebooks.oeb.base import SVG_NS
    with open(path_to_html, 'rb') as f:
        raw = f.read()
    raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
    base_dir = os.path.dirname(path_to_html)

    extractors = []
    if SVG_NS in raw:
        extractors.append(extract_cover_from_embedded_svg)
    extractors.append(extract_calibre_cover)

    data = None
    for extract in extractors:
        try:
            data = extract(raw, base_dir, log)
        except Exception:
            # Best effort: a failing extractor just means we try the next one
            continue
        if data is not None:
            break

    if data is None:
        data = render_html_data(path_to_html, width, height)
    return data
def render_html_data(path_to_html, width, height):
    '''
    Render ``path_to_html`` via a forked job and return the JPEG bytes.

    fork_job() is called for ``'calibre.ebooks.render_html'`` / ``'main'``
    with a temporary directory as output location. When the job reports a
    truthy ``'result'``, the contents of ``rendered.jpeg`` in that directory
    are returned. On a WorkerError, or a falsy ``'result'``, diagnostics are
    printed to stderr and None is returned. ``width`` and ``height`` are
    accepted but not used in this function.
    '''
    from calibre.ptempfile import TemporaryDirectory
    from calibre.utils.ipc.simple_worker import fork_job, WorkerError
    result = {}

    def report_error(text=''):
        # Reads ``result`` from the enclosing scope, so it sees whatever
        # fork_job() returned (or the initial empty dict if it raised).
        prints('Failed to render', path_to_html, 'with errors:', file=sys.stderr)
        if text:
            prints(text, file=sys.stderr)
        output_path = result['stdout_stderr'] if result else None
        if output_path:
            with open(output_path, 'rb') as captured:
                prints(captured.read(), file=sys.stderr)

    with TemporaryDirectory('-render-html') as workdir:
        try:
            result = fork_job('calibre.ebooks.render_html', 'main', args=(path_to_html, workdir, 'jpeg'))
        except WorkerError as err:
            report_error(err.orig_tb)
            return None
        if not result['result']:
            report_error()
            return None
        with open(os.path.join(workdir, 'rendered.jpeg'), 'rb') as rendered:
            return rendered.read()
def check_ebook_format(stream, current_guess):
    '''
    Refine ``current_guess`` by sniffing the first bytes of ``stream``.

    Only guesses of prc/mobi/azw/azw1/azw3 (case-insensitive) are checked:
    the stream is rewound, its first three bytes are read, and it is rewound
    again. ``'tpz'`` is returned when those bytes are ``b'TPZ'``; otherwise
    ``current_guess`` is returned unchanged. For any other guess the stream
    is not touched.
    '''
    sniffable = ('prc', 'mobi', 'azw', 'azw1', 'azw3')
    if current_guess.lower() not in sniffable:
        return current_guess
    stream.seek(0)
    magic = stream.read(3)
    stream.seek(0)
    return 'tpz' if magic == b'TPZ' else current_guess
def normalize(x):
    '''
    Return ``x`` in Unicode NFC form when it is a ``str``.

    Any other object (None, bytes, numbers, ...) is returned unchanged.
    '''
    if not isinstance(x, str):
        return x
    import unicodedata
    return unicodedata.normalize('NFC', x)
def calibre_cover(title, author_string, series_string=None,
        output_format='jpg', title_size=46, author_size=36, logo_path=None):
    '''
    Build a cover with calibre_cover2() and return it encoded as
    ``output_format``.

    The three text arguments are passed through normalize() first; a falsy
    author or series string is replaced by ``''``. ``title_size`` and
    ``author_size`` are accepted but not used in this function.
    '''
    title, author_string, series_string = (
        normalize(text) for text in (title, author_string, series_string))
    from calibre.ebooks.covers import calibre_cover2
    from calibre.utils.img import image_to_data
    authors = author_string or ''
    series = series_string or ''
    image = calibre_cover2(title, authors, series, logo_path=logo_path, as_qimage=True)
    return image_to_data(image, fmt=output_format)
# Matches "<number><optional whitespace><unit>". The number part is lenient:
# it also accepts strings with no digits at all (e.g. "-" or "."), so callers
# must be prepared for float() to reject group(1).
UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc|rem|q)$')


def unit_convert(value, base, font, dpi, body_font_size=12):
    '''
    Return value in pts.

    :param value: A number (returned unchanged), a bare numeric string
        (treated as pixels) or a string with one of the units accepted by
        UNIT_RE.
    :param base: Reference length used for ``%`` values.
    :param font: Font size used for ``em``, ``ex`` and ``en`` values.
    :param dpi: Dots per inch, used to convert pixels.
    :param body_font_size: Font size used for ``rem`` values.
    :return: The converted number, or ``value`` itself when it cannot be
        interpreted as a length (for example ``'auto'`` or ``'-px'``).
    '''
    if isinstance(value, numbers.Number):
        return value
    try:
        # A unit-less numeric string is interpreted as pixels
        return float(value) * 72.0 / dpi
    except Exception:
        # Deliberately best effort: fall through to the unit parsing below.
        # Unlike a bare except this lets KeyboardInterrupt/SystemExit through.
        pass
    result = value
    m = UNIT_RE.match(value)
    if m is not None and m.group(1):
        try:
            value = float(m.group(1))
        except ValueError:
            # UNIT_RE accepts digit-less number parts such as '-' or '.';
            # treat those like any other unparseable value.
            return result
        unit = m.group(2)
        if unit == '%':
            result = (value / 100.0) * base
        elif unit == 'px':
            result = value * 72.0 / dpi
        elif unit == 'in':
            result = value * 72.0
        elif unit == 'pt':
            result = value
        elif unit == 'em':
            result = value * font
        elif unit in ('ex', 'en'):
            # This is a hack for ex since we have no way to know
            # the x-height of the font
            result = value * font * 0.5
        elif unit == 'pc':
            result = value * 12.0
        elif unit == 'mm':
            result = value * 2.8346456693
        elif unit == 'cm':
            result = value * 28.346456693
        elif unit == 'rem':
            result = value * body_font_size
        elif unit == 'q':
            result = value * 0.708661417325
    return result
def parse_css_length(value):
    '''
    Split a CSS length into ``(number, unit)``.

    :param value: The string to parse. Non-string input is tolerated.
    :return: ``(float, unit)`` with the unit lower-cased, or ``(None, None)``
        when ``value`` is not a string, does not match UNIT_RE, or has a
        number part that float() rejects (UNIT_RE accepts digit-less number
        parts such as ``'-'`` or ``'.'``).
    '''
    try:
        m = UNIT_RE.match(value)
    except TypeError:
        return None, None
    if m is not None and m.group(1):
        try:
            number = float(m.group(1))
        except ValueError:
            # e.g. '-px' or '--5px': matched by the regex but not a number
            return None, None
        return number, m.group(2).lower()
    return None, None
def generate_masthead(title, output_path=None, width=600, height=60):
    '''
    Generate a masthead image for ``title``.

    Thin wrapper around ``calibre.ebooks.covers.generate_masthead`` that
    supplies ``font_family`` from the ``'masthead_font'`` entry of the
    ``'mobi_output'`` defaults (None when the entry is absent).
    '''
    from calibre.ebooks.conversion.config import load_defaults
    font_family = load_defaults('mobi_output').get('masthead_font', None)
    # Aliased so the imported implementation is not confused with this wrapper
    from calibre.ebooks.covers import generate_masthead as render_masthead
    return render_masthead(
        title, output_path=output_path, width=width, height=height,
        font_family=font_family)
def escape_xpath_attr(value):
    '''
    Return ``value`` as a quoted XPath string expression.

    A value without double quotes is wrapped in double quotes; a value with
    double quotes but no single quotes is wrapped in single quotes. A value
    containing both is split into runs of double quotes and everything else,
    each piece is quoted with whichever quote character it does not contain,
    and the pieces are combined with ``concat(...)``.
    '''
    if '"' not in value:
        return '"%s"' % value
    if "'" not in value:
        return "'%s'" % value

    def quote(piece):
        delimiter = "'" if '"' in piece else '"'
        return delimiter + piece + delimiter

    pieces = (quote(piece) for piece in re.split('("+)', value) if piece)
    return 'concat(%s)' % ', '.join(pieces)