The big kahuna: handling footnotes once the epub has been converted to html

In [163]:
import re
import bs4

In [243]:
# Flip to True to see the diagnostic output of the parsing helpers
debug = False
# dprint is a drop-in for print(): when debugging is off it must swallow the same
# call shapes print() accepts, including keyword arguments such as sep= and end=
dprint = print if debug else (lambda *args, **kwargs: None)

In [244]:
# In-text footnote marker: a <sup> with id "t<N>" wrapping a markdown link to "#f<N>"
fn_template = '<sup id="t{fn_num}">[{fn_num}](#f{fn_num})</sup>'
# Footnote entry: numbered item whose <span> carries id/name "f<N>", followed by a
# markdown back-link to the in-text marker "#t<N>"
rfn_template = '{fn_num}. <span id="f{fn_num}" name="f{fn_num}">{fn_text}</span> [↩](#t{fn_num})'

In [245]:
# Sanity check: render the in-text marker for footnote 7
fn_template.format(fn_num=7)

'<sup id="t7">[7](#f7)</sup>'

In [246]:
# Sanity check: render a footnote entry with its back-link
rfn_template.format(fn_num=11, fn_text="Cool footnote.")

'11. <span id="f11" name="f11">Cool footnote.</span> [↩](#t11)'

In [247]:
# Single-character typographic punctuation (curly quotes, em dash, ellipsis) mapped to
# plain-ASCII replacements; applied character by character to extracted text
repl_dict = {
    '’': "'",
    '‘': "'",
    '“': '"',
    '”': '"',
    '—': '--',
    '…': '...'
}

In [248]:
# Read the whole converted book (a single HTML file) into memory as UTF-8 text
with open("killing_hope.html", "r", encoding="utf-8") as f:
    html_str = f.read()

In [249]:
# Parse with Python's built-in html.parser backend
soup = bs4.BeautifulSoup(html_str, 'html.parser')

In [250]:
#soup

I guess the first thing to do is split into [front matter], [main text], and [footnotes]

Actually, what I ultimately want is one file per chapter, where each file contains the chapter text and its footnotes, correctly [bidirectionally] linked to one another. So instead I'm going to build a list in which each element is the HTML of one chapter.

In [251]:
# Every element carrying the CSS class "toc" (presumably one per table-of-contents entry)
toc_elts = soup.find_all(class_="toc")

In [252]:
# Splits a TOC title of the form "<number>. <title>" into (number, title).
# Anchored at the start (leading whitespace tolerated) so that a "<digits>. " sequence
# occurring mid-title is not mistaken for a chapter number.
ch_reg = re.compile(r'^\s*([0-9]+)\. (.+)$')

In [253]:
# Build one record per table-of-contents entry:
#   ch_num      -- chapter number as a string, or None if the title has no "N. " prefix
#   ch_title    -- title text with typographic punctuation replaced via repl_dict
#   href        -- the raw link target ("<file>#<anchor>")
#   html_file   -- the part of href before "#"
#   anchor_link -- the part of href after "#" ('' when the href has no fragment)
#   next_anchor -- anchor_link of the following entry (absent on the last entry)
ch_info = []
for cur_elt in toc_elts:
    # The link is not reliably the first child: an anchor marking a new page (no href)
    # or a whitespace string can precede it. Search for the first <a> that has an href
    # instead of depending on child position.
    child = cur_elt.find('a', href=True)
    if child is None:
        raise ValueError(f"TOC entry contains no link: {cur_elt}")
    href = child.attrs['href']
    # partition() tolerates an href without a "#fragment" (anchor_link becomes '')
    html_file, _, anchor_link = href.partition("#")
    # Important: this entry's anchor is also where the previous chapter ends, so
    # record it as "next_anchor" on the previous entry's info
    if ch_info:
        ch_info[-1]['next_anchor'] = anchor_link
    # Replace typographic unicode chars in the title text
    text = "".join(repl_dict.get(c, c) for c in child.text)
    # Extract the chapter num if the title has one
    ch_num = None
    ch_title = text
    ch_reg_result = ch_reg.search(text)
    if ch_reg_result is not None:
        ch_num = ch_reg_result.group(1)
        # Update the title so it doesn't contain the ch num redundantly
        ch_title = ch_reg_result.group(2)
    ch_info.append({'ch_num':ch_num, 'ch_title': ch_title, 'href':href, 'html_file':html_file, 'anchor_link':anchor_link})

In [254]:
#ch_info

Cool. Now we need to loop over these individual links, parse the HTML files they point to, handle the footnotes, and export .md files.

In [255]:
# Prototype on a single TOC entry first; index 10 is the entry displayed below
cur_ch_info = ch_info[10]
cur_ch_info

{'ch_num': '9',
 'ch_title': 'Iran 1953: Making it safe for the King of Kings',
 'href': 'part0013.html#ch9',
 'html_file': 'part0013.html',
 'anchor_link': 'ch9',
 'next_anchor': 'ch10'}

In [256]:
# Locate the chapter's heading element through the anchor id recorded for the selected
# TOC entry, so this cell keeps working when a different entry is chosen
ch_start_elt = soup.find(id=cur_ch_info['anchor_link'])
ch_start_elt

<p class="title1" id="ch9"><a class="calibre1" href="part0002.html#rch9">9. Iran 1953</a></p>

In [257]:
# Now I guess we go through siblings until we hit next_anchor?

In [258]:
# Collect everything that follows the chapter heading, stopping at the element whose id
# is the next chapter's anchor. The last TOC entry has no 'next_anchor'; in that case
# every remaining sibling is collected.
next_anchor = cur_ch_info.get('next_anchor')
ch_contents = []
for cur_ch_elt in ch_start_elt.next_siblings:
    if isinstance(cur_ch_elt, bs4.element.Tag):
        # .get() returns None for an element without an id, so no KeyError handling needed
        if next_anchor is not None and cur_ch_elt.get('id') == next_anchor:
            break
        ch_contents.append(cur_ch_elt)
    elif cur_ch_elt != '\n':
        # A NavigableString that is more than a bare line break: keep it.
        # (Compare the string itself -- repr() of a line break is the quoted escape
        # sequence and never equals '\n'.)
        ch_contents.append(cur_ch_elt)

In [259]:
# Discard any remaining strings that consist of exactly one line break
ch_contents = list(filter(lambda node: node != '\n', ch_contents))

In [260]:
# NOTE(review): not referenced by any of the cells visible here -- confirm it is still needed
md_contents = []

In [261]:
def parse_footnote(fn_elt):
    """Convert an in-text footnote element to the notebook's footnote marker.

    The first two children of ``fn_elt`` are read in order: the first must expose
    an ``id`` attribute (the return anchor), the second an ``href`` attribute and
    a text that parses as an integer (the link to the footnote).

    Returns a tuple ``(new_fn_html, fn_href, return_id)`` where ``new_fn_html`` is
    ``fn_template`` rendered with the footnote number.
    """
    kids = iter(fn_elt.children)

    # Child 1: the return anchor
    return_id = next(kids).attrs['id']
    dprint(f"return_id = {return_id}")

    # Child 2: the link to the footnote itself
    link = next(kids)
    fn_href = link.attrs['href']
    dprint(f"fn_href = {fn_href}")

    # The link text is expected to be the footnote number
    fn_num = int(link.text)
    dprint(f"fn_num = {fn_num}")

    # Render the marker in our md format
    return fn_template.format(fn_num=fn_num), fn_href, return_id

In [274]:
# Matches "ch" immediately followed by one or two digits and captures the digits
# (the form of the chapter anchor ids, e.g. "ch9")
chid_reg = re.compile(r'ch([0-9]{1,2})')

In [279]:
def parse_paragraph(p_elt):
    """Convert one paragraph element to markdown text.

    Handling of the paragraph's direct children:
      * ``<a>``    -- replaced by its text
      * ``<sup>``  -- treated as a footnote and converted by ``parse_footnote``
      * ``<i>``    -- wrapped in ``*...*``
      * ``<span>`` -- emits the blockquote prefix ``"> "``
      * plain strings are copied through

    If any of the paragraph's classes contains "title", the text is prefixed with
    ``"## "`` and, when ``chid_reg`` finds a chapter id in the element's HTML, with
    ``"<num>. "`` (unless the text already starts with that number).

    Returns ``(markdown_text, fn_info)`` where ``fn_info`` is a list of
    ``(fn_html, fn_href, return_id)`` tuples, one per footnote, in document order.

    Raises ``Exception`` for a child tag or node type that is not handled.
    """
    pconv_buffer = ""
    fn_info = []
    # See if it's a header. `class` is optional, so a bare <p> must not raise KeyError.
    p_class = " ".join(p_elt.attrs.get('class', []))
    if "title" in p_class:
        # Make it markdown header text
        pconv_buffer += "## "
        # And see if we can extract the chapter num as well
        ch_num_result = chid_reg.search(str(p_elt))
        if ch_num_result is not None:
            ch_prefix = f"{ch_num_result.group(1)}. "
            # Heading text such as "9. Iran 1953" already carries the number;
            # don't emit it twice.
            if not p_elt.get_text().lstrip().startswith(ch_prefix):
                pconv_buffer += ch_prefix
    for sub_elt in p_elt.children:
        # Children are either Tags or NavigableStrings
        if type(sub_elt) is bs4.element.Tag:
            if sub_elt.name == "a":
                # A link! Just extract the text
                pconv_buffer += sub_elt.text
            elif sub_elt.name == "sup":
                # A footnote!
                fn_html, fn_href, return_id = parse_footnote(sub_elt)
                pconv_buffer += fn_html
                fn_info.append((fn_html, fn_href, return_id))
            elif sub_elt.name == "i":
                # Just italics
                pconv_buffer += f"*{sub_elt.text}*"
            elif sub_elt.name == "span":
                # This seems to be how it does blockquotes.
                # NOTE(review): any text inside the span is not copied to the output --
                # confirm these spans are always empty markers.
                pconv_buffer += "> "
            else:
                raise Exception(f"Unhandled tag <{sub_elt.name}>: {sub_elt}")
        elif type(sub_elt) is bs4.element.NavigableString:
            pconv_buffer += str(sub_elt)
        else:
            raise Exception(f"smth bad happened: unexpected node type {type(sub_elt).__name__}")
    # The final step: replacing the typographic unicode chars
    pconv_buffer = "".join(repl_dict.get(c, c) for c in pconv_buffer)
    return pconv_buffer, fn_info

In [280]:
# Run every <p> element of the chapter through parse_paragraph; anything that is not a
# paragraph is left out. Each entry is the (markdown_text, fn_info) pair it returns.
parsed_elts = [
    parse_paragraph(node)
    for node in ch_contents
    if node.name == "p"
]

In [281]:
#parsed_elts
# Keep only the markdown text of each parsed paragraph (the footnote info is not used
# here) and separate paragraphs with a blank line
md_lines = [md_text for md_text, _fn_info in parsed_elts]
paragraph_sep = "\n\n"
md_str = paragraph_sep.join(md_lines)

In [282]:
# Write the converted chapter text to a scratch markdown file for inspection
with open("test_out.md", "w", encoding="utf-8") as g:
    g.write(md_str)