The big kahuna: handling footnotes once the epub has been converted to html

In [2]:
import re
import bs4

In [3]:
# Flip to True to get verbose tracing from the parsing helpers.
debug = False


def _noop(*args, **kwargs):
    """Stand-in for print() that accepts the same arguments and does nothing."""
    return None


# dprint takes print()'s full call signature in both modes, so a call such as
# dprint("x", end="") behaves the same whether or not debugging is on
# (the previous no-op only accepted positional arguments).
dprint = print if debug else _noop

In [122]:
# Inline marker placed in the chapter text: a superscript number that links
# down to the note (#f<n>) and is itself the return target (id t<n>).
fn_template = (
    '<sup id="t{fn_num}">'
    '[{fn_num}](#f{fn_num})'
    '</sup>'
)

# Entry placed in the notes list: the note text wrapped in a span that is the
# link target (id/name f<n>), followed by a back-link to the marker (#t<n>).
#
# There is deliberately no leading "{fn_num}. " here: in Killing Hope the note
# text already contains its own number, so adding one would print it twice.
# The numbered form would be:
#   '{fn_num}. <span id="f{fn_num}" name="f{fn_num}">{fn_text}</span> [↩](#t{fn_num})'
rfn_template = (
    '<span id="f{fn_num}" name="f{fn_num}">'
    '{fn_text}'
    '</span>'
    ' [↩](#t{fn_num})'
)

In [5]:
# Sanity check: render the in-text marker for footnote 7.
fn_template.format(fn_num=7)

'<sup id="t7">[7](#f7)</sup>'

In [6]:
# Sanity check: render a notes-list entry for footnote 11.
# NOTE(review): the saved output of this cell begins with "11. ", which the
# current rfn_template does not emit -- it appears to predate the switch to the
# un-numbered template. Re-run the cell to refresh it.
rfn_template.format(fn_num=11, fn_text="Cool footnote.")

'11. <span id="f11" name="f11">Cool footnote.</span> [↩](#t11)'

In [7]:
# Typographic characters mapped to their plain-ASCII stand-ins. The keys are
# spelled as escapes so the table does not depend on how the glyphs render.
repl_dict = dict([
    ('\u2019', "'"),    # right single quote / apostrophe
    ('\u2018', "'"),    # left single quote
    ('\u201c', '"'),    # left double quote
    ('\u201d', '"'),    # right double quote
    ('\u2014', '--'),   # em dash
    ('\u2026', '...'),  # horizontal ellipsis
])

In [8]:
# Read the whole converted book into memory as a single UTF-8 string.
with open("killing_hope.html", mode="r", encoding="utf-8") as html_fh:
    html_str = html_fh.read()

In [9]:
# Parse the book once; every later lookup (TOC, chapters, footnotes) goes
# through this one tree.
soup = bs4.BeautifulSoup(html_str, 'html.parser')

In [10]:
#soup

I guess the first thing to do is split into [front matter], [main text], and [footnotes]

Actually... In the end I want one file per chapter, where each file contains the chapter text and its footnotes, correctly [bidirectionally] linked to one another. So instead, here I'm going to try to get a list where each element is the html of a chapter.

In [11]:
# Every element carrying the "toc" class; the next cell reads one link out of each.
toc_elts = soup.find_all(class_="toc")

In [13]:
# NOTE(review): ch_reg is used below but is not defined in any cell of this
# notebook as saved (execution count 12 is missing), so a fresh
# Restart & Run All would stop here with a NameError. The pattern below is a
# reconstruction from the recorded output further down (link text of the form
# "9. Iran 1953: ..." giving ch_num '9' and the rest as the title) --
# confirm it against the original definition.
ch_reg = re.compile(r'^\s*([0-9]+)\.\s+(.+)')

# One dict per table-of-contents entry, in document order:
#   ch_num      -- chapter number as a string, or None for unnumbered entries
#   ch_title    -- link text with typographic punctuation replaced and any
#                  leading "N. " removed
#   href        -- the raw link target, e.g. "part0013.html#ch9"
#   html_file   -- the part of href before '#'
#   anchor_link -- the part of href after '#' (None if href has no fragment)
#   next_anchor -- anchor_link of the following entry; absent on the last one
ch_info = []
for cur_elt in toc_elts:
    # The link is normally the only child, but a page-marker anchor (or a bare
    # text node) can come first, so take the first child tag that has an href.
    link = next(
        (c for c in cur_elt.children
         if isinstance(c, bs4.element.Tag) and 'href' in c.attrs),
        None,
    )
    if link is None:
        raise ValueError(f"no link found in TOC entry: {cur_elt}")
    href = link.attrs['href']
    # "part0013.html#ch9" -> "part0013.html", "ch9". partition() never raises,
    # so an href without a fragment yields anchor_link=None instead of crashing.
    html_file, _, fragment = href.partition("#")
    anchor_link = fragment or None
    # Important: this anchor is also where the *previous* chapter ends.
    if ch_info:
        ch_info[-1]['next_anchor'] = anchor_link
    # Replace the typographic characters listed in repl_dict with ASCII ones.
    text = "".join(repl_dict.get(c, c) for c in link.text)
    # Split off the chapter number if the entry has one, so the title does not
    # repeat it.
    ch_num, ch_title = None, text
    ch_reg_result = ch_reg.search(text)
    if ch_reg_result is not None:
        ch_num = ch_reg_result.group(1)
        ch_title = ch_reg_result.group(2)
    ch_info.append({'ch_num': ch_num, 'ch_title': ch_title, 'href': href,
                    'html_file': html_file, 'anchor_link': anchor_link})

In [14]:
#ch_info

Cool. So now we have to basically loop over these individual links, parse the html files they link to, handle the footnotes, and export .md files

In [15]:
# Develop against a single chapter first. Index 10 is the TOC entry shown in
# the output below (chapter 9).
# NOTE(review): this is a positional index into the TOC list, not a chapter
# number -- it will point elsewhere if the TOC contents change.
cur_ch_info = ch_info[10]
cur_ch_info

{'ch_num': '9',
 'ch_title': 'Iran 1953: Making it safe for the King of Kings',
 'href': 'part0013.html#ch9',
 'html_file': 'part0013.html',
 'anchor_link': 'ch9',
 'next_anchor': 'ch10'}

In [16]:
# Find the chapter's heading element through the anchor recorded in its TOC
# entry (this is 'ch9' for the chapter selected above) rather than a
# hard-coded id, so the cell follows whichever chapter cur_ch_info points at.
ch_start_elt = soup.find(id=cur_ch_info['anchor_link'])
ch_start_elt

<p class="title1" id="ch9"><a class="calibre1" href="part0002.html#rch9">9. Iran 1953</a></p>

In [17]:
# Now I guess we go through siblings until we hit next_anchor?

In [18]:
# Collect the chapter body: every sibling after the heading, stopping at the
# element whose id is the next chapter's anchor. The last TOC entry has no
# 'next_anchor', in which case we simply run to the end of the siblings.
ch_end_anchor = cur_ch_info.get('next_anchor')
ch_contents = []
for cur_ch_elt in ch_start_elt.next_siblings:
    if isinstance(cur_ch_elt, bs4.element.Tag):
        # Tags are kept whether or not they have an id; only the next
        # chapter's anchor ends the scan.
        if ch_end_anchor is not None and cur_ch_elt.attrs.get('id') == ch_end_anchor:
            break
        ch_contents.append(cur_ch_elt)
    elif str(cur_ch_elt) != '\n':
        # A text node. Lone line breaks are dropped. The earlier version
        # compared repr() of the node with '\n'; repr() adds quotes and
        # escapes, so that comparison never matched and every line break
        # was kept.
        ch_contents.append(cur_ch_elt)

In [19]:
# Annoying but now we can filter out the NavigableStrings that are just linebreaks
# Drop the entries that compare equal to a bare line break; all other text
# nodes and every tag are kept.
ch_contents = [c for c in ch_contents if c != '\n']

In [20]:
# NOTE(review): md_contents is never read or appended to in the cells that
# follow -- it looks like a leftover; confirm before removing.
md_contents = []

In [21]:
def parse_footnote(fn_elt):
    """Translate one <sup> footnote marker from the source HTML into our markup.

    The first two children of ``fn_elt`` are read in order: the first must carry
    an ``id`` (the return target), the second must carry an ``href`` (the link
    to the note) and have the footnote number as its text.

    Returns a tuple ``(new_fn_html, fn_href, return_id)``, where ``new_fn_html``
    is ``fn_template`` filled in with the number.

    Raises ValueError if the link text is not an integer.
    """
    child_iter = iter(fn_elt.children)

    # Child 1: the anchor the note's back-link returns to.
    back_anchor = next(child_iter)
    return_id = back_anchor.attrs['id']
    dprint(f"return_id = {return_id}")

    # Child 2: the link pointing at the note itself.
    note_link = next(child_iter)
    fn_href = note_link.attrs['href']
    dprint(f"fn_href = {fn_href}")

    # The visible link text is the footnote number.
    fn_num = int(note_link.text)
    dprint(f"fn_num = {fn_num}")

    return fn_template.format(fn_num=fn_num), fn_href, return_id

In [22]:
# Finds "ch" followed by one or two digits anywhere in a string (e.g. the
# chapter anchor ids "ch9" / "ch10") and captures the digits.
chid_reg = re.compile(r'ch([0-9]{1,2})')

In [23]:
def parse_paragraph(p_elt):
    """Convert one <p> element of the chapter into a Markdown string.

    Handling of the paragraph's direct children:
      * text nodes   -- copied as-is
      * <a>          -- replaced by its text
      * <sup>        -- treated as a footnote marker (see parse_footnote)
      * <i>          -- text wrapped in *asterisks*
      * <span>       -- emits the blockquote prefix "> "; the span's own
                        content is not copied
    If the paragraph's class contains "title", the result is prefixed with
    "## ", plus "<n>. " when chid_reg finds a chapter id in the element's HTML.
    Finally, the characters listed in repl_dict are replaced.

    Returns ``(markdown_text, fn_info)`` where ``fn_info`` is a list of the
    ``(fn_html, fn_href, return_id)`` tuples for the footnotes found, in
    document order.

    Raises Exception on any other child tag, or on a child that is neither a
    plain Tag nor a plain NavigableString.
    """
    pieces = []
    fn_info = []
    # A <p> without a class attribute is simply not a header; looking the
    # attribute up unconditionally would raise KeyError on such paragraphs.
    p_class = " ".join(p_elt.attrs.get('class', []))
    if "title" in p_class:
        header = "## "
        # The chapter number, when present, comes from an id/href like "ch9"
        # somewhere in the element's markup.
        ch_num_result = chid_reg.search(str(p_elt))
        if ch_num_result is not None:
            header += f"{ch_num_result.group(1)}. "
        pieces.append(header)
    for sub_elt in p_elt.children:
        if type(sub_elt) == bs4.element.Tag:
            if sub_elt.name == "a":
                # A link: keep only its text.
                pieces.append(sub_elt.text)
            elif sub_elt.name == "sup":
                # A footnote marker.
                fn_html, fn_href, return_id = parse_footnote(sub_elt)
                pieces.append(fn_html)
                fn_info.append((fn_html, fn_href, return_id))
            elif sub_elt.name == "i":
                pieces.append(f"*{sub_elt.text}*")
            elif sub_elt.name == "span":
                # This seems to be how the source marks blockquotes.
                pieces.append("> ")
            else:
                # Name the offending tag so the failure is actionable.
                raise Exception(f"Unhandled tag: <{sub_elt.name}>")
        elif type(sub_elt) == bs4.element.NavigableString:
            pieces.append(str(sub_elt))
        else:
            # Exact type checks are intentional: NavigableString subclasses
            # (e.g. comments) still land here, as before.
            raise Exception("smth bad happened")
    # Final step: replace the typographic characters listed in repl_dict.
    pconv_buffer = "".join(repl_dict.get(c, c) for c in "".join(pieces))
    return pconv_buffer, fn_info

In [24]:
# Run every <p> of the chapter through parse_paragraph; anything that is not a
# paragraph is skipped. Each entry is whatever parse_paragraph returned for it,
# i.e. a (markdown_text, fn_info) pair.
parsed_elts = [
    parse_paragraph(node)
    for node in ch_contents
    if node.name == "p"
]

In [25]:
#parsed_elts
# The first item of each parsed pair is the paragraph's Markdown; paragraphs
# are separated by a blank line.
md_lines = [md_text for md_text, _fn_info in parsed_elts]
md_str = "\n\n".join(md_lines)

In [26]:
# Save the chapter text on its own (the footnote list is written separately).
with open("test_out.md", mode="w", encoding="utf-8") as md_out:
    md_out.write(md_str)

Cool so then I guess step 2 is going and extracting all the footnotes from the very end of the book, and appending them instead to the corresponding chapters

In [28]:
# Flatten the per-paragraph footnote lists into one chapter-wide list of
# (fn_html, fn_href, return_id) tuples, keeping document order.
all_footnotes = [
    fn_tuple
    for _md_text, fn_list in parsed_elts
    for fn_tuple in fn_list
]

In [30]:
# Peek at the first few tuples: (marker html, href of the note, return-anchor id).
all_footnotes[:5]

[('<sup id="t1">[1](#f1)</sup>', 'part0061_split_000.html#rfn1ch9', 'fn1ch9'),
 ('<sup id="t2">[2](#f2)</sup>', 'part0061_split_000.html#rfn2ch9', 'fn2ch9'),
 ('<sup id="t3">[3](#f3)</sup>', 'part0061_split_000.html#rfn3ch9', 'fn3ch9'),
 ('<sup id="t4">[4](#f4)</sup>', 'part0061_split_000.html#rfn4ch9', 'fn4ch9'),
 ('<sup id="t5">[5](#f5)</sup>', 'part0061_split_000.html#rfn5ch9', 'fn5ch9')]

In [48]:
# The note's anchor id is the fragment of its href:
# "part0061_split_000.html#rfn1ch9" -> "rfn1ch9".
fn_anchors = [
    fn_href.split("#")[1]
    for _fn_html, fn_href, _return_id in all_footnotes
]

In [68]:
# Exploratory probe: starting from the eighth note anchor, walk a fixed number
# of siblings forward and read the id found there. The recorded result is the
# anchor of the following note, i.e. several sibling nodes separate two
# consecutive notes -- which is why extract_footnote below scans siblings
# rather than reading a single element.
soup.find(id=fn_anchors[7]).next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.attrs['id']

'rfn9ch9'

In [111]:
def extract_footnote(fn_anchor, next_anchor):
    """Return the text of the note that starts at the element with id ``fn_anchor``.

    Starting at that element (looked up in the module-level ``soup``), the text
    of it and of each following sibling is gathered until the next note begins:

    * ``next_anchor`` given -- stop at the sibling tag whose id equals it. If no
      sibling has that id, everything up to the last sibling is gathered.
    * ``next_anchor`` is None -- fall back to the sketchier heuristic: stop at
      the first sibling tag that has any id at all, or that is a <p>.

    Runs of non-breaking spaces in each piece are collapsed to one ordinary
    space, the pieces are concatenated, and the result is stripped. If no
    element has id ``fn_anchor`` the result is the empty string.
    """
    def _starts_next_note(node):
        # Only tags can mark the start of another note; text nodes never do.
        if type(node) != bs4.element.Tag:
            return False
        if next_anchor is None:
            return ('id' in node.attrs) or (node.name == "p")
        return node.attrs.get('id') == next_anchor

    pieces = []
    node = soup.find(id=fn_anchor)
    while node is not None:
        # Tags contribute their text content; text nodes are used as they are.
        pieces.append(node.text if type(node) == bs4.element.Tag else str(node))
        node = node.next_sibling
        if _starts_next_note(node):
            break

    # Slightly better than a plain .replace(): a run of repeated non-breaking
    # spaces becomes a single space.
    collapsed = (re.sub(r'(\xa0)+', " ", piece) for piece in pieces)
    return "".join(collapsed).strip()

In [120]:
#len(fn_anchors)

In [121]:
#extract_footnote(fn_anchors[47], None)

In [115]:
# Look up each note anchor (rfn1ch9, rfn2ch9, ...) and pull out its text.
# Every anchor is paired with the one after it so extract_footnote knows where
# to stop; the last anchor has no successor and gets None.
following_anchors = fn_anchors[1:] + [None]
all_fn_contents = [
    (anchor, extract_footnote(anchor, following_anchor))
    for anchor, following_anchor in zip(fn_anchors, following_anchors)
]

In [117]:
# Peek at the first few (anchor id, note text) pairs.
all_fn_contents[:5]

[('rfn1ch9', '1. Roosevelt, p. 8.'),
 ('rfn2ch9', '2. Ibid., pp. 18-19.'),
 ('rfn3ch9',
  '3. Anthony Eden, The Memoirs of the Right Honourable Sir Anthony Eden: Full Circle (London, 1960) p. 194.'),
 ('rfn4ch9',
  '4. Dean Acheson, Present at the Creation: My Years in the State Department (New York, 1969) pp. 679-85; Eden, pp. 201-2: Nirumand, pp. 73-4.'),
 ('rfn5ch9', '5. Roosevelt, p. 107.')]

In [118]:
# Debugging stuff
#str(soup).index("fn1ch9")
#str(soup)[279600:279740]

Awesome. Now we format these extracted footnotes using the footnote template from the very beginning of the notebook, and then append them to the end of the chapter doc

In [126]:
# Render each extracted note with rfn_template. Notes are numbered by position,
# counting from 1 (not 0) to match the in-text markers.
html_fns = [
    rfn_template.format(fn_num=position, fn_text=note_text)
    for position, (_anchor, note_text) in enumerate(all_fn_contents, start=1)
]

In [129]:
# Join the rendered notes with a blank line between them: Markdown does not
# start a new paragraph on a single line break, so each note needs two.
all_fns_str = "\n\n".join(html_fns)

In [130]:
# Save the rendered footnote list on its own.
with open("test_fn_out.md", mode="w", encoding="utf-8") as fn_out:
    fn_out.write(all_fns_str)

Final step (?!?): combine the chapter output with the footnotes output

In [133]:
# Read the two intermediate files back and join them under a "Footnotes"
# heading.
# The chapter text gets its own name (ch_md): ch_contents already holds the
# list of chapter elements built earlier, and rebinding it to a string here
# would make the filtering/parsing cells misbehave if they were re-run.
with open("test_out.md", "r", encoding="utf-8") as f:
    ch_md = f.read()
with open("test_fn_out.md", "r", encoding="utf-8") as g:
    ch_footnotes = g.read()
ch_full = ch_md + "\n\n# Footnotes\n\n" + ch_footnotes

In [135]:
# Spot-check the start of the combined document (first 500 characters).
ch_full[:500]

'## 9. Making it safe for the King of Kings\n\n"So this is how we get rid of that madman Mossadegh," announced John Foster Dulles to a group of top Washington policy makers one day in June 1953.<sup id="t1">[1](#f1)</sup> The Secretary of State held in his hand a plan of operation to overthrow the prime minister of Iran prepared by Kermit (Kim) Roosevelt of the CIA. There was scarcely any discussion amongst the high-powered men in the room, no probing questions, no legal or ethical issues raised.\n\n'

In [136]:
# Save the finished chapter: text followed by its footnote list.
with open("test_full_out.md", mode="w", encoding="utf-8") as full_out:
    full_out.write(ch_full)