In [None]:
import re
import pandas as pd
from pathlib import Path

import markdown
from bs4 import BeautifulSoup
import string

import difflib
# Shared HtmlDiff instance used by diff() further down; wrapcolumn=100 wraps long lines at column 100.
differ = difflib.HtmlDiff(wrapcolumn=100)
from IPython.display import display, HTML

from github_slugger import GithubSlugger

In [None]:
# Root directory of the textbook's MDX content; every index.mdx beneath it is collected below.
# NOTE(review): this is an absolute path tied to one machine's home directory — consider making it configurable.
CONTENTS = Path('/home/jovyan/active-projects/macro-economics-textbook/contents/')
# No slugger is created here: make_slug_github (see "Generate Slugs") builds a fresh GithubSlugger per call.

## Get Markdown and Infer Metadata

We infer metadata from the directory structure.  
`PosixPath('../contents/module-3/index.mdx')` Each module has a root-level index page  
`PosixPath('../contents/module-3/chapter-9/index.mdx')`  Each chapter has a root-level index page  
`PosixPath('../contents/module-3/chapter-9/section-1/index.mdx')`  Each section has a content page, also called index.mdx  


In [None]:
# One dict per index.mdx file found under CONTENTS, with the module / chapter / section
# numbers inferred from the directory names (e.g. module-3/chapter-9/section-1/index.mdx).
mdx_sections = []

verbose = False
for path in sorted(CONTENTS.glob('**/index.mdx')):
    # Reuse CONTENTS rather than repeating the absolute path, so the two can never drift apart.
    rel_path = path.relative_to(CONTENTS)
    # Directory names between the content root and the file itself, outermost first.
    # parts[:-1] drops the trailing 'index.mdx' and, unlike slicing Path.parents,
    # does not require Python 3.10+.
    dir_names = iter(rel_path.parts[:-1])
    module = next(dir_names).split('-')[1] # all .mdx files belong to a module
    chapter = next(dir_names, '-0').split('-')[1] # module-level pages have no chapter directory: chapter number becomes '0'
    section = next(dir_names, '-0').split('-')[1] # module- and chapter-level pages have no section directory: section number becomes '0'
    if verbose:
        print(f'{path.as_posix():<60}Module {module}, Chapter {chapter}, Section {section}')
    mdx_sections.append({
        'module': module,
        'chapter': chapter,
        'section': section,
        'path': path,
    })

mdx_sections[:5]

## Parse with regex

We want to capture all content that is between two top-level headings.

We also want to exclude certain subsections that have little or no text content.

I hate regex so much. Let's break down the pattern below:  
1. `(?:^#{1,2} )` A non-capturing group that looks for a line that starts with 1 or 2 '#', and ensures that the next character is a space ' '
2. `(.*?)$` A capturing group that includes all characters until the end of the line. This is our subsection heading.
3. `\s*` optionally matches any whitespace between the heading line and the subsection text
4. `(.*?)` A capturing group that includes all characters. This is the subsection text.
5. `(?=\s*^#|\Z)` A positive lookahead that tells us when to stop capturing subsection text. It will stop when it finds another heading (any line starting with '#') or the end of the document. The `\s*` inside the lookahead absorbs the whitespace preceding the next heading, preventing that from being included in the subsection text capture group.

`re.DOTALL` allows the '.' character to match on newlines.  
`re.MULTILINE` makes the '^' and '$' anchors match on the beginning/end of lines instead of the beginning/end of the document. We use '\Z' to match the end of the document in multiline mode.


In [None]:
# Group 1 is the text of a '#' / '##' heading line; group 2 is everything after it up to
# (but not including) the next line that starts with '#', or the end of the document.
pattern = re.compile(r'(?:^#{1,2} )(.*?)$\s*(.*?)(?=\s*^#|\Z)', re.DOTALL | re.MULTILINE)

# all these are lowercased because capitalization is inconsistent across MDX files
subsections_to_skip = [
    'learn with videos',
    'please write your summary below',
    'please your write summary below', # ... a perfect example of why this approach is doomed. More than 10 sections have this typo.
    'bring it home',
    'clear it up',
    'work it out',
]

# One dict per kept subsection: the section-level metadata (minus 'path') plus the
# subsection index, heading and raw MDX text.
subsections = []

verbose = True
for section in mdx_sections:
    text = section['path'].read_text()
    # Section-level metadata shared by every subsection of this file; the filesystem path is dropped.
    section_metadata = {key: value for key, value in section.items() if key != 'path'}
    for i, (subsection_title, subsection_text) in enumerate(pattern.findall(text)):
        # lower() and strip() because capitalization and spacing are inconsistent
        if subsection_title.lower().strip() in subsections_to_skip:
            if verbose and len(subsection_text) > 10: # if verbose, print the longer sections that we will be EXcluding
                print('-'*80)
                print(subsection_title, '\n', subsection_text)
                print('_'*80)
            continue
        if verbose and len(subsection_text) < 100: # if verbose, print the shorter sections that we will still be INcluding
            print('-'*80)
            print(subsection_title, '\n', subsection_text)
            print('_'*80)
        # Always append. Previously the append sat in an `else` after the verbose check,
        # so short subsections were printed as "included" but dropped whenever verbose was True,
        # making the output depend on the debug flag.
        subsections.append({
            **section_metadata,
            'subsection': i, # position among ALL headings matched in the file, skipped ones included
            'heading': subsection_title,
            'raw_text': subsection_text,
        })

In [None]:
# One row per kept subsection; columns come from the dicts built above:
# module, chapter, section, subsection, heading, raw_text.
df = pd.DataFrame(subsections)
df

## HTML Cleanup

First step is to clean up the markdown files into something closer to standard markdown.

RegEx is used to delete tables and their contents. Without the HTML tags, tables become a mangled list of strings.  
We also use RegEx to remove the javascript import statements and convert links to standard markdown formatting.

Next, we convert the markdown to HTML and use BeautifulSoup to extract the text.

Debugging and testing is done with difflib.

In [None]:
def diff(text, clean_text):
    """Display an HTML diff table comparing `text` with `clean_text`.

    Each input is split into lines, every line is stripped of surrounding
    whitespace, and lines that end up empty are discarded before comparison.
    The left column is labelled 'Original' and the right 'HTML Parsed'.
    Nothing is returned; the table is rendered through IPython's display().
    """
    def non_blank_lines(blob):
        # Strip first, then keep only lines that still contain something.
        stripped = (line.strip() for line in blob.splitlines())
        return [line for line in stripped if line]

    html_report = differ.make_file(
        non_blank_lines(text),
        non_blank_lines(clean_text),
        fromdesc='Original',
        todesc='HTML Parsed',
        context=False,
        numlines=0,
    )
    display(HTML(html_report))

In [None]:
# Patterns are compiled once and applied, in this order, by clean_md().
# '![' -> '[' : turns ![link_text](url) into [link_text](url)
exclamation_links_pattern = re.compile(r'!\[')
# <Table ...> ... </Table> (any letter case, may span lines) plus the whitespace that follows it
html_table_pattern = re.compile(r'<Table.*?</Table>\s*', flags=re.DOTALL | re.IGNORECASE)
# a line-initial 'import ... ;' statement that fits on a single line
javascript_import_pattern = re.compile(r'^import.*?;', flags=re.MULTILINE)

def clean_md(text_md):
    """Strip MDX-specific constructs from `text_md` and return the trimmed result.

    Applies three substitutions in sequence: drop the '!' in front of '[',
    delete <Table>...</Table> blocks together with their contents, and delete
    single-line `import ...;` statements. Leading and trailing whitespace is
    removed from the final string.
    """
    substitutions = (
        (exclamation_links_pattern, '['),
        (html_table_pattern, ''),
        (javascript_import_pattern, ''),
    )
    for compiled, replacement in substitutions:
        text_md = compiled.sub(replacement, text_md)
    return text_md.strip()

In [None]:
def clean_raw_text(subsection_text_mdx):
    """Convert one subsection's raw MDX into plain text.

    Steps: clean the MDX with clean_md(), render the result to HTML with the
    `markdown` package, remove any <table> elements from that HTML, then let
    BeautifulSoup extract the text. The returned string is stripped of
    leading and trailing whitespace.
    """
    markdown_source = clean_md(subsection_text_mdx)
    rendered_html = markdown.markdown(markdown_source, extensions=['extra', 'tables'])
    # Tables written in markdown syntax only become <table> elements after rendering,
    # so the table pattern has to be applied a second time here.
    table_free_html = html_table_pattern.sub('', rendered_html)
    soup = BeautifulSoup(table_free_html, features='html.parser')
    return soup.get_text().strip()

In [None]:
# Spot-check the cleanup on a few random subsections by diffing raw text against cleaned text.
# Cap the sample size: Series.sample raises ValueError when asked for more rows than exist.
n_samples = min(5, len(df))
for text in df.raw_text.sample(n_samples):
    clean_text = clean_raw_text(text)
    diff(text, clean_text)

In [None]:
# Add a plain-text column by running every raw MDX subsection through clean_raw_text.
df['clean_text'] = df.raw_text.apply(clean_raw_text)

## Generate Slugs

In [None]:
def make_slug_github(group):
    """Return a copy of `group` with a 'slug' column derived from its 'heading' column.

    A fresh GithubSlugger is created on every call, so whatever state the
    slugger keeps between headings starts over for each group (presumably its
    duplicate-heading bookkeeping — confirm against the github_slugger package).

    The input frame is NOT modified: pandas documents that functions passed to
    GroupBy.apply must not mutate the object they receive, so the new column
    is added with DataFrame.assign, which returns a new frame.

    Parameters:
        group: DataFrame with a 'heading' column (one group from groupby.apply).

    Returns:
        A new DataFrame with the same index and columns as `group`, plus 'slug'.
    """
    slugger = GithubSlugger()
    return group.assign(slug=group['heading'].apply(slugger.slug))

In [None]:
# Slug each (module, chapter, section) group separately, so make_slug_github creates one
# GithubSlugger per group. group_keys=False keeps the group keys out of the result's index.
# NOTE(review): newer pandas versions change whether the grouping columns are passed to the
# applied function — confirm module/chapter/section survive on the installed pandas version.
df = df.groupby(['module', 'chapter', 'section'], group_keys=False).apply(make_slug_github)

## Save

In [None]:
# Write the final table to CSV without the DataFrame index.
# NOTE(review): the path is relative to the notebook's working directory, unlike the absolute CONTENTS path — confirm ../data exists there.
df.to_csv('../data/subsections.csv', index=False)