In [1]:
import re
import pandas as pd
from pathlib import Path

import markdown
from bs4 import BeautifulSoup
import string

import difflib
differ = difflib.HtmlDiff(wrapcolumn=100)
from IPython.display import display, HTML

from github_slugger import GithubSlugger

In [2]:
# Root of the textbook's MDX content tree; every index.mdx below it is parsed.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
CONTENTS = Path('/home/jovyan/active-projects/macro-economics-textbook/contents/')

## Get Markdown and Infer Metadata

We infer metadata from the directory structure.  
`PosixPath('../contents/module-3/index.mdx')` Each module has a root-level index page  
`PosixPath('../contents/module-3/chapter-9/index.mdx')`  Each chapter has a root-level index page  
`PosixPath('../contents/module-3/chapter-9/section-1/index.mdx')`  Each section has a content page, also called index.mdx  


In [3]:
# Build one metadata record per index.mdx page. Module / chapter / section
# numbers are inferred from the directory names between CONTENTS and the file.
mdx_sections = []

verbose = False
for path in sorted(CONTENTS.glob('**/index.mdx')):
    # Anchor on CONTENTS itself rather than a second hard-coded copy of the path,
    # so that changing CONTENTS in the config cell is enough to relocate the data.
    rel_path = path.relative_to(CONTENTS)
    # All parent directories below CONTENTS, outermost first. The [:-1] slice
    # drops the top-level relative directory '.'.
    parents = reversed(rel_path.parents[:-1])
    # Every .mdx file is expected to live inside a module directory; an index.mdx
    # sitting directly in CONTENTS would exhaust the iterator and raise StopIteration.
    module = next(parents).name.split('-')[1]
    # If the iterator is exhausted, the Path('-0') default makes the number '0'.
    chapter = next(parents, Path('-0')).name.split('-')[1]
    section = next(parents, Path('-0')).name.split('-')[1]
    if verbose:
        print(f'{path.as_posix():<60}Module {module}, Chapter {chapter}, Section {section}')
    mdx_sections.append({
        'module': module,
        'chapter': chapter,
        'section': section,
        'path': path,
    })

mdx_sections[:5]

[{'module': '1',
  'chapter': '1',
  'section': '0',
  'path': PosixPath('/home/jovyan/active-projects/macro-economics-textbook/contents/module-1/chapter-1/index.mdx')},
 {'module': '1',
  'chapter': '1',
  'section': '1',
  'path': PosixPath('/home/jovyan/active-projects/macro-economics-textbook/contents/module-1/chapter-1/section-1/index.mdx')},
 {'module': '1',
  'chapter': '1',
  'section': '2',
  'path': PosixPath('/home/jovyan/active-projects/macro-economics-textbook/contents/module-1/chapter-1/section-2/index.mdx')},
 {'module': '1',
  'chapter': '1',
  'section': '3',
  'path': PosixPath('/home/jovyan/active-projects/macro-economics-textbook/contents/module-1/chapter-1/section-3/index.mdx')},
 {'module': '1',
  'chapter': '1',
  'section': '4',
  'path': PosixPath('/home/jovyan/active-projects/macro-economics-textbook/contents/module-1/chapter-1/section-4/index.mdx')}]

## Parse with regex

We want to capture all content that is between two top-level headings.

We also want to exclude certain subsections that have little or no text content.

I hate regex so much. Let's break down the pattern below:  
1. `(?:^#{1,2} )` A non-capturing group that looks for a line that starts with 1 or 2 '#', and ensures that the next character is a space ' '
2. `(.*?)$` A capturing group that includes all characters until the end of the line. This is our subsection heading.
3. `\s*` optionally matches any whitespace between the heading line and the subsection text, so leading blank lines are not captured.
4. `(.*?)` A capturing group that includes all characters. This is the subsection text.
5. `(?=\s*^#|\Z)` A positive lookahead that tells us when to stop capturing subsection text. It will stop when it finds the next line that starts with '#' (a heading of any level) or the end of the document. It will include all the whitespace preceding one of these terminating elements, preventing that from being included in the subsection text capture group.

`re.DOTALL` allows the '.' character to match on newlines.  
`re.MULTILINE` makes the '^' and '$' anchors match on the beginning/end of lines instead of the beginning/end of the document. We use '\Z' to match the end of the document in multiline mode.


In [4]:
# Group 1 is the heading text of a '# ' or '## ' line; group 2 is everything that
# follows it up to (but excluding) the whitespace before the next line starting
# with '#', or the end of the document.
# NOTE(review): the capture stops at ANY line starting with '#', but only '#'/'##'
# lines start a new match, so text under '###' (or deeper) headings is not captured.
pattern = re.compile(r'(?:^#{1,2} )(.*?)$\s*(.*?)(?=\s*^#|\Z)', re.DOTALL | re.MULTILINE)

# all these are lowercased because capitalization is inconsistent across MDX files
subsections_to_skip = [
    'learn with videos',
    'please write your summary below',
    'please your write summary below', # ... a perfect example of why this approach is doomed. More than 10 sections have this typo.
    'bring it home',
    'clear it up',
    'work it out',
]

subsections = []

verbose = False
for section in mdx_sections:
    # Decode explicitly as UTF-8 so the result does not depend on the locale default.
    text = section['path'].read_text(encoding='utf-8')
    matches = pattern.findall(text)
    for i, match in enumerate(matches):
        subsection_title = match[0]
        subsection_text = match[1]
        if subsection_title.lower().strip() in subsections_to_skip: # lower() and strip() because capitalization and spacing are inconsistent
            if verbose and len(subsection_text) > 10: # if verbose, print the longer sections that we will be EXcluding
                print('-'*80)
                print(subsection_title, '\n', subsection_text)
                print('_'*80)
            continue
        if verbose and len(subsection_text) < 100: # if verbose, print the shorter sections that we will still be INcluding
            print('-'*80)
            print(subsection_title, '\n', subsection_text)
            print('_'*80)
        # The record is appended regardless of `verbose`: the flag only controls
        # printing and must never change which subsections end up in the dataset.
        subsection_dict = {
            **section, # add in section-level metadata
            'subsection': i, # position among ALL matches on the page, so skipped headings leave gaps
            'heading': subsection_title,
            'raw_text': subsection_text,
        }
        subsection_dict.pop('path') # the filesystem path is not part of the exported data
        subsections.append(subsection_dict)

In [5]:
# One row per retained subsection: module / chapter / section / subsection index,
# the heading text, and the raw MDX body captured by the regex.
df = pd.DataFrame(subsections)
df

Unnamed: 0,module,chapter,section,subsection,heading,raw_text
0,1,1,0,0,Decisions ... Decisions in the Social Media Age,Every day we are faced with a myriad of decisi...
1,1,1,0,1,Introduction,What is economics and why should you spend you...
2,1,1,1,0,Overview,"import Alert from ""react-bootstrap/Alert"";\nim..."
3,1,1,1,1,Introduction to FRED,Data is very important in economics because it...
4,1,1,1,2,The Problem of Scarcity,"Think about all the things you consume: food, ..."
...,...,...,...,...,...,...
518,5,20,4,2,The World Trade Organization,The World Trade Organization (WTO) was officia...
519,5,20,4,3,Regional Trading Agreements,There are different types of economic integrat...
520,5,20,4,5,Trade Policy at the National Level,"Yet another dimension of trade policy, along w..."
521,5,20,4,6,Long-Term Trends in Barriers to Trade,"In newspaper headlines, trade policy appears m..."


## HTML Cleanup

First step is to clean up the markdown files into something closer to standard markdown.

RegEx is used to delete tables and their contents. Without the HTML tags, tables become a mangled list of strings.  
We also use RegEx to remove the javascript import statements and convert links to standard markdown formatting.

Next, we convert the markdown to HTML and use BeautifulSoup to extract the text.

Debugging and testing is done with difflib.

In [6]:
def diff(text, clean_text):
    """Display a side-by-side HTML diff of the raw text against its cleaned version.

    Both inputs are split into lines, each line is stripped, and blank lines are
    dropped before comparison. The table is rendered inline with the module-level
    `differ` (a difflib.HtmlDiff instance).
    """
    def _nonblank_lines(blob):
        # Stripped lines of `blob`, with lines that are empty after stripping removed.
        return [line.strip() for line in blob.splitlines() if line.strip()]

    table_html = differ.make_file(
        _nonblank_lines(text),
        _nonblank_lines(clean_text),
        fromdesc='Original',
        todesc='HTML Parsed',
        context=False,
        numlines=0,
    )
    display(HTML(table_html))

In [34]:
# Drops the leading '!' of markdown image syntax: ![alt_text](url) --> [alt_text](url)
exclamation_links_pattern = re.compile(r'!\[')
# Removes <Table>...</Table> blocks (any letter case) together with their contents
# and any trailing whitespace.
html_table_pattern = re.compile(r'<Table.*?</Table>\s*', re.DOTALL | re.IGNORECASE)
# Removes JavaScript import statements that start a line:
#   import Name from "module";   import { A, B } from 'module';   import "module";
# The statement must be `import`, whitespace, an optional binding list followed by
# `from`, then a quoted module specifier. Requiring that shape keeps prose lines
# that merely begin with "import..." (e.g. "import tariffs rose; ...") intact.
# The binding list may span several lines; the trailing ';' is optional.
javascript_import_pattern = re.compile(
    r'''^import\s+(?:[\w$*{}\s,]+?\s+from\s+)?["'][^"'\n]+["'][ \t]*;?''',
    re.MULTILINE,
)

def clean_md(text_md):
    """Normalise an MDX fragment into something closer to plain markdown.

    Steps, in order: turn image syntax into ordinary link syntax, delete
    <Table> blocks with their contents, delete JavaScript import statements.

    Parameters:
        text_md (str): raw MDX text of one subsection.

    Returns:
        str: the cleaned text with leading and trailing whitespace stripped.
    """
    text_md = exclamation_links_pattern.sub(r'[', text_md)
    text_md = html_table_pattern.sub('', text_md)
    text_md = javascript_import_pattern.sub('', text_md)
    return text_md.strip()

In [35]:
def clean_raw_text(subsection_text_mdx):
    """Convert one subsection's raw MDX into plain text.

    The MDX is cleaned with clean_md, rendered to HTML with python-markdown,
    stripped of any table elements, and finally reduced to its text content
    with BeautifulSoup.

    Parameters:
        subsection_text_mdx (str): raw MDX body of a subsection.

    Returns:
        str: the extracted text, stripped of leading/trailing whitespace.
    """
    markdown_source = clean_md(subsection_text_mdx)
    rendered_html = markdown.markdown(markdown_source, extensions=['extra', 'tables'])
    # Markdown-syntax tables only become <table> elements after rendering,
    # so the table pattern is applied a second time on the HTML.
    html_without_tables = html_table_pattern.sub('', rendered_html)
    soup = BeautifulSoup(html_without_tables, features='html.parser')
    return soup.get_text().strip()

In [37]:
# Spot-check the cleaner on a few subsections. The fixed random_state keeps the
# displayed diffs the same on every Restart & Run All.
for text in df.raw_text.sample(5, random_state=42):
    clean_text = clean_raw_text(text)
    diff(text, clean_text)

Unnamed: 0,Original,Original.1,Unnamed: 3,HTML Parsed,HTML Parsed.1
t,1,<br />,t,,
,2,"import Card from ""react-bootstrap/Card"";",,,
,3,"<Card className=""bg-dark text-white"">",,,
,4,<Card.Img,,,
,5,"src=""/module1/tvcTdW9Wkz1Yr80K_3BvDFDJ8OaHfQtIP.jpeg""",,,
,6,"alt=""Card image""",,,
,7,/>,,,
,8,"<Card.ImgOverlay className = ""d-flex align-items-center"">",,,
,9,"<Card.Text style={{fontSize: ""1.3em""}}>",,,
,10,Visit this [website](https://www.adb.org/) to read about the Asian Development Bank.,,1.0,Visit this website to read about the Asian Development Bank.

Legends,Legends.1
Colors Added Changed Deleted,Links (f)irst change (n)ext change (t)op

Colors
Added
Changed
Deleted

Links,Links.1
(f)irst change,
(n)ext change,
(t)op,


Unnamed: 0,Original,Original.1,Unnamed: 3,HTML Parsed,HTML Parsed.1
n,1,"<Alert variant=""primary"">",n,,
,2,<Alert.Heading>,,,
,3,"By the end of this section, you will be able to:",,1,"By the end of this section, you will be able to:"
n,4,</Alert.Heading>,n,,
,5,- Understand how fiscal policy and monetary policy are interconnected - Explain,,2,- Understand how fiscal policy and monetary policy are interconnected - Explain
,6,the three lag times that often occur when solving economic problems - Identify,,3,the three lag times that often occur when solving economic problems - Identify
,7,the legal and political challenges of responding to an economic problem,,4,the legal and political challenges of responding to an economic problem
t,8,</Alert>,t,,
,9,"In the early 1960s, many leading economists believed that the problem of the business cycle, and the",,5,"In the early 1960s, many leading economists believed that the problem of the business cycle, and the"
,>,"swings between cyclical unemployment and inflation, were a thing of the past. On the cover of its D",,>,"swings between cyclical unemployment and inflation, were a thing of the past. On the cover of its D"

Legends,Legends.1
Colors Added Changed Deleted,Links (f)irst change (n)ext change (t)op

Colors
Added
Changed
Deleted

Links,Links.1
(f)irst change,
(n)ext change,
(t)op,


Unnamed: 0,Original,Original.1,Unnamed: 3,HTML Parsed,HTML Parsed.1
t,1,"India was formally under British rule from 1858 to 1947. During that time, India consistently had tr",t,1,"India was formally under British rule from 1858 to 1947. During that time, India consistently had tr"
,>,ade surpluses with Great Britain. Anyone who believes that trade surpluses are a sign of economic st,,>,ade surpluses with Great Britain. Anyone who believes that trade surpluses are a sign of economic st
,>,rength and dominance while trade deficits are a sign of economic weakness must find this pattern odd,,>,rength and dominance while trade deficits are a sign of economic weakness must find this pattern odd
,>,", since it would mean that colonial India was successfully dominating and exploiting Great Britain f",,>,", since it would mean that colonial India was successfully dominating and exploiting Great Britain f"
,>,or almost a century—which was not true.,,>,or almost a century—which was not true.
,2,"Instead, India's trade surpluses with Great Britain meant that each year there was an overall flow o",,2,"Instead, India's trade surpluses with Great Britain meant that each year there was an overall flow o"
,>,"f financial capital from India to Great Britain. In India, many heavily criticized this financial ca",,>,"f financial capital from India to Great Britain. In India, many heavily criticized this financial ca"
,>,"pital flow as the “drain,” and they viewed eliminating the financial capital drain as one of the man",,>,"pital flow as the “drain,” and they viewed eliminating the financial capital drain as one of the man"
,>,y reasons why India would benefit from achieving independence.,,>,y reasons why India would benefit from achieving independence.

Legends,Legends.1
Colors Added Changed Deleted,Links (f)irst change (n)ext change (t)op

Colors
Added
Changed
Deleted

Links,Links.1
(f)irst change,
(n)ext change,
(t)op,


Unnamed: 0,Original,Original.1,Unnamed: 3,HTML Parsed,HTML Parsed.1
f,1,"If you were born within the last three decades in the United States, Canada, or many other countries",f,1,"If you were born within the last three decades in the United States, Canada, or many other countries"
,>,"in the developed world, you probably have no real experience with a high rate of inflation. Inflati",,>,"in the developed world, you probably have no real experience with a high rate of inflation. Inflati"
,>,"on is when most prices in an entire economy are rising. However, there is an extreme form of inflati",,>,"on is when most prices in an entire economy are rising. However, there is an extreme form of inflati"
,>,"on called hyperinflation. This occurred in Germany between 1921 and 1928, and more recently in Zimba",,>,"on called hyperinflation. This occurred in Germany between 1921 and 1928, and more recently in Zimba"
,>,"bwe between 2008 and 2009. In November 2008, Zimbabwe had an inflation rate of 79.6 billion percent.",,>,"bwe between 2008 and 2009. In November 2008, Zimbabwe had an inflation rate of 79.6 billion percent."
,>,"In contrast, in 2014, the United States had an average annual rate of inflation of 1.6%.",,>,"In contrast, in 2014, the United States had an average annual rate of inflation of 1.6%."
n,2,"Zimbabwe's inflation rate was so high it is difficult to comprehend, so let's put it into context. I",n,2,"Zimbabwe's inflation rate was so high it is difficult to comprehend, so let's put it into context. I"
,>,"t is equivalent to price increases of 98% per day. This means that, from one day to the next, prices",,>,"t is equivalent to price increases of 98% per day. This means that, from one day to the next, prices"
,>,essentially double. What is life like in an economy afflicted with hyperinflation? Most of you read,,>,essentially double. What is life like in an economy afflicted with hyperinflation? Most of you read
,>,ing this will have never experienced this phenomenon. The government adjusted prices for commodities,,>,ing this will have never experienced this phenomenon. The government adjusted prices for commodities

Legends,Legends.1
Colors Added Changed Deleted,Links (f)irst change (n)ext change (t)op

Colors
Added
Changed
Deleted

Links,Links.1
(f)irst change,
(n)ext change,
(t)op,


Unnamed: 0,Original,Original.1,Unnamed: 3,HTML Parsed,HTML Parsed.1
n,1,In [Welcome to Economics!](1-introduction) we learned that every society faces the problem of scarci,n,1,"In Welcome to Economics! we learned that every society faces the problem of scarcity, where limited"
,>,"ty, where limited resources conflict with unlimited needs and wants. The production possibilities cu",,>,resources conflict with unlimited needs and wants. The production possibilities curve illustrates th
,>,rve illustrates the choices involved in this dilemma.,,>,e choices involved in this dilemma.
,2,Every economy faces two situations in which it may be able to expand consumption of all goods:,,2,Every economy faces two situations in which it may be able to expand consumption of all goods:
n,3,"1. A society may discover that it has been using its resources inefficiently, in which case by impro",n,3,"A society may discover that it has been using its resources inefficiently, in which case by improvin"
,>,"ving efficiency and producing on the production possibilities frontier, it can have more of all good",,>,"g efficiency and producing on the production possibilities frontier, it can have more of all goods ("
,>,s (or at least more of some and less of none).,,>,or at least more of some and less of none).
,4,"2. As resources grow over a period of years (e.g., more labor and more capital), the economy grows.",,4,"As resources grow over a period of years (e.g., more labor and more capital), the economy grows. As"
,>,"As it does, the production possibilities frontier for a society will tend to shift outward and socie",,>,"it does, the production possibilities frontier for a society will tend to shift outward and society"
,>,ty will be able to afford more of all goods.,,>,will be able to afford more of all goods.

Legends,Legends.1
Colors Added Changed Deleted,Links (f)irst change (n)ext change (t)op

Colors
Added
Changed
Deleted

Links,Links.1
(f)irst change,
(n)ext change,
(t)op,


In [38]:
# Add a plain-text version of every subsection by running the cleaner row by row.
df['clean_text'] = df.raw_text.apply(clean_raw_text)

## Generate Slugs

In [39]:
def make_slug_github(group):
    """Return a copy of `group` with a 'slug' column derived from its headings.

    A fresh GithubSlugger is created on every call, so whatever state the slugger
    keeps between headings starts over for each group.
    NOTE(review): presumably this is what de-duplicates repeated headings within
    one page — confirm against the github_slugger documentation.

    Parameters:
        group (pd.DataFrame): rows for one group; must contain a 'heading' column.

    Returns:
        pd.DataFrame: a new frame with the same rows plus 'slug'. The input is
        left untouched, because mutating the object handed over by
        groupby(...).apply is not supported by pandas.
    """
    slugger = GithubSlugger()
    return group.assign(slug=group['heading'].apply(slugger.slug))

In [40]:
# Slugs are generated per (module, chapter, section) group, i.e. per MDX page;
# group_keys=False keeps the group keys out of the resulting index.
df = df.groupby(['module', 'chapter', 'section'], group_keys=False).apply(make_slug_github)

## Save

In [41]:
# Write the final table without the positional index.
# NOTE(review): this path is relative to the notebook's working directory, unlike the absolute CONTENTS path.
df.to_csv('../data/subsections.csv', index=False)