In [1]:
!pip install markdown
!pip install difflib
!pip install github_slugger

import re
import pandas as pd
from pathlib import Path
import string

import markdown
from bs4 import BeautifulSoup
import string

import difflib
differ = difflib.HtmlDiff(wrapcolumn=100)
from IPython.display import display, HTML

from github_slugger import GithubSlugger

[31mERROR: Could not find a version that satisfies the requirement difflib (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for difflib[0m[31m


In [2]:
CONTENTS = Path('/home/jovyan/active-projects/macro-economics-textbook/contents/')
# Get github slugger

## Get Markdown and Infer Metadata

We infer metadata from the directory structure.  
`PosixPath('../contents/module-3/index.mdx')` Each module has a root-level index page  
`PosixPath('../contents/module-3/chapter-9/index.mdx')`  Each chapter has a root-level index page  
`PosixPath('../contents/module-3/chapter-9/section-1/index.mdx')`  Each section has a content page, also called index.mdx  


In [3]:
mdx_sections = []

verbose = False
for path in sorted(CONTENTS.glob('**/index.mdx')):
    rel_path = path.relative_to('/home/jovyan/active-projects/macro-economics-textbook/contents/')
    parents = reversed(rel_path.parents[:-1]) # get all parent directories before '../contents'. We omit ([:-1]) the top-level relative directory '.'
    module = next(parents).name.split('-')[1] # all .mdx files belong to a module
    chapter = next(parents, Path('-0')).name.split('-')[1] # if the iterator is exhausted, this will make the chapter number 0
    section = next(parents, Path('-0')).name.split('-')[1] # if the iterator is exhausted, this will make the section number 0
    if verbose:
        print(f'{path.as_posix():<60}Module {module}, Chapter {chapter}, Section {section}')
    mdx_sections.append({
        'module': module,
        'chapter': chapter,
        'section': section,
        'path': path,
    })

mdx_sections[:5]

[{'module': '1',
  'chapter': '1',
  'section': '0',
  'path': PosixPath('/home/jovyan/active-projects/macro-economics-textbook/contents/module-1/chapter-1/index.mdx')},
 {'module': '1',
  'chapter': '1',
  'section': '1',
  'path': PosixPath('/home/jovyan/active-projects/macro-economics-textbook/contents/module-1/chapter-1/section-1/index.mdx')},
 {'module': '1',
  'chapter': '1',
  'section': '2',
  'path': PosixPath('/home/jovyan/active-projects/macro-economics-textbook/contents/module-1/chapter-1/section-2/index.mdx')},
 {'module': '1',
  'chapter': '1',
  'section': '3',
  'path': PosixPath('/home/jovyan/active-projects/macro-economics-textbook/contents/module-1/chapter-1/section-3/index.mdx')},
 {'module': '1',
  'chapter': '1',
  'section': '4',
  'path': PosixPath('/home/jovyan/active-projects/macro-economics-textbook/contents/module-1/chapter-1/section-4/index.mdx')}]

## Parse with regex

We want to capture all content that is between two top-level headings.

We also want to exclude certain subsections that have little or no text content.

I hate regex so much. Let's break down the pattern below:  
1. `(?:^#{1,2} )` A non-capturing group that looks for a line that starts with 1 or 2 '#', and ensures that the next character is a space ' '
2. `(.*?)$` A capturing group that includes all characters until the end of the line. This is our subsection heading.
3. `\s*` matches on any any whitespace (optionally)
4. `(.*?)` A capturing group that includes all characters. This is the subsection text.
5. `(?=\s*^#|\Z)` A negative lookahead that tells us when to stop capturing subsection text. It will stop when it finds another subsection heading or the end of the document. It will include all the whitespace preceding one of these terminating elements, preventing that from being included in the subsection text capture group.

`re.DOTALL` allows the '.' character to match on newlines.  
`re.MULTILINE` makes the '^' and '$' anchors match on the beginning/end of lines instead of the beginning/end of the document. We use '\Z' to match the end of the document in multiline mode.


In [4]:
pattern = re.compile(r'(?:^#{1,2} )(.*?)$\s*(.*?)(?=\s*^#|\Z)', re.DOTALL | re.MULTILINE)

# all these are lowercased because capitalization is inconsistent across MDX files
subsections_to_skip = [
    'learn with videos',
    'please write your summary below',
    'please your write summary below', # ... a perfect example of why this approach is doomed. More than 10 sections have this typo.
    'bring it home',
    'clear it up',
    'work it out',
]

subsections = []

verbose = True
for section in mdx_sections:
    text = section['path'].read_text()
    matches = pattern.findall(text)
    for i, match in enumerate(matches):
        subsection_title = match[0]
        subsection_text = match[1]
        if subsection_title.lower().strip() in subsections_to_skip: # lowercase() and strip() because capitalization and spacing are inconsistent
            if verbose and len(subsection_text) > 10: # if verbose, print the longer sections that we will be EXcluding
                print('-'*80)
                print(subsection_title, '\n', subsection_text)
                print('_'*80)                
            continue
        elif verbose and len(subsection_text) < 100: # if verbose, print the shorter sections that we will still be INcluding
            print('-'*80)
            print(subsection_title, '\n', subsection_text)
            print('_'*80)
        else:
            subsection_dict = {
                **section, # add in section-level metadata
                'subsection': i,
                'heading': subsection_title,
                'raw_text': subsection_text,
            }
            subsection_dict.pop('path')
            subsections.append(subsection_dict)

--------------------------------------------------------------------------------
Learn with Videos 
 <iframe
  width="560"
  height="315"
  src="https://www.youtube.com/embed/dKY2JkfPox4"
  title="YouTube video player"
  frameBorder="0"
  allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
  allowFullScreen
  enablejsapi="1"
></iframe>

import Card from "react-bootstrap/Card";

<Card className="bg-dark text-white" style = {{ width: "35rem"}}>
  <Card.Img
    src="/c1/XOpwg2bIP7zun32w_TWN0KUbcWDqhv-8t-stock-image.jpeg"
    alt="Card image"
    style = {{ height: '25rem', width: "35rem"}}
  />
  <Card.ImgOverlay className = "d-flex align-items-center">
    <Card.Text style={{fontSize: "2em", color:'yellow'}} >
      <span>How 10 Western Cities Are Dealing with Water Scarcity and Drought</span>
      <br/>
      <span>[Read More](https://stateimpact.npr.org/texas/2013/08/02/how-10-western-cities-are-dealing-with-water-scarcity-and-drought/)</sp

In [5]:
df = pd.DataFrame(subsections)
df

Unnamed: 0,module,chapter,section,subsection,heading,raw_text
0,1,1,0,0,Decisions ... Decisions in the Social Media Age,Every day we are faced with a myriad of decisi...
1,1,1,0,1,Introduction,What is economics and why should you spend you...
2,1,1,1,0,Overview,"import Alert from ""react-bootstrap/Alert"";\nim..."
3,1,1,1,1,Introduction to FRED,Data is very important in economics because it...
4,1,1,1,2,The Problem of Scarcity,"Think about all the things you consume: food, ..."
...,...,...,...,...,...,...
512,5,20,4,2,The World Trade Organization,The World Trade Organization (WTO) was officia...
513,5,20,4,3,Regional Trading Agreements,There are different types of economic integrat...
514,5,20,4,5,Trade Policy at the National Level,"Yet another dimension of trade policy, along w..."
515,5,20,4,6,Long-Term Trends in Barriers to Trade,"In newspaper headlines, trade policy appears m..."


## HTML Cleanup

First step is to clean up the markdown files into something closer to standard markdown.

RegEx is used to delete tables and their contents. Without the HTML tags, tables become a mangled list of strings.  
We also use RegEx to remove the javascript import statements and convert links to standard markdown formatting.

Next, we convert the markdown to HTML and use BeautifulSoup to extract the text.

Debugging and testing is done with difflib.

In [6]:
def diff(text, clean_text):
    text_lines = [s.strip() for s in text.splitlines() if s.strip()] # delete empty lines
    clean_text_lines = [s.strip() for s in clean_text.splitlines() if s.strip()] # delete empty lines
    diff = differ.make_file(text_lines, clean_text_lines, fromdesc='Original', todesc='HTML Parsed', context=False, numlines=0)
    display(HTML(diff))

In [7]:
exclamation_links_pattern = re.compile(r'!\[') # Removes the exclamation point in links ![link_text](url) --> [link_text](url)
html_table_pattern = re.compile(r'<Table.*?</Table>\s*', re.DOTALL | re.IGNORECASE) # remove table HTML and its contents
javascript_import_pattern = re.compile(r'^import.*?;', re.MULTILINE) # remove javascript imports

def clean_md(text_md):
    text_md = exclamation_links_pattern.sub(r'[', text_md)
    text_md = html_table_pattern.sub('', text_md)
    text_md = javascript_import_pattern.sub('', text_md)
    return text_md.strip()

In [8]:
def clean_raw_text(subsection_text_mdx):
    subsection_text_md = clean_md(subsection_text_mdx)
    subsection_text_html = markdown.markdown(subsection_text_md, extensions=['extra', 'tables'])
    subsection_text_html = html_table_pattern.sub('', subsection_text_html) # this gets any markdown tables that have now been converted to HTML
    return BeautifulSoup(subsection_text_html, features='html.parser').get_text().strip()

In [9]:
for text in df.raw_text.sample(5):
    clean_text = clean_raw_text(text)
    diff(text, clean_text)

Unnamed: 0,Original,Original.1,Unnamed: 3,HTML Parsed,HTML Parsed.1
f,1,We defined demand as the amount of some product a consumer is willing and able to purchase at each p,f,1,We defined demand as the amount of some product a consumer is willing and able to purchase at each p
,>,rice. That suggests at least two factors in addition to price that affect demand:,,>,rice. That suggests at least two factors in addition to price that affect demand:
t,2,"1. **Willingness to buy** suggests a desire, based on what economists call **tastes and preferences*",t,2,"Willingness to buy suggests a desire, based on what economists call tastes and preferences. If you n"
,>,"*. If you neither need nor want something, you will not buy it.",,>,"either need nor want something, you will not buy it."
,3,2. Ability to purchase suggests that **income** is important. Professors are usually able to afford,,3,Ability to purchase suggests that income is important. Professors are usually able to afford better
,>,"better housing and transportation than students, because they have more income.",,>,"housing and transportation than students, because they have more income."
,4,"3. **Prices of related goods** can affect demand also. If you need a new car, the price of a Honda m",,4,"Prices of related goods can affect demand also. If you need a new car, the price of a Honda may affe"
,>,ay affect your demand for a Ford.,,>,ct your demand for a Ford.
,5,"4. The **size or composition of the population** can affect demand. The more children a family has,",,5,"The size or composition of the population can affect demand. The more children a family has, the gre"
,>,"the greater their demand for clothing. The more driving-age children a family has, the greater their",,>,"ater their demand for clothing. The more driving-age children a family has, the greater their demand"

Legends,Legends.1
Colors Added Changed Deleted,Links (f)irst change (n)ext change (t)op

Colors
Added
Changed
Deleted

Links,Links.1
(f)irst change,
(n)ext change,
(t)op,


Unnamed: 0,Original,Original.1,Unnamed: 3,HTML Parsed,HTML Parsed.1
t,1,We can tell the story of modern economic growth by looking at calorie consumption over time. The dra,t,1,We can tell the story of modern economic growth by looking at calorie consumption over time. The dra
,>,matic rise in incomes allowed the average person to eat better and consume more calories. How did th,,>,matic rise in incomes allowed the average person to eat better and consume more calories. How did th
,>,ese incomes increase? The neoclassical growth consensus uses the aggregate production function to su,,>,ese incomes increase? The neoclassical growth consensus uses the aggregate production function to su
,>,ggest that the period of modern economic growth came about because of increases in inputs such as te,,>,ggest that the period of modern economic growth came about because of increases in inputs such as te
,>,chnology and physical and human capital. Also important was the way in which technological progress,,>,chnology and physical and human capital. Also important was the way in which technological progress
,>,combined with physical and human capital deepening to create growth and convergence. The issue of di,,>,combined with physical and human capital deepening to create growth and convergence. The issue of di
,>,"stribution of income notwithstanding, it is clear that the average worker can afford more calories i",,>,"stribution of income notwithstanding, it is clear that the average worker can afford more calories i"
,>,n 2017 than in 1875.,,>,n 2017 than in 1875.
,2,"Aside from increases in income, there is another reason why the average person can afford more food.",,2,"Aside from increases in income, there is another reason why the average person can afford more food."
,>,Modern agriculture has allowed many countries to produce more food than they need. Despite having m,,>,Modern agriculture has allowed many countries to produce more food than they need. Despite having m

Legends,Legends.1
Colors Added Changed Deleted,Links (f)irst change (n)ext change (t)op

Colors
Added
Changed
Deleted

Links,Links.1
(f)irst change,
(n)ext change,
(t)op,


Unnamed: 0,Original,Original.1,Unnamed: 3,HTML Parsed,HTML Parsed.1
f,1,"Over decades and generations, seemingly small differences of a few percentage points in the annual r",f,1,"Over decades and generations, seemingly small differences of a few percentage points in the annual r"
,>,"ate of economic growth make an enormous difference in GDP per capita. In this module, we discuss som",,>,"ate of economic growth make an enormous difference in GDP per capita. In this module, we discuss som"
,>,"e of the components of economic growth, including physical capital, human capital, and technology.",,>,"e of the components of economic growth, including physical capital, human capital, and technology."
n,2,The category of **physical capital** includes the plant and equipment that firms use as well as thin,n,2,The category of physical capital includes the plant and equipment that firms use as well as things l
,>,"gs like roads (also called **infrastructure**). Again, greater physical capital implies more output.",,>,"ike roads (also called infrastructure). Again, greater physical capital implies more output. Physica"
,>,Physical capital can affect productivity in two ways:,,>,l capital can affect productivity in two ways:
,3,"**1. An increase in the quantity of physical capital (for example, more computers of the same qualit",,3,"1. An increase in the quantity of physical capital (for example, more computers of the same quality)"
,>,y)**,,,
,4,**2. An increase in the quality of physical capital (same number of computers but the computers are,,4,2. An increase in the quality of physical capital (same number of computers but the computers are fa
,>,"faster, and so on).**",,>,"ster, and so on)."

Legends,Legends.1
Colors Added Changed Deleted,Links (f)irst change (n)ext change (t)op

Colors
Added
Changed
Deleted

Links,Links.1
(f)irst change,
(n)ext change,
(t)op,


Unnamed: 0,Original,Original.1,Unnamed: 3,HTML Parsed,HTML Parsed.1
n,1,"<Alert variant=""primary"">",n,,
,2,<Alert.Heading>,,,
,3,"By the end of this section, you will be able to:",,1,"By the end of this section, you will be able to:"
n,4,</Alert.Heading>,n,,
,5,- Define and contrast nominal GDP and real GDP - Explain GDP deflator - Calculate,,2,- Define and contrast nominal GDP and real GDP - Explain GDP deflator - Calculate
,6,real GDP based on nominal GDP values,,3,real GDP based on nominal GDP values
t,7,</Alert>,t,,
,8,"When examining economic statistics, there is a crucial distinction worth emphasizing. The distinctio",,4,"When examining economic statistics, there is a crucial distinction worth emphasizing. The distinctio"
,>,"n is between nominal and real measurements, which refer to whether or not **inflation** has distorte",,>,"n is between nominal and real measurements, which refer to whether or not inflation has distorted a"
,>,d a given statistic. Looking at economic statistics without considering inflation is like looking th,,>,given statistic. Looking at economic statistics without considering inflation is like looking throug

Legends,Legends.1
Colors Added Changed Deleted,Links (f)irst change (n)ext change (t)op

Colors
Added
Changed
Deleted

Links,Links.1
(f)irst change,
(n)ext change,
(t)op,


Unnamed: 0,Original,Original.1,Unnamed: 3,HTML Parsed,HTML Parsed.1
f,1,The national saving and investment identity also provides a framework for thinking about what will c,f,1,The national saving and investment identity also provides a framework for thinking about what will c
,>,ause trade deficits to rise or fall. Begin with the version of the identity that has domestic saving,,>,ause trade deficits to rise or fall. Begin with the version of the identity that has domestic saving
,>,s and investment on the left and the trade deficit on the right:,,>,s and investment on the left and the trade deficit on the right:
n,2,<br />,n,,
,3,<center>,,,
,4,"<font size=""4"" face=""serif"">",,,
,5,Domestic investment – Private domestic savings – Public domestic savings = Trade deficit,,2,Domestic investment – Private domestic savings – Public domestic savings = Trade deficit
n,6,</font>,n,,
,7,</center>,,,
,8,<center>,,,

Legends,Legends.1
Colors Added Changed Deleted,Links (f)irst change (n)ext change (t)op

Colors
Added
Changed
Deleted

Links,Links.1
(f)irst change,
(n)ext change,
(t)op,


In [10]:
df['clean_text'] = df.raw_text.apply(clean_raw_text)

## Generate Slugs

In [11]:
heading_list = ['Overview', "Say's Law and the Macroeconomics of Supply", "Keynes' Law and the Macroeconomics of Demand", 'Please write your summary below', 'Overview', 'Clear It Up', 'How Changes by Consumers and Firms Can Affect AD', 'How Government Macroeconomic Policy Choices Can Shift AD', 'Clear It Up', 'Learn with Videos', 'Please write your summary below', 'Overview']
remove_these = string.punctuation + '—”“'

def generate_slugs(heading_list):
    slug_list1 = [heading.lower().replace('-', ' ').translate(str.maketrans('', '', remove_these)).replace(' ', '-') for heading in heading_list] 
    slug_list = []
    i = 0
    for word in slug_list1:
        if slug_list1[:i].count(word) < 1:
            slug_list.append(word)
        else:
            slug_list.append(word+'-'+str(slug_list1[:i].count(word)))       
        i += 1
    return slug_list

def make_slug_list(df):
    slug_list = []
    for chapter in df['chapter'].drop_duplicates():
        df1 = df[df['chapter']==chapter]
        for section in df1['section'].drop_duplicates():
            df2 = df1[df1['section']==section]
            slug_list.append(generate_slugs(list(df2['heading'])))
    return sum(slug_list,[])

In [12]:
df['slug'] = make_slug_list(df)

In [19]:
df['id'] = df.apply(lambda row: str(row['chapter']) + '-' + str(row['section']) + '-' + str(row['subsection']) + '-' + row['slug'], axis=1)
df = df[['id', 'module', 'chapter', 'section', 'subsection', 'heading', 'raw_text', 'clean_text', 'slug']]
df

Unnamed: 0,id,module,chapter,section,subsection,heading,raw_text,clean_text,slug
0,1-0-0-decisions--decisions-in-the-social-media...,1,1,0,0,Decisions ... Decisions in the Social Media Age,Every day we are faced with a myriad of decisi...,Every day we are faced with a myriad of decisi...,decisions--decisions-in-the-social-media-age
1,1-0-1-introduction,1,1,0,1,Introduction,What is economics and why should you spend you...,What is economics and why should you spend you...,introduction
2,1-1-0-overview,1,1,1,0,Overview,"import Alert from ""react-bootstrap/Alert"";\nim...","By the end of this section, you will be able t...",overview
3,1-1-1-introduction-to-fred,1,1,1,1,Introduction to FRED,Data is very important in economics because it...,Data is very important in economics because it...,introduction-to-fred
4,1-1-2-the-problem-of-scarcity,1,1,1,2,The Problem of Scarcity,"Think about all the things you consume: food, ...","Think about all the things you consume: food, ...",the-problem-of-scarcity
...,...,...,...,...,...,...,...,...,...
512,20-4-2-the-world-trade-organization,5,20,4,2,The World Trade Organization,The World Trade Organization (WTO) was officia...,The World Trade Organization (WTO) was officia...,the-world-trade-organization
513,20-4-3-regional-trading-agreements,5,20,4,3,Regional Trading Agreements,There are different types of economic integrat...,There are different types of economic integrat...,regional-trading-agreements
514,20-4-5-trade-policy-at-the-national-level,5,20,4,5,Trade Policy at the National Level,"Yet another dimension of trade policy, along w...","Yet another dimension of trade policy, along w...",trade-policy-at-the-national-level
515,20-4-6-long-term-trends-in-barriers-to-trade,5,20,4,6,Long-Term Trends in Barriers to Trade,"In newspaper headlines, trade policy appears m...","In newspaper headlines, trade policy appears m...",long-term-trends-in-barriers-to-trade


## Save

In [None]:
df.to_csv('../data/subsections.csv', index=False)