In [1]:
import re
import pandas as pd
from pathlib import Path
import string

import markdown
from bs4 import BeautifulSoup
import string

In [2]:
# --- Notebook configuration -------------------------------------------------
# Root directory that is searched (recursively) for the textbook's index.mdx files.
PATH_TO_CONTENTS = '/home/jovyan/active-projects/macro-economics-textbook/contents/'

# Headings (compared lowercased and stripped) of subsections to leave out.
# The list is empty, so nothing is skipped. Titles that used to be listed here:
#     'please write your summary below'
#     'please your write summary below'   <- typo found in more than 10 sections;
#                                            a perfect example of why matching on
#                                            exact titles is doomed
#     'bring it home'
#     'clear it up'
#     'work it out'
subsections_to_skip = []

# Set to True to make the parsing cells print diagnostics while they run.
verbose = False

In [44]:
# Get markdown and infer metadata
# Every index.mdx below the contents root becomes one entry of `mdx_sections`; its
# module / chapter / section numbers are read from the names of the directories
# that contain it.
CONTENTS = Path(PATH_TO_CONTENTS)

mdx_sections = []

for path in sorted(CONTENTS.glob('**/index.mdx')):
    # Relative to CONTENTS itself rather than a second hard-coded copy of the path,
    # so editing PATH_TO_CONTENTS is enough to point the notebook somewhere else.
    rel_path = path.relative_to(CONTENTS)
    # Directory names from the outermost folder down to the one holding index.mdx
    # (the file name itself, the last part, is dropped).
    dir_names = iter(rel_path.parts[:-1])
    # The number is the token that follows the first '-' in each directory name.
    module = next(dir_names).split('-')[1] # all .mdx files belong to a module
    chapter = next(dir_names, '-0').split('-')[1] # no chapter directory -> chapter number 0
    section = next(dir_names, '-0').split('-')[1] # no section directory -> section number 0
    if verbose:
        print(f'{path.as_posix():<60}Module {module}, Chapter {chapter}, Section {section}')
    mdx_sections.append({
        'module': module,
        'chapter': chapter,
        'section': section,
        'path': path,
    })

    
# Parse with Regex
## Group 1: the title of a level-1 or level-2 heading ('# ' or '## ').
## Group 2: the text after that heading, up to (not including) the next line that
## starts with '#', or the end of the file.
pattern = re.compile(r'(?:^#{1,2} )(.*?)$\s*(.*?)(?=\s*^#|\Z)', re.DOTALL | re.MULTILINE)

subsections = []

for section in mdx_sections:
    # Decode explicitly so the result does not depend on the platform's default encoding.
    text = section['path'].read_text(encoding='utf-8')
    # Section-level metadata without the file path, which is not wanted in the table.
    section_metadata = {key: value for key, value in section.items() if key != 'path'}
    for i, (subsection_title, subsection_text) in enumerate(pattern.findall(text)):
        # lower() and strip() because capitalization and spacing are inconsistent
        skipped = subsection_title.lower().strip() in subsections_to_skip
        if skipped:
            noteworthy = len(subsection_text) > 10 # longer sections that we will be EXcluding
        else:
            noteworthy = len(subsection_text) < 100 # shorter sections that we will still be INcluding
        if verbose and noteworthy:
            print('-'*80)
            print(subsection_title, '\n', subsection_text)
            print('_'*80)
        if skipped:
            continue
        # Printing is diagnostics only: a kept subsection is appended whether or not
        # `verbose` is on, so the resulting table never depends on verbosity.
        subsections.append({
            **section_metadata,
            'subsection': i, # position among ALL headings of the file, skipped ones included
            'heading': subsection_title,
            'raw_text': subsection_text,
        })

df = pd.DataFrame(subsections)

# HTML cleanup
def diff(text, clean_text):
    """Build a side-by-side HTML comparison of a text before and after cleaning.

    Both inputs are split into lines; every line is stripped and blank lines are
    dropped before comparing, so only content differences show up.

    Args:
        text: the original text (shown in the column titled 'Original').
        clean_text: the cleaned text (shown in the column titled 'HTML Parsed').

    Returns:
        str: a complete HTML document produced by ``difflib.HtmlDiff.make_file``.
        In a notebook it can be rendered with ``IPython.display.HTML``.
    """
    # Imported here so the function does not rely on a module-level `differ`
    # object, which no visible cell creates.
    import difflib

    text_lines = [s.strip() for s in text.splitlines() if s.strip()] # delete empty lines
    clean_text_lines = [s.strip() for s in clean_text.splitlines() if s.strip()] # delete empty lines
    return difflib.HtmlDiff().make_file(
        text_lines,
        clean_text_lines,
        fromdesc='Original',
        todesc='HTML Parsed',
        context=False,
        numlines=0,
    )

# '![' -> '[' : turns image links ![link_text](url) into plain links [link_text](url)
exclamation_links_pattern = re.compile(r'!\[')
# <Table ...> ... </Table> (any letter case), its contents and the whitespace after it
html_table_pattern = re.compile(r'<Table.*?</Table>\s*', re.DOTALL | re.IGNORECASE)
# a line that starts with 'import', up to the first ';' on that line (javascript imports)
javascript_import_pattern = re.compile(r'^import.*?;', re.MULTILINE)

def clean_md(text_md):
    """Strip MDX-only constructs from a markdown string.

    Applied in this order: image-link exclamation marks are dropped, HTML
    ``<Table>`` blocks are removed, javascript ``import ...;`` statements are
    removed. The result is returned without leading/trailing whitespace.
    """
    substitutions = (
        (exclamation_links_pattern, r'['),
        (html_table_pattern, ''),
        (javascript_import_pattern, ''),
    )
    cleaned = text_md
    for regex, replacement in substitutions:
        cleaned = regex.sub(replacement, cleaned)
    return cleaned.strip()

def clean_raw_text(subsection_text_mdx):
    """Convert a subsection's raw MDX into plain text.

    Steps: clean the MDX with ``clean_md``, render the result to HTML with the
    ``markdown`` package, drop any ``<table>`` blocks from that HTML, then let
    BeautifulSoup extract the text. The returned string is stripped.
    """
    markdown_source = clean_md(subsection_text_mdx)
    rendered_html = markdown.markdown(markdown_source, extensions=['extra', 'tables'])
    # Markdown tables only become <table> elements after rendering, so the table
    # pattern has to be applied a second time here.
    html_without_tables = html_table_pattern.sub('', rendered_html)
    soup = BeautifulSoup(html_without_tables, features='html.parser')
    return soup.get_text().strip()




from youtube_transcript_api import YouTubeTranscriptApi

def _extract_youtube_video_id(raw_text):
    """Return the id of the first YouTube video URL found in ``raw_text``.

    Recognized forms: ``youtube.com/embed/<id>``, ``youtube.com/watch?...v=<id>``
    and ``youtu.be/<id>``. Only the id characters (letters, digits, '_' and '-')
    are captured, so whatever follows the URL on the same line (a closing quote,
    further attributes, ``></iframe>``) cannot leak into the id.

    Raises:
        ValueError: if ``raw_text`` contains no such URL.
    """
    match = re.search(
        r'https?://(?:www\.)?'
        r'(?:youtube\.com/(?:embed/|watch\?(?:[^\s"\'<>]*&)?v=)|youtu\.be/)'
        r'([\w-]+)',
        raw_text,
    )
    if match is None:
        raise ValueError('no YouTube video URL found in raw_text')
    return match.group(1)


def get_transcript(raw_text, timestamps=False):
    """Fetch the transcript of the YouTube video referenced in ``raw_text``.

    The video is taken from the ``raw_text`` argument itself, so each subsection
    gets the transcript of its own video and the function does not depend on any
    notebook-level DataFrame.

    Args:
        raw_text: text (e.g. an ``<iframe>`` snippet) containing a YouTube URL.
        timestamps: if true, return the entries exactly as delivered by
            ``YouTubeTranscriptApi.get_transcript``; otherwise return their
            ``'text'`` values joined with single spaces.

    Returns:
        str when ``timestamps`` is false, else the API's list of entries.

    Raises:
        ValueError: if no YouTube URL is found in ``raw_text``.
        Whatever ``YouTubeTranscriptApi.get_transcript`` raises is propagated.
    """
    video_code = _extract_youtube_video_id(raw_text)
    srt = YouTubeTranscriptApi.get_transcript(video_code)
    if timestamps:
        return srt
    return ' '.join(entry['text'] for entry in srt)

def _clean_text_for_row(row):
    """Return the transcript for a 'Learn with Videos' row, else the cleaned raw text."""
    if row['heading'].lower() == 'learn with videos':
        return get_transcript(row['raw_text'])
    return clean_raw_text(row['raw_text'])

df['clean_text'] = df.apply(_clean_text_for_row, axis=1)


# Generate slugs
# Characters deleted from a heading when it is turned into a slug:
# ASCII punctuation plus the em dash and curly double quotes.
remove_these = string.punctuation + '—”“'

def generate_slugs(heading_list):
    """Turn headings into slugs, numbering repeated slugs.

    Each heading is lowercased, its hyphens become spaces, every character in
    ``remove_these`` is deleted and every space becomes a hyphen (consecutive
    spaces therefore give consecutive hyphens). The first occurrence of a slug
    is kept as is; the k-th repeat (k = 1, 2, ...) gets the suffix ``-k``.

    Args:
        heading_list: headings, in order of appearance.

    Returns:
        list of str: one slug per heading, in the same order.
    """
    # Build the deletion table once instead of once per heading.
    deletion_table = str.maketrans('', '', remove_these)
    # How many times each base slug has been produced so far; a dict lookup
    # replaces re-counting the whole prefix of the list for every heading.
    times_seen = {}
    slug_list = []
    for heading in heading_list:
        base = heading.lower().replace('-', ' ').translate(deletion_table).replace(' ', '-')
        earlier = times_seen.get(base, 0)
        slug_list.append(base if earlier == 0 else f'{base}-{earlier}')
        times_seen[base] = earlier + 1
    return slug_list

def make_slug_list(df):
    """Return one slug per row of ``df``, in the same order as the rows.

    Rows are grouped into pages by chapter and section (and by module too when
    a 'module' column is present, so that pages of different modules that share
    chapter/section numbers do not share a duplicate counter). Duplicate
    headings are numbered per page by ``generate_slugs``. Each slug is written
    back to the position of the row it came from, so the result can be assigned
    directly as a column even when the rows of a page are not adjacent in ``df``.

    Args:
        df: DataFrame with 'chapter', 'section' and 'heading' columns
            (optionally 'module').

    Returns:
        list of str with ``len(df)`` entries.
    """
    key_columns = ['chapter', 'section']
    if 'module' in df.columns:
        key_columns.insert(0, 'module')

    headings = list(df['heading'])

    # page key -> row positions, both in order of first appearance
    rows_by_page = {}
    for position, page_key in enumerate(zip(*(df[name] for name in key_columns))):
        rows_by_page.setdefault(page_key, []).append(position)

    slug_list = [None] * len(headings)
    for positions in rows_by_page.values():
        page_slugs = generate_slugs([headings[position] for position in positions])
        for position, slug in zip(positions, page_slugs):
            slug_list[position] = slug
    return slug_list

df['slug'] = make_slug_list(df)

# Add unique id for each subsection: '<chapter>-<section>-<subsection>-<slug>'
# (column-wise string concatenation instead of a Python call per row)
df['id'] = (
    df['chapter'].astype(str) + '-'
    + df['section'].astype(str) + '-'
    + df['subsection'].astype(str) + '-'
    + df['slug']
)

# Fix the column order and make the numbering columns integers. The cast is part
# of the same expression that selects the columns, so nothing is ever assigned
# into a slice of another frame.
numbering_columns = ['module', 'chapter', 'section', 'subsection']
df = (
    df[['id', 'module', 'chapter', 'section', 'subsection', 'heading', 'raw_text', 'clean_text', 'slug']]
    .astype({column: int for column in numbering_columns})
)
df.to_csv('subsections.csv', index=False)

In [4]:
# Slug check against the reference file.
# For every chapter/section found in ../data/all_slugs.csv, each slug in its
# 'gatsby' column is looked up among the slugs generated above for the same
# chapter/section; any reference slug that is missing is printed as
# "<chapter> <section> <slug>".
# NOTE(review): the comparison relies on df's chapter/section having the same type as
# the CSV's columns (they are cast to int before subsections.csv is written) — confirm.

reference_df = pd.read_csv('../data/all_slugs.csv')

for chapter in reference_df['chapter'].drop_duplicates():
    reference_rows = reference_df.loc[reference_df['chapter'] == chapter]
    generated_rows = df.loc[df['chapter'] == chapter]

    for section in reference_rows['section'].drop_duplicates():
        expected_slugs = list(reference_rows.loc[reference_rows['section'] == section, 'gatsby'])
        generated_slugs = list(generated_rows.loc[generated_rows['section'] == section, 'slug'])

        missing = [slug for slug in expected_slugs if slug not in generated_slugs]
        for slug in missing:
            print(chapter, section, slug)

In [7]:
# Keep only the rows whose heading, lowercased, is exactly 'learn with videos'.
is_video_heading = df['heading'].str.lower() == 'learn with videos'
video_df = df[is_video_heading]

In [43]:
# Display the assembled subsection table as the cell's output.
df

Unnamed: 0,id,module,chapter,section,subsection,heading,raw_text,clean_text,slug
0,1-0-0-decisions--decisions-in-the-social-media...,1,1,0,0,Decisions ... Decisions in the Social Media Age,Every day we are faced with a myriad of decisi...,Every day we are faced with a myriad of decisi...,decisions--decisions-in-the-social-media-age
1,1-0-1-introduction,1,1,0,1,Introduction,What is economics and why should you spend you...,What is economics and why should you spend you...,introduction
2,1-0-2-please-write-your-summary-below,1,1,0,2,Please write your summary below,,,please-write-your-summary-below
3,1-1-0-overview,1,1,1,0,Overview,"import Alert from ""react-bootstrap/Alert"";\nim...","By the end of this section, you will be able t...",overview
4,1-1-1-introduction-to-fred,1,1,1,1,Introduction to FRED,Data is very important in economics because it...,Data is very important in economics because it...,introduction-to-fred
...,...,...,...,...,...,...,...,...,...
836,20-4-6-long-term-trends-in-barriers-to-trade,5,20,4,6,Long-Term Trends in Barriers to Trade,"In newspaper headlines, trade policy appears m...","In newspaper headlines, trade policy appears m...",long-term-trends-in-barriers-to-trade
837,20-4-7-learn-with-videos-2,5,20,4,7,Learn with Videos,"<iframe width=""560"" height=""315"" src=""https://...",hey how you doing students my name is mr. Clif...,learn-with-videos-2
838,20-4-8-please-your-write-summary-below,5,20,4,8,Please your write summary below,,,please-your-write-summary-below
839,20-5-0-overview,5,20,5,0,Overview,"<Alert variant=""primary"">\n <Alert.Heading>\n...","By the end of this section, you will be able t...",overview


In [14]:
# Display the raw MDX text of one subsection.
# NOTE(review): no visible cell assigns `raw_text` at notebook scope, so this cell
# presumably relies on a variable left over from an interactive session — confirm,
# or define it explicitly before this cell.
raw_text

'<iframe\n  width="560"\n  height="315"\n  src="https://www.youtube.com/embed/dKY2JkfPox4"\n  title="YouTube video player"\n  frameBorder="0"\n  allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"\n  allowFullScreen\n  enablejsapi="1"\n></iframe>\n\nimport Card from "react-bootstrap/Card";\n\n<Card className="bg-dark text-white" style = {{ width: "35rem"}}>\n  <Card.Img\n    src="/c1/XOpwg2bIP7zun32w_TWN0KUbcWDqhv-8t-stock-image.jpeg"\n    alt="Card image"\n    style = {{ height: \'25rem\', width: "35rem"}}\n  />\n  <Card.ImgOverlay className = "d-flex align-items-center">\n    <Card.Text style={{fontSize: "2em", color:\'yellow\'}} >\n      <span>How 10 Western Cities Are Dealing with Water Scarcity and Drought</span>\n      <br/>\n      <span>[Read More](https://stateimpact.npr.org/texas/2013/08/02/how-10-western-cities-are-dealing-with-water-scarcity-and-drought/)</span>\n    </Card.Text>\n\n  </Card.ImgOverlay>\n</Card>\n\n<br />\n<br />'