# Import and set variables

1. Import all of the modules we need
2. Set a few variables for later

In [330]:
import os
import csv
from marko import parse, render
from bs4 import BeautifulSoup
import numpy as np
from datetime import datetime

# Folder that is scanned for markdown files.
md_folder = "markdown_files"
# Time of this run, formatted as YYYYMMDD-HHMMSS; it is embedded in the report name.
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
# File name of the CSV report.
report_name = f"{md_folder}-{timestamp}.csv"

## Normalize Text Function

In [331]:
def normalize_text(html_string):
    """Return the inner text of an HTML string as a single line.

    Args:
        html_string: HTML markup (in this notebook, the output of marko's
            ``render()`` for one element).

    Returns:
        The text content with leading/trailing whitespace stripped and every
        remaining newline replaced by ", ".
    """
    ## Name the parser explicitly: without it BeautifulSoup warns and picks
    ## whichever parser happens to be installed, so results can vary by machine.
    soup = BeautifulSoup(html_string, "html.parser")
    text = soup.get_text().strip()
    ## Newlines are replaced (not dropped) so multi-line content such as list
    ## items stays separated inside a single CSV cell.
    return text.replace("\n", ", ")


## Create File Row

1. Open markdown file and parse markdown elements with the `marko` module.
2. Find indices of all the headings in the markdown elements array.
3. Group headings and their following content into arrays based on those indices.
4. Iterate through this array of arrays to create a dictionary that will be understood by `csv.DictWriter()`.
5. Return the row and a list of column names.

In [332]:

def create_file_row(filepath):
    """Build one report row from a markdown file, with one column per heading.

    The file is read as UTF-8 and parsed with ``marko``. Its top-level elements
    are grouped so that each group starts at a heading and holds the content
    that follows it. Each heading becomes a column named "<#...> <heading text>"
    (one '#' per heading level), and the normalized text of the non-blank
    elements under it becomes that column's value.

    Args:
        filepath: Path of the markdown file to read.

    Returns:
        A two-item list ``[row, columns]``. ``row`` is a dict mapping column
        name to a string (multiple values under one heading are joined with
        "; "); it always contains a "filename" column holding the file's base
        name. ``columns`` is the list of ``row``'s keys in insertion order.
        Content that appears before the first heading is stored under the
        empty-string column "".
    """
    with open(filepath, mode='r', encoding='utf-8') as txtfile:
        document = parse(txtfile.read())

    children = document.children

    ## Positions of every heading among the top-level elements.
    heading_indices = [
        idx for idx, child in enumerate(children)
        if child.get_type() == "Heading"
    ]
    ## NOTE(review): only elements whose get_type() is "Heading" start a new
    ## column; confirm whether setext-style headings report a different type.

    ## Slice the elements at each heading position; plain list slicing avoids
    ## converting arbitrary marko objects into a numpy array. Empty slices
    ## (e.g. when the file starts with a heading) are dropped.
    boundaries = [0, *heading_indices, len(children)]
    groups = [
        children[start:end]
        for start, end in zip(boundaries, boundaries[1:])
        if start < end
    ]

    row = {"filename": [os.path.basename(filepath)]}

    for elems in groups:
        heading = ""
        for e in elems:
            elem_type = e.get_type()
            ## Add heading value as a key if it isn't there already
            if elem_type == "Heading":
                level = '#' * e.level
                heading = f"{level} {normalize_text(render(e))}"
                row.setdefault(heading, [])
            ## If the element isn't a blank line, add its text under the heading.
            ## setdefault keeps content that precedes the first heading (where
            ## heading is still "") from raising KeyError.
            elif elem_type != "BlankLine":
                row.setdefault(heading, []).append(normalize_text(render(e)))

    ## If there is more than one value for a heading in this row, join with semicolon
    row = {k: "; ".join(v) for k, v in row.items()}
    return [row, list(row.keys())]



## Iterate through folder

1. Create blank arrays for column names and rows.
2. Iterate through folder defined at `md_folder`.
3. If the file is `*.md`, pass the filepath to `create_file_row()`.
4. Add resulting row to rows array.
5. Add new columns to columns array.

In [333]:

columns = []
rows = []

## os.scandir() yields entries in arbitrary order, so sort by name to make the
## row order (and the column order derived from it) reproducible between runs.
## The `with` block closes the directory iterator once the listing is taken.
with os.scandir(md_folder) as entries:
    md_entries = sorted(
        (
            entry for entry in entries
            ## Skip directories that merely end in ".md"; open() would fail on them.
            if entry.is_file() and os.path.splitext(entry.name)[1] == ".md"
        ),
        key=lambda entry: entry.name,
    )

for file in md_entries:
    row, row_columns = create_file_row(file.path)
    rows.append(row)
    ## Keep columns in order of first appearance across all files.
    for column in row_columns:
        if column not in columns:
            columns.append(column)

## Write rows to file

In [334]:
## newline='' hands line-ending control to the csv module; without it, platforms
## that translate "\n" on write (e.g. Windows) produce "\r\r\n" and the report
## shows a blank line after every row.
with open(report_name, mode='w', encoding='utf-8', newline='') as csvfile:
    csvwriter = csv.DictWriter(csvfile, fieldnames=columns)
    csvwriter.writeheader()
    ## Rows that lack some columns are filled with DictWriter's default restval ('').
    csvwriter.writerows(rows)

print(report_name)

markdown_files-20230525-221252.csv
