## HACKED TOGETHER TOOL TO TAKE OLD REVEAL.JS SLIDES WITHOUT SLIDEMACHINE AND MAKE ROUGH MARKDOWN

In [5]:
import re, os, shutil
import urllib.request
import glob

def get_elements(elem,html):
    """
    Get all pairs of 'elem' in an html file.

    elem: tag name to look for.  It is placed into a regular expression
          without escaping, so a pattern such as 'h[1-4]' also works.
    html: string holding the html to search.

    Returns a list of (begin,end) tuples, one per matched pair, in the order
    in which the closing tags appear.  'begin' and 'end' are each
    (matched_text,start_index,end_index) for the opening and closing tag.

    Raises ValueError if an odd number of tags is found or if a closing tag
    appears with no unpaired opening tag before it.
    """

    # Element search patterns.  The lookahead forces the tag name to be
    # followed by whitespace, '/' or '>', so that looking for 'p' does not
    # also grab <pre> and looking for 'li' does not also grab <link>.
    elem_begin_pattern = re.compile(rf"<{elem}(?=[\s/>]).*?>")
    elem_end_pattern = re.compile(rf"</{elem}>")

    # Collect (position,kind,match) for every opening and closing tag
    tags = [(m.start(),"begin",m) for m in elem_begin_pattern.finditer(html)]
    tags.extend((m.start(),"end",m) for m in elem_end_pattern.finditer(html))

    # Make sure we have the right number of elements -- must be divisible by
    # two.
    if len(tags) % 2 != 0:
        err = ["mismatched element!","elements found:"]
        err.extend(f"    {t[2].group(0)}" for t in tags)
        err.append("")
        raise ValueError("\n".join(err))

    # Walk through the tags in document order
    tags.sort(key=lambda t: t[0])

    # open_tags is a stack of opening tags that have not been closed yet
    open_tags = []

    # paired holds paired elements
    paired = []
    for _, kind, match in tags:

        if kind == "begin":
            open_tags.append(match)
            continue

        this_end = (match.group(0),match.start(),match.end())

        # A closing tag pairs with the most recent unclosed opening tag.  If
        # there is none, the html is not properly nested.
        if not open_tags:
            err = ["elements not properly nested",
                   "closing tag has no opening tag before it:",
                   f"    {this_end}"]
            raise ValueError("\n".join(err))

        opener = open_tags.pop()
        this_begin = (opener.group(0),opener.start(),opener.end())
        paired.append((this_begin,this_end))

    return paired

def build_attribute_pattern(attr,value):
    """
    Create pattern for finding attr that has value occur at least once
    as a whole, whitespace-separated word in its double-quoted string.  For
    example, attr might be 'class' and value might be 'container'.  This
    would match the following:

    <div class="container">
    <div class="container ">
    <div class=" container ">
    <div class=" container another_class">
    <div class="yo container another_class">

    It would not match the following:
    <div class="container-box">
    <div class="another_class">

    attr and value are placed into the regular expression without escaping.

    Returns a compiled regular expression; use its search() method on the
    text of an opening tag.
    """

    # Inside the quotes: optionally some other words ending in whitespace,
    # then the value, then optionally whitespace and more words.
    pattern_str = rf'{attr}="(?:[^"]*\s)?{value}(?:\s[^"]*)?"'
    return re.compile(pattern_str)

def strip_outermost_tags(html):
    """
    Rip off the outermost html tags.

    html: text that starts with an opening tag, e.g. '<p class="x">hi</p>'
          or a lone tag such as '<img src="x" />'.

    Returns (stripped,tag,attributes):
        stripped:   everything between the first '>' and the last '<'
        tag:        name of the opening tag, without '<' or a trailing '/'
        attributes: the rest of the opening tag as a single string, with
                    surrounding whitespace and any trailing '/' removed
                    ('' if there are no attributes)
    """

    # Opening tag is everything up to the first '>'
    opening, _, remainder = html.partition(">")

    # Contents run up to the '<' that starts the final closing tag
    stripped = remainder.rpartition("<")[0]

    # Split tag name from attributes on any whitespace (space, tab, newline)
    pieces = opening.split(None,1)
    tag = pieces[0][1:] if pieces else ""
    attributes = pieces[1].strip() if len(pieces) > 1 else ""

    # Drop the '/' of a self-closing tag from whichever part it landed in
    if attributes.endswith("/"):
        attributes = attributes[:-1].strip()
    tag = tag.rstrip("/")

    return stripped, tag, attributes

def strip_leading_space(html):
    """
    Preserve indentation, except at top level.  For example:

            <div>
                <ul>
                    <li>1</li>
                </ul>
            </div>

    Becomes:
    <div>
        <ul>
            <li>1</li>
        </ul>
    </div>

    Tabs are converted to four spaces and blank lines are dropped before the
    common indentation is measured.

    Returns the dedented text as a string ('' if html has no non-blank
    lines).
    """

    html = html.replace("\t","    ")
    lines = [l for l in html.split("\n") if l.strip() != ""]

    if not lines:
        return ""

    # Smallest number of leading spaces on any line
    to_whack = min(len(l) - len(l.lstrip(" ")) for l in lines)

    # Slice the common indent off the front only; deeper indentation and
    # runs of spaces inside the line are left alone.
    return "\n".join(l[to_whack:] for l in lines)


class HtmlToMarkdown:
    """
    Base class for the HTML to Markdown conversion process.

    Must be subclassed to be useful.  A subclass hands its tag name to
    __init__ and overrides _substitute to produce markdown.  A subclass that
    handles a tag with no closing partner (ala <img xxx />) sets
    self._single_pattern to a compiled regex that matches the whole tag.
    """

    def __init__(self,elem):
        """
        Init function.  Subclasses should call via super().__init__(elem) where
        elem is the html tag (e.g. p, h1, etc.)
        """

        self._elem = elem
        self._div_start = re.compile("<div.*?>")
        self._div_end = re.compile("</div>")

        # Set by subclasses that match a single tag rather than a pair
        self._single_pattern = None

        # Overwritten on every call to process; given a value here so that
        # _find_elements can also be used before process has been called.
        self._protect_div = True


    def _find_elements(self,html):
        """
        Find next element to replace, returning protection mask given current
        html.

        Returns ((matched_text,start,end),protected).  matched_text is the
        opening tag (or the whole tag when self._single_pattern is set),
        start and end delimit the element in html, and protected is a list
        of (start,end) spans that lie inside <div> blocks.  Returns
        (None,None) when no element is left.

        Raises ValueError if the html cannot be parsed into matched pairs.
        """

        # Find protected blocks.  Anything inside of <div> should be treated as
        # raw html in the final markdown.
        protected = []
        if self._protect_div:
            for div_begin, div_end in get_elements("div",html):
                protected.append((div_begin[1],div_end[1]))

        # If this is something different (ala <img xxx />)
        if self._single_pattern:

            # Go to last match
            last = None
            for last in self._single_pattern.finditer(html):
                pass

            if last is None:
                return None, None

            return (last.group(0),last.start(),last.end()), protected

        # Otherwise this is a conventional <elem></elem> bit of html
        try:
            pairs = get_elements(self._elem,html)
        except ValueError as val_err:
            err = f"Error parsing html:\n\n{html}\n\n{val_err}"
            raise ValueError(err) from val_err

        # If we found nothing, return None
        if len(pairs) == 0:
            return None, None

        # Work on the pair whose closing tag comes last: opening tag text,
        # start of opening tag, end of closing tag.
        begin, end = pairs[-1]
        return (begin[0],begin[1],end[2]), protected


    def process(self,html,protect_div=True,strip_right=True):
        """
        Generic processor.  Generally should be usable directly by each subclass.

        html: html to process
        protect_div: whether or not to protect html inside <div> and not
                     convert it to markdown
        strip_right: whether or not to strip space from right side of pattern and
                     stick on a newline.

        Returns the html with every unprotected element of this type replaced
        by whatever _substitute produces.
        """

        self._protect_div = protect_div

        element, protected = self._find_elements(html)

        # counter is handed to _substitute; it counts elements in the order
        # they are handled, which is last element first.
        counter = 1
        while element is not None:

            m, start, end = element

            before = html[:start]
            inside = html[start:end]
            after = html[end:]

            if strip_right:
                after = after.rstrip()

            # If the element is inside a protected block, mangle its name
            # so we don't find it again, and skip the conversion.
            if any(start > p[0] and end < p[1] for p in protected):
                inside = re.sub(self._elem,"__mangled__",inside)
                html = f"{before}{inside}{after}"

            # Otherwise convert to markdown
            else:
                inside, _, attributes = strip_outermost_tags(inside)
                html = self._substitute(before,inside,after,m,counter,attributes)

                if strip_right:
                    html = f"{html}\n"

            counter += 1

            # Find new elements and protection given altered html.
            element, protected = self._find_elements(html)

        # Remove any mangled tags used for protecting divs.
        # NOTE: self._elem is written back literally, so an elem that is a
        # regex (e.g. 'h[1-4]') is not restored to the original tag name.
        if self._protect_div:
            html = re.sub("__mangled__",self._elem,html)

        return html

    def _substitute(self,before,inside,after,m,counter,attributes):
        """
        Build the replacement text for one element.  Subclasses override this.

        before:     html in front of the element
        inside:     contents of the element with its outermost tags removed
        after:      html following the element
        m:          text of the matched opening tag (or whole single tag)
        counter:    1 for the first element handled, 2 for the next, ...
        attributes: attribute string of the opening tag ('' if none)

        The base implementation returns only 'inside'.
        """
        return inside
        
class ParagraphToMarkdown(HtmlToMarkdown):
    """
    Parse a <p> tag.  The paragraph text is kept and followed by a blank
    line.  If <p> has any attributes, the text is wrapped in a <span> that
    carries them.
    """

    def __init__(self):
        super().__init__(elem="p")

    def process(self,html,protect_div=True,right_strip=True):
        """
        Convert every <p> in html.  right_strip is passed to the base class
        as its strip_right argument.
        """
        return super().process(html,protect_div,right_strip)

    def _substitute(self,before,inside,after,m,counter,attributes):
        # Compare by value; 'is not ""' tests object identity and only
        # works by accident of string interning.
        if attributes != "":
            inside = f"<span {attributes}>{inside}</span>"
        return f"{before}{inside}\n\n{after}"
                    
class CommentToMarkdown(HtmlToMarkdown):
    """
    Look for html comments and remove them, along with whatever is inside
    them.
    """

    def __init__(self):
        super().__init__(elem="--")
        self._pattern_str = r"<!--[\S\s]*?-->"
        self._pattern = re.compile(self._pattern_str)

        # A comment has no <elem></elem> pair, so it has to be found with the
        # single-tag pattern.  The base class only looks at
        # self._single_pattern; without this no comment is ever matched.
        self._single_pattern = self._pattern

    def process(self,html,protect_div=True,right_strip=False):
        """
        Remove every comment in html.  right_strip is passed to the base
        class as its strip_right argument.
        """
        return super().process(html,protect_div,right_strip)

    def _substitute(self,before,inside,after,m,counter,attributes):
        # Drop the comment entirely
        return f"{before}{after}"
    
class FormatToMarkdownBase(HtmlToMarkdown):
    """
    Base class for formatting tags like <em>x</em> -> *x*.  If <tag> has any
    attributes, these are converted to <span> in the markdown.

    Subclasses set self._fmt_md to the markdown marker that is placed on
    both sides of the text.
    """

    def __init__(self,elem):
        super().__init__(elem=elem)

        # Placeholder marker; each subclass replaces it
        self._fmt_md = "X"

    def process(self,html,protect_div=True,right_strip=True):
        """
        Convert every matching tag in html.  right_strip is passed to the
        base class as its strip_right argument.
        """
        return super().process(html,protect_div,right_strip)

    def _substitute(self,before,inside,after,m,counter,attributes):
        # Compare by value; 'is not ""' tests object identity and only
        # works by accident of string interning.
        if attributes != "":
            inside = f"<span {attributes}>{inside}</span>"
        return f"{before}{self._fmt_md}{inside}{self._fmt_md}{after}\n"
    
class SmallToMarkdown(FormatToMarkdownBase):
    """
    Convert <small> tags.  The text ends up between a pair of custom '@'
    markers (@stuff@) that slidemachine can read.  Attributes on the
    <small> tag, if any, are carried over on a <span>.
    """

    def __init__(self):
        super().__init__("small")

        # Marker placed on each side of the text
        self._fmt_md = "@"

class EmToMarkdown(FormatToMarkdownBase):
    """
    Convert <em> tags.  The text ends up between a pair of '*' markers
    (*stuff*).  Attributes on the <em> tag, if any, are carried over on a
    <span>.
    """

    def __init__(self):
        super().__init__("em")

        # Marker placed on each side of the text
        self._fmt_md = "*"

class BoldToMarkdown(FormatToMarkdownBase):
    """
    Convert <b> tags.  The text ends up between a pair of '**' markers
    (**stuff**).  Attributes on the <b> tag, if any, are carried over on a
    <span>.
    """

    def __init__(self):
        super().__init__("b")

        # Marker placed on each side of the text
        self._fmt_md = "**"
    
class HeaderToMarkdownBase(HtmlToMarkdown):
    """
    Convert a generic header (h1 to h4) into a markdown '#' heading.  In
    most cases the dedicated h1, h2, h3 and h4 subclasses should be used
    rather than this class.
    """

    def __init__(self,elem="h[1-4]"):
        super().__init__(elem)

    def process(self,html,protect_div=True,right_strip=True):
        """
        Convert every matching header in html.  right_strip is handed to the
        base class as its strip_right argument.
        """
        return super().process(html,protect_div,right_strip)

    def _substitute(self,before,inside,after,m,counter,attributes):

        # m is the opening tag text (e.g. '<h2 ...>'), so its third
        # character is the header level.
        level = int(m[2])
        hashes = "#"*level

        return f"{before}\n\n{hashes} {inside}\n{after}\n"

class H1ToMarkdown(HeaderToMarkdownBase):
    """
    Convert <h1> headers into '#' markdown headings.
    """
    def __init__(self):
        super().__init__("h1")
    
class H2ToMarkdown(HeaderToMarkdownBase):
    """
    Convert <h2> headers into '##' markdown headings.
    """
    def __init__(self):
        super().__init__("h2")

class H3ToMarkdown(HeaderToMarkdownBase):
    """
    Convert <h3> headers into '###' markdown headings.
    """
    def __init__(self):
        super().__init__("h3")
        
class H4ToMarkdown(HeaderToMarkdownBase):
    """
    Convert <h4> headers into '####' markdown headings.
    """
    def __init__(self):
        super().__init__("h4")
        
class HrefToMarkdown(HtmlToMarkdown):
    """
    Parse an <a> tag into [text](link).  Any extra attributes (besides href)
    are placed into a <span> in the markdown.  An <a> with no href keeps its
    text (wrapped in a <span> if it has attributes).
    """

    def __init__(self):
        super().__init__(elem="a")

        # No longer used by _substitute; kept so the attributes still exist
        self._mangle_name = "__mangled_link__"
        self._mangle_search = "<a"

    def process(self,html,protect_div=True,right_strip=True):
        """
        Convert every <a> in html.  right_strip is passed to the base class
        as its strip_right argument.
        """
        return super().process(html,protect_div,right_strip)

    def _substitute(self,before,inside,after,m,counter,attributes):

        # Capture the quoted value directly, so a url that contains '='
        # (a query string) comes through whole.
        href = re.search(r'href="(.*?)"',attributes)
        if href is not None:
            # Plain link syntax; a leading '!' would make it an image
            inside = f"[{inside}]({href.group(1)})"
            attributes = re.sub(r'href=".*?"',"",attributes).strip()

        # Whatever attributes are left go on a <span> around the result
        if attributes != "":
            inside = f"<span {attributes}>{inside}</span>"

        return f"{before}{inside}{after}\n"
    
    
class OlToMarkdown(HtmlToMarkdown):
    """
    Convert an <ol> block by handing its contents to LiToMarkdown in
    ordered mode.
    """

    def __init__(self):
        super().__init__("ol")

    def process(self,html,protect_div=True,right_strip=True):
        """
        Convert every <ol> in html.  right_strip is handed to the base class
        as its strip_right argument.
        """
        return super().process(html,protect_div,right_strip)

    def _substitute(self,before,inside,after,m,counter,attributes):

        # Turn the <li> elements inside the <ol></ol> into numbered items
        items = LiToMarkdown(ordered=True).process(inside)

        # Keep only lines that have something on them
        kept = [line for line in items.split("\n") if line.strip() != ""]
        body = "\n".join(kept)

        return f"{before}{body}{after}\n"
        
class UlToMarkdown(HtmlToMarkdown):
    """
    Convert a <ul> block by handing its contents to LiToMarkdown in
    unordered mode ('+' bullets).
    """

    def __init__(self):
        super().__init__(elem="ul")

    def process(self,html,protect_div=True,right_strip=True):
        """
        Convert every <ul> in html, then rewrite each line that starts with
        optional whitespace and a '+' so that it has exactly one leading
        space.  right_strip is passed to the base class as its strip_right
        argument.
        """
        out = super().process(html,protect_div,right_strip)

        # Raw string so the regex escapes are not read as string escapes
        pattern = re.compile(r"\s*?\+")

        out_lines = []
        for l in out.split("\n"):
            if pattern.match(l):
                out_lines.append(f" {l.lstrip()}")
            else:
                out_lines.append(l)
        return "\n".join(out_lines)

    def _substitute(self,before,inside,after,m,counter,attributes):

        # Parse <li> elements within the <ul></ul>
        inside_processor = LiToMarkdown(ordered=False)
        inside = inside_processor.process(inside)

        # Strip out blank lines, if they appeared
        inside = "\n".join([l for l in inside.split("\n") if l.strip() != ""])

        return f"{before}{inside}{after}\n"
    
class LiToMarkdown(HtmlToMarkdown):
    """
    Convert <li> elements into markdown list items: 'N.' when ordered, '+'
    otherwise.  If <li> has any attributes, the item text is wrapped in a
    <span> that carries them.
    """

    def __init__(self,ordered=False):
        self._ordered = ordered

        # Number of <li> elements in the html being processed; filled in by
        # process and used to number ordered items.
        self._n_items = None

        super().__init__(elem="li")

    def process(self,html,protect_div=True,right_strip=False):
        """
        Convert every <li> in html, then rewrite each line that starts with
        optional whitespace and a '+' so that it has exactly one leading
        space.  right_strip is passed to the base class as its strip_right
        argument.
        """

        # The base class converts the last <li> first, so the counter it
        # passes to _substitute runs backwards through the list.  Count the
        # items up front so the counter can be turned into a position.
        try:
            self._n_items = len(get_elements(self._elem,html))
        except ValueError:
            # Malformed html; the base class raises a more detailed error
            self._n_items = None

        out = super().process(html,protect_div,right_strip)

        # Raw string so the regex escapes are not read as string escapes
        pattern = re.compile(r"\s*?\+")

        out_lines = []
        for l in out.split("\n"):
            if pattern.match(l):
                out_lines.append(f" {l.lstrip()}")
            else:
                out_lines.append(l)
        return "\n".join(out_lines)

    def _substitute(self,before,inside,after,m,counter,attributes):

        if self._ordered:
            # counter is 1 for the last item; flip it so that the first
            # item in the html is numbered 1.
            if self._n_items is None:
                number = counter
            else:
                number = self._n_items - counter + 1
            bullet = f"{number}."
        else:
            bullet = "+"

        # Compare by value; 'is not ""' tests object identity
        if attributes != "":
            inside = f"<span {attributes}>{inside}</span>"

        inside = f" {bullet} {inside}\n"

        return f"{before}{inside}{after}"

class ImgToMarkdown(HtmlToMarkdown):
    """
    Parse an <img> tag into ![sm.image](src) followed by any remaining
    attributes.  An <img> with no src="..." attribute is left in the output
    as html.
    """

    def __init__(self):
        super().__init__(elem="img")

        # Stop at the first '>' so that an <img ...> written without the
        # closing '/' does not swallow text up to some later '/>'.
        self._pattern_str = r"<img\b[^>]*>"
        self._single_pattern = re.compile(self._pattern_str)

        self._mangle_name = "__mangled_img__"
        self._mangle_search = "<img"
        self._sm_name = "sm.image"

    def process(self,html,protect_div=True,right_strip=True):
        """
        Convert every <img> in html.  right_strip is passed to the base
        class as its strip_right argument.
        """

        html = super().process(html,protect_div,right_strip)

        # Put back the tags that _substitute mangled because they had no src
        return html.replace(self._mangle_name,self._mangle_search)

    def _substitute(self,before,inside,after,m,counter,attributes):

        # Capture the quoted value directly, so a path that contains '='
        # comes through whole.
        src = re.search(r'src="(.*?)"',attributes)
        if src is not None:
            src_file = src.group(1)
            attributes = re.sub(r'src=".*?"',"",attributes)
            inside = f"![{self._sm_name}]({src_file}) {attributes}"
        else:
            # If we can't find a source, keep the original tag (m) but
            # mangle its name so it is not matched again; process restores
            # it.  'inside' is empty for an <img>, so it cannot be used here.
            inside = m.replace(self._mangle_search,self._mangle_name)

        return f"{before}{inside}{after}\n"
    
        
def clean_up_line_breaks(html):
    """
    Tidy the line breaks in converted text.

    Any run of two or more newlines is collapsed to exactly one blank line
    (two newlines); single newlines are left alone.  After that, every
    <br/>, <br /> or <br> becomes a blank line.

    Returns the cleaned-up string.
    """

    # Collapse directly rather than going through placeholder words, so text
    # that happens to contain such a word is not altered.
    tmp = re.sub(r"\n{2,}","\n\n",html)

    return re.sub(r"<br\s*/?>","\n\n",tmp)
    
    
def process_dicussion(html):
    """
    Convert the slides in a reveal.js html file into rough markdown.

    html: full text of the html file.

    Every <div> whose opening tag has the class 'slides' is searched for
    <section> elements.  Each section is dedented, run through the tag
    processors in order, has its line breaks cleaned up, and is written out
    followed by a '>>>' line.  A section with nothing in it produces only
    the '>>>' line.

    Returns the markdown as a single string.
    """

    divs = get_elements("div",html)

    slides_pattern = build_attribute_pattern("class","slides")

    # Applied to each slide in this order
    processors = [ParagraphToMarkdown(),
                  SmallToMarkdown(),
                  EmToMarkdown(),
                  #BoldToMarkdown(),
                  CommentToMarkdown(),
                  H1ToMarkdown(),
                  H2ToMarkdown(),
                  H3ToMarkdown(),
                  H4ToMarkdown(),
                  HrefToMarkdown(),
                  OlToMarkdown(),
                  UlToMarkdown(),
                  ImgToMarkdown()]

    out = []
    for div_begin, div_end in divs:

        # Only the slides container is of interest
        if not slides_pattern.search(div_begin[0]):
            continue

        # Everything between the end of <div ...> and the start of </div>
        slides_html = html[div_begin[2]:div_end[1]]

        for sec_begin, sec_end in get_elements("section",slides_html):

            this_slide = slides_html[sec_begin[2]:sec_end[1]]
            if this_slide.strip() == "":
                out.append(">>>\n")
                continue

            this_slide = strip_leading_space(this_slide)

            for p in processors:
                this_slide = p.process(this_slide)

            this_slide = clean_up_line_breaks(this_slide)

            out.append(this_slide.strip())
            out.append(">>>\n")

    return "\n".join(out)

In [6]:

# Convert every ../discussion*.html into <name>/dev/talk.md and copy the
# presentation-data files it refers to next to the markdown.
for h in glob.glob("../discussion*.html"):

    # 'with' closes the file as soon as it has been read
    with open(h) as html_file:
        html = html_file.read()
    md = process_dicussion(html)

    # Directory is named after the html file, up to its first '.'
    dir_name = os.path.split(h)[-1].split(".")[0]
    dev_dir = os.path.join(dir_name,"dev")

    # Creates dir_name as well; does not fail if this has been run before
    os.makedirs(dev_dir,exist_ok=True)

    # Each hit ends with the '"' or ')' that terminated the path
    files = re.findall("presentation-data.*?[\")]",md)
    for f in files:
        rel_path = f[:-1]
        this_f = f"../{rel_path}"
        shutil.copy(this_f,dev_dir)
        raw_name = os.path.split(this_f)[-1]

        # Literal replacement: a path is not a regular expression
        md = md.replace(rel_path,raw_name)

    with open(os.path.join(dev_dir,"talk.md"),"w") as g:
        g.write(md)


        

    

In [None]:
# Scratch test: turn each @text@ marker back into <small>text</small>
line = "what if I @test this out@ dont@test@"

pattern = re.compile("@.*?@")

# Build each replacement from its own match.  Substituting inside a loop
# over the matches would replace every marker with the first match's text.
line = pattern.sub(lambda m: f"<small>{m.group(0)[1:-1]}</small>",line)

print(line)

In [None]:
import re
# Scratch test of the bullet pattern: optional whitespace, then '+', at the
# start of the string.  Raw string so '\s' and '\+' reach the regex engine
# without being treated as (invalid) string escapes.
pattern = re.compile(r"\s*?\+")
pattern.match("t +")