# Notebook to generate fastai visual docs

In [11]:
!pip install requests > /dev/null
!pip install beautifulsoup4 > /dev/null
!pip install fastai --upgrade > /dev/null

## 1. Export SVG versions of the PowerPoint slides

1. Open "fastai_docs_v1.pptx" in Microsoft PowerPoint
2. Save as ... > SVG Format > export all slides, in the current directory
3. .svg files are generated in the ./fastai_docs_v1 subdirectory

In [1]:
from fastai.data.all import *

# Folder in which PowerPoint wrote one SVG file per slide (see the steps above)
p = Path('./fastai_docs_v1')

# Collect every exported slide; the list is displayed as a quick sanity check
files = get_files(
    p,
    extensions=".SVG",
)
files

(#75) [Path('fastai_docs_v1/Diapositive1.SVG'),Path('fastai_docs_v1/Diapositive10.SVG'),Path('fastai_docs_v1/Diapositive11.SVG'),Path('fastai_docs_v1/Diapositive12.SVG'),Path('fastai_docs_v1/Diapositive13.SVG'),Path('fastai_docs_v1/Diapositive14.SVG'),Path('fastai_docs_v1/Diapositive15.SVG'),Path('fastai_docs_v1/Diapositive16.SVG'),Path('fastai_docs_v1/Diapositive17.SVG'),Path('fastai_docs_v1/Diapositive18.SVG')...]

## 2. Rename the SVG files with slides titles

1. Open "titles.csv" in a table editor (separator ';' and UTF8 encoding)
2. Update the contents to match the latest version of  "fastai_docs_v1.pptx" (and save if needed)
  - column 0 : slide index (starting at 1) 
  - column 1 : section title
  - column 2 : slide title
3. Execute the code below to rename all svg files

You must be careful if you change a title in this table: the deep links to a specific svg picture will change => any user with a specific bookmark will be redirected to the top of the page.

In [2]:
# titles.csv has no header row, so its columns are addressed by position:
# 0 = slide index, 1 = section title, 2 = slide title
titles = pd.read_csv(
    "titles.csv",
    sep=';',
    header=None,
)
titles

Unnamed: 0,0,1,2
0,1,Concepts,Concepts - Data loading
1,2,Concepts,Concepts - Model training
2,3,Concepts,Concepts - Learner lifecycle
3,4,Learner,Learner - Create an instance
4,5,Learner,"Learner - Init, Attributes"
...,...,...,...
70,71,Loss,"Loss functions - Classification, Regression"
71,72,Summary,Summary - DataBlock
72,73,Summary,Summary - DataLoaders
73,74,Summary,Summary - Learner


In [3]:
import re

def _slide_number(svg_path):
    """Return the slide number encoded in the name of an SVG file.

    Two naming schemes are recognised:
      - files already renamed by this cell: "<number>_<title>.svg"
      - files freshly exported by PowerPoint: "<localized word><number>.SVG"
        (e.g. "Diapositive12.SVG"); only the trailing digits are used, so the
        localized prefix does not matter.

    Raises ValueError when no slide number can be found in the name.
    """
    stem = svg_path.stem
    found = re.match(r'(\d+)_', stem) or re.search(r'(\d+)$', stem)
    if found is None:
        raise ValueError(f"Cannot find a slide number in file name {svg_path.name!r}")
    return int(found.group(1))

# Create suitable file names from the slides titles
# Rename all svg files with their number and title
target_filenames = {}
for file in files:
    num = _slide_number(file)
    # Row `num-1` of the titles table describes slide `num` (slides are numbered from 1)
    safe_title = titles[2].iloc[num-1].replace(' - ','-').replace(',','').replace(' ','_').replace('/','_')
    target_filename = f"{num:02d}_{safe_title}.svg"
    target_filenames[num] = target_filename
    # Skip files that already carry their final name so that the cell can be re-run safely
    if file.name != target_filename:
        file.rename(p/target_filename)

# Add these filenames to the titles table for later use.
# The names are listed in slide order; pandas raises a ValueError if the number of
# SVG files does not match the number of rows in the titles table.
titles[3] = [target_filenames[num] for num in sorted(target_filenames)]

# Reload list of files
files = get_files(p, extensions=".SVG")

## 3. Insert links to fastai docs or code in the SVG files

Note : this is implemented as a post-processing step because today - May 2021 - PowerPoint doesn't support links in its SVG export feature.

### 3.1 Collect all links and anchors from https://docs.fast.ai in a single csv file (optional)

This is useful in the next step to speed up the process of assigning a useful link to each visual element in the slides.

With the generated "docs.fast.ai.csv" file opened in a table editor (separator ';' and UTF8 encoding), you can locate very quickly the link you need and copy-paste it.

To do this, we simply crawl the docs.fast.ai website ...

In [None]:
import csv
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

rooturl = "https://docs.fast.ai/"
REQUEST_TIMEOUT = 30  # seconds; avoids hanging forever on an unresponsive server

title1 = ""
title2 = ""

with requests.Session() as session:
    # The sidebar of the home page lists every documentation page
    response = session.get(rooturl, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    page = BeautifulSoup(response.text, 'html.parser')

    sidebar = page.find("ul",attrs={"id":"mysidebar"})
    if sidebar is None:
        raise RuntimeError(f"No <ul id=\"mysidebar\"> element found in {rooturl}: the site layout may have changed")

    # newline='' is required by the csv module (otherwise blank rows appear on Windows),
    # and the explicit encoding matches the "UTF8" instruction given to the reader.
    with open('docs.fast.ai.csv', 'w', newline='', encoding='utf-8') as csvfile:
        # NOTE(review): a backslash quote character is unusual; kept as-is so the
        # generated file keeps the same format as before - confirm it is intended.
        writer = csv.writer(csvfile, delimiter=';', quotechar='\\', quoting=csv.QUOTE_MINIMAL)

        for link in sidebar.find_all("a"):
            txt = link.string
            url = link['href']
            # The nesting depth of the <a> element tells which menu level it belongs to
            depth = len(list(link.parents))
            if depth==9:
                # Top-level menu entry: only a title, nothing to crawl
                title1 = txt
                title2 = ""
                continue
            elif depth==11 and url=="#":
                # Second-level entry without a page of its own: remember it as a sub-title
                title2 = txt
                continue
            elif depth==11 and url!="#":
                title2 = ""

            # urljoin copes with absolute and relative hrefs alike
            baseurl = urljoin(rooturl, url)
            writer.writerow([title1,title2,txt,baseurl])
            print("Crawling : "+baseurl+"\r")

            response2 = session.get(baseurl, timeout=REQUEST_TIMEOUT)
            if not response2.ok:
                # Best-effort crawl: a broken page must not abort the whole run
                print(f" -> skipped (HTTP {response2.status_code})")
                continue
            page2 = BeautifulSoup(response2.text, 'html.parser')

            for header in page2.find_all(["h2","h3","h4"]):
                url2 = header.get('id')
                if url2 is None:
                    # Without an id there is no anchor to link to
                    continue
                strs = list(header.strings)
                if len(strs)==1:
                    txt2 = strs[0]
                else:
                    # The last string of a multi-part header is dropped
                    txt2 = "".join(strs[:-1])
                # h2/h3/h4 -> 2/3/4 leading empty cells, which indents the title in the table
                depth = int(header.name[1:])

                writer.writerow(['']*depth+[txt2,f"{baseurl}#{url2}"])

### 3.2 Update links table

In [7]:
# Utility functions to parse SVG xml files
from xml.dom.minidom import parse

def get_descendant_nodes(context_node, predicate):
    """Yield every descendant of `context_node` accepted by `predicate`, in document order.

    Args:
        context_node: DOM node whose subtree is searched (the node itself is never
            yielded). `None` is accepted and produces no result.
        predicate: callable taking a node and returning a truthy value for the
            nodes to keep.

    Yields:
        The matching descendant nodes, parents before their own children.
    """
    if context_node is None:
        # Nothing to search: stop here instead of yielding a None placeholder
        # and then failing on `None.childNodes`.
        return
    for child in context_node.childNodes:
        if predicate(child):
            yield child
        yield from get_descendant_nodes(child, predicate)

def get_text_value(context_node, default=None):
    """Return the text contained in the subtree of `context_node`.

    The value of every descendant text node is stripped of surrounding
    whitespace and the pieces are joined with single spaces.
    `default` is returned when the resulting string is empty.
    """
    def is_text(node):
        return node.nodeType == node.TEXT_NODE

    stripped_parts = []
    for text_node in get_descendant_nodes(context_node, is_text):
        stripped_parts.append(str.strip(text_node.nodeValue))

    joined = ' '.join(stripped_parts)
    if joined:
        return joined
    return default

1. Refresh the links table in "links.csv" by running the code below. 

The idea is :
  - scan all SVG files
  - locate all text elements in the SVG file
  - check if the text element was already referenced in the links table
    - if it wasn't : add it to the table and flag it "NEW"
    - if it was : flag it "OK"
  - after all SVG files were scanned, mark all remaining text elements in the table "REMOVED" 

In [24]:
# links.csv has no header row; keep_default_na=False keeps empty cells as ''
# instead of NaN, so every cell can be handled as a plain string.
links = pd.read_csv(
    'links.csv',
    sep=';',
    header=None,
    encoding='utf-8',
    keep_default_na=False,
)

In [None]:
# Every known row is presumed gone until its text is found again in an SVG file
links[3] = "REMOVED"

# (file name, text) pairs present in the table before the scan / met during the scan
known_keys = set(zip(links[0], links[1]))
seen_keys = set()
# New rows are collected in a list and added in one go: DataFrame.append no longer
# exists in pandas 2.x and growing a frame row by row is quadratic anyway.
new_rows = []

for file in files:
    with open(file, encoding='utf-8') as svg:
        dom = parse(svg)
    text_elements = dom.getElementsByTagName('text')
    for textElt in text_elements:
        # default='' : an element without any text must not crash the scan
        text = get_text_value(textElt, default='').replace('"','')
        if not text:
            continue
        key = (file.name, text)
        if key in seen_keys:
            # Same label repeated in the same file: one row is enough, and a label
            # added during this scan must keep its "NEW" flag.
            continue
        seen_keys.add(key)
        if key not in known_keys:
            new_rows.append({0:file.name, 1:text, 2:'?', 3:"NEW"})
    dom.unlink()  # release the DOM tree before parsing the next file
    print(f"{file.name} -> {len(text_elements)} labels\r")

# Flag the rows that are still present in the slides
was_seen = [key in seen_keys for key in zip(links[0], links[1])]
links.loc[was_seen, 3] = "OK"

if new_rows:
    links = pd.concat([links, pd.DataFrame(new_rows)], ignore_index=True)

# Group the rows by file name while keeping their relative order inside each file
links = links.reset_index().sort_values([0,'index']).drop(columns='index')

links[3].value_counts()

In [152]:
# Save the refreshed table without header or index.
# 'utf-8-sig' writes a byte-order mark at the start of the file - presumably so
# that table editors detect the UTF-8 encoding on their own.
links.to_csv(
    'links.csv',
    sep=';',
    header=None,
    index=False,
    encoding='utf-8-sig',
)

2. Open "links.csv" in a table editor (separator ';' and **UTF8 encoding**)
  - column 0 : SVG file name
  - column 1 : text element
  - column 2 : link to fastai doc -> '?', then URL or empty if no link
  - column 3 : line status -> NEW, OK, REMOVED 

Note : be careful to open and save the CSV file in **UTF-8 encoding** !


3. Locate the new lines with 'NEW' in column 3 or the lines not yet documented with '?' in column 2
  - add a link to fastai doc in column 2 (or leave the cell empty)
  - use 'docs.fast.ai.csv' to quickly find the doc URL
  - you can try to locate a similar text element with the status REMOVED to copy the previously selected link
  - optionally reorder the lines to group the elements with links at the top for each SVG file


4. After all lines have been updated :
  - delete all REMOVED lines
  - drop column 3 (status)
  - save a new version of 'links.csv'

### 3.3 Process SVG files : add links

In [5]:
def wrap_with_link(dom, textElt, url):
    """Wrap `textElt` in a new `<a>` element pointing to `url`, in place.

    When the node immediately before `textElt` is a `<rect>` element, that
    rectangle is moved inside the link as well, in front of the text.

    Args:
        dom: the document owning `textElt`, used to create the `<a>` element.
        textElt: the `<text>` element to wrap; it must have a parent node.
        url: target of the link, stored in the `href` attribute.
    """
    parent = textElt.parentNode
    link = dom.createElement('a')
    link.setAttributeNS("http://www.w3.org/1999/xlink", "href", url)

    # The previous sibling may be missing (first child) or be a text/comment node
    # without any tagName: check its type before looking at the tag.
    previous = textElt.previousSibling
    has_rect = (previous is not None
                and previous.nodeType == previous.ELEMENT_NODE
                and previous.tagName == "rect")

    if has_rect:
        parent.removeChild(previous)
    # The link takes the place of the text element in the parent
    parent.replaceChild(link, textElt)
    if has_rect:
        link.appendChild(previous)
    link.appendChild(textElt)

In [None]:
# For every SVG file, wrap each text element that has a URL in the links table
# with an <a> element, then write the file back if anything changed.
for file in files:
    with open(file, encoding='utf-8') as svg:
        dom = parse(svg)
    print(file.name)
    changed = False
    for textElt in dom.getElementsByTagName('text'):
        container = textElt.parentNode
        if container.nodeType == container.ELEMENT_NODE and container.tagName == 'a':
            # Already wrapped by a previous run: do not nest a second link
            continue
        # default='' : an element without any text must not crash the loop
        text = get_text_value(textElt, default='').replace('"','')
        if not text:
            continue
        rows = links[(links[0]==file.name) & (links[1]==text)]
        if len(rows.index)>0:
            # Third column of the first matching row holds the URL ('' or '?' = no link)
            url = rows.iloc[0,2]
            if len(url)>0 and url!='?':
                wrap_with_link(dom,textElt,url)
                changed = True
    if changed:
        # Write with the same encoding the file was read with; the platform default
        # encoding could corrupt non-ASCII labels.
        with open(file,'w', encoding='utf-8') as f:
            f.write(dom.toxml())
            print(" -> updated")
    dom.unlink()  # release the DOM tree before parsing the next file

## 4. Generate the HTML page and menu

In [159]:
# Side menu: one heading per section, followed by a list with one entry per slide.
# Each entry targets the anchor made of the first two characters of the slide's file name.
menu_parts = ["<h1>fastai v2</h1>\n"]
for title1 in titles[1].unique():
    in_section = titles[1]==title1
    menu_parts.append(f"<h2>{title1}</h2>\n")
    menu_parts.append("<ul>")
    for title2 in titles[2][in_section]:
        filename = titles[3][in_section & (titles[2]==title2)].item()
        num = filename[:2]
        menu_parts.append(f"<li><a href=\"#{num}\">{title2}</a></li>\n")
    menu_parts.append("</ul>")
menuhtml = "".join(menu_parts)

In [160]:
# Page body: one heading per section, then every slide of the section as an image
# that links to its own SVG file. The image id is the first two characters of the
# file name.
main_parts = []
for title1 in titles[1].unique():
    in_section = titles[1]==title1
    main_parts.append(f"<h2>{title1}</h2>\n")
    for title2 in titles[2][in_section]:
        filename = titles[3][in_section & (titles[2]==title2)].item()
        num = filename[:2]
        main_parts.append(f"<p><a href=\"/images/fastaidocs/{filename}\"><img id=\"{num}\" src=\"/images/fastaidocs/{filename}\"/></a></p>\n")
mainhtml = "".join(main_parts)

In [161]:
# First part of the generated page: a front-matter header (presumably for the
# site generator that publishes the page - confirm), the stylesheet, and the
# opening of the side navigation <div>, which is left open on purpose.
main1html = """---
permalink: /fastaidocs/
layout: null
search_exclude: true
---

<html>
<head>
<style>
/* The sidenav */
.sidenav {
  padding: 10px;
  height: 100%;
  width: 250px;
  position: fixed;
  z-index: 1;
  top: 0;
  left: 0;
  color: #7F7F7F;
  background-color: #F7F7F7;
  overflow-x: hidden;
  border-right: solid 1px;
}

ul {
  padding-left: 10px;
}

li {
  list-style-type: none;
  padding-left: 0px;
  padding-bottom: 5px;
}

a {
  outline: none;
  text-decoration: none;
  color: #7F7F7F;
}

a:hover {
  border-bottom: 1px solid;
}

/* Page content */
.main {
  margin-left: 250px; /* Same as the width of the sidenav */
  padding: 0px 10px;
}

.main h2 {
  color: #7F7F7F;
  background-color: #F7F7F7;
  font-size: 50px;
  border-top: 1px solid;
  border-bottom: 1px solid;
  padding: 10px;
}

.main img {
  width: 100%;
  border: 1px solid #E7E7E7;
}

</style>
</head>

<body>

<div class="sidenav">

"""

In [162]:
# Second part of the generated page: closes the side navigation <div> and opens
# the main content <div>.
main2html = """
</div>

<div class="main">
"""

In [163]:
# Last part of the generated page: closes the main content <div>, the body and
# the document.
main3html = """
</div>

</body>

</html>"""

In [164]:
# Assemble the page (header + menu + separator + slides + footer) and write it.
# The `with` block closes the file even if the write fails, and the explicit
# encoding keeps non-ASCII titles intact whatever the platform default is.
with open("fastaidocs.html", "w", encoding="utf-8") as text_file:
    text_file.write(main1html + menuhtml + main2html + mainhtml + main3html)