# build-vega-lite-docset

> Converts a built copy of the vega-lite static Jekyll site to a docset directory


1. Clone `vega-lite` as a sibling folder of this repository using the changes from [this PR](https://github.com/vega/vega-lite/pull/7642)
2. In that directory, run `yarn install && yarn docset`
3. In your python virtual environment, add these dependencies: `pip install bs4 lxml`
4. Using Jupyter is optional, but recommended for ease of debugging

In [1]:
# Make a folder to hold a copy of the built site
!mkdir vega-docs

mkdir: vega-docs: File exists


In [2]:
# initial import of built site assets
!rsync -r ../vega/docs/_site/* ./vega-docs

In [3]:
import glob
import os
from urllib.parse import quote
from os import path

import sqlite3

from bs4 import BeautifulSoup
# from bs4.element import NavigableString

In [4]:
DEBUG = True

In [5]:
DOCSET_NAME = 'vega'

In [6]:
DOCSET_PATH = f"{DOCSET_NAME}.docset/Contents/Resources/Documents/"

In [7]:
# Create a folder to hold your docset info
!mkdir -p $DOCSET_PATH

In [8]:
VEGA_DOCSET = f"{DOCSET_NAME}.docset"
PLIST_PATH = f"{DOCSET_NAME}.docset/Contents/Info.plist"
SQLITE_PATH = f"{DOCSET_NAME}.docset/Contents/Resources/docSet.dsidx"

In [9]:
# Create Info.plist
docsetIndex = '''<?xml version="1.0" encoding="UTF-8"?>
<plist version="1.0">
  <dict>
    <key>CFBundleIdentifier</key>
    <string>vega</string>
    <key>CFBundleName</key>
    <string>Vega</string>
    <key>DocSetPlatformFamily</key>
    <string>vega</string>
    <key>isDashDocset</key>
    <true />
    <key>isJavaScriptEnabled</key>
    <true/>
    <key>dashIndexFilePath</key>
    <string>./index.html</string>
    <key>DashDocSetFallbackURL</key>
    <string>https://vega.github.io/vega/</string>
    <key>DashDocSetPlayURL</key>
    <string>https://vega.github.io/editor/#/custom/vega/</string>
    <key>DashDocSetFamily</key>
    <string>dashtoc</string>
  </dict>
</plist>
'''

with open('vega.docset/Contents/Info.plist', 'w') as fp:
    fp.write(docsetIndex)

In [10]:

# string to handle with replacement
DOCSET_BASE = 'DOCSET_BASE_TO_REPLACE'

def getDepthString(depth):
    return '.' + depth * "/.."


def update_relative_paths(filename):
    with open(filename) as fp:
        text = fp.read()
        
    depth = len(filename.split('/')) - 2
    depthString = getDepthString(depth)

    return text.replace(DOCSET_BASE, depthString)

In [11]:
# Convert absolute to relative URLs
matches = glob.glob(f"vega-docs/**/*.html", recursive=True)
print(len(matches))
# now let's clean up the source files
for name in matches:
    if DEBUG:
        print('Moving relative urls', name)
    processed = update_relative_paths(name)
    with open(name, 'w') as fp:
        fp.write(processed)

188
Moving relative urls vega-docs/index.html
Moving relative urls vega-docs/usage/index.html
Moving relative urls vega-docs/usage/interpreter/index.html
Moving relative urls vega-docs/usage/internet-explorer/index.html
Moving relative urls vega-docs/about/index.html
Moving relative urls vega-docs/about/research/index.html
Moving relative urls vega-docs/about/video/index.html
Moving relative urls vega-docs/about/projects/index.html
Moving relative urls vega-docs/about/vega-and-d3/index.html
Moving relative urls vega-docs/about/code-of-conduct/index.html
Moving relative urls vega-docs/docs/index.html
Moving relative urls vega-docs/docs/schemes/index.html
Moving relative urls vega-docs/docs/event-streams/index.html
Moving relative urls vega-docs/docs/types/index.html
Moving relative urls vega-docs/docs/axes/index.html
Moving relative urls vega-docs/docs/marks/index.html
Moving relative urls vega-docs/docs/marks/shape/index.html
Moving relative urls vega-docs/docs/marks/rect/index.html
Mo

In [12]:
# Then get entries to add to the database
def get_soup(filename):
    with open(filename) as fp:
        text = fp.read()
    return BeautifulSoup(text, 'lxml')

In [13]:
def get_guide_entries():
    tutorial_soup = get_soup("vega-docs/tutorials/index.html")
    tutorial_links = tutorial_soup.find("section", {"class": "page-content"}).findAll(
        "a"
    )

    tutorial_entries = [
        [
            l.getText().strip(),
            "Guide",
            path.join("/tutorials", l["href"], "index.html")
            if "https://" not in l["href"]
            else l["href"],
        ]
        for l in tutorial_links
    ]

    # Hardcoding since there isn't a clean directory menu to all the subpages for this
    # section.
    usage_entries = [
        ['Usage: Main', 'Guide', '/usage/index.html'],
        ['Usage: Internet Explorer', 'Guide', '/usage/internet-explorer/index.html'],
        ['Usage: Interpreter', 'Guide', '/usage/interpreter/index.html'],
        # Also passing in root to API
        # TODO: rewrite all URLs in terms of relative paths
        ['Documentation: Main', 'Guide', '/docs/index.html'],
        # bonuses from API namespace
        ['Debugging', 'Guide', '/docs/api/debugging/index.html'],
        ['Extensibility', 'Guide', '/docs/api/extensibility/index.html']
    ]

    about_soup = get_soup("vega-docs/about/index.html")
    about_links = about_soup.find("ul", {"class": "sidebar-nav"}).findAll(
        "a"
    )

    about_entries = [
        [
            l.getText().strip(),
            "Guide",
            path.join('/', path.relpath(l["href"], "../"), 'index.html')
        ]
        for l in about_links
    ]

    return [
        *tutorial_entries,
        *usage_entries,
        *about_entries
        # other special entries 
        # ['Comparison', 'Guide', '/comparison.html'],
        #  ['Overview', 'Guide', '/index.html'],   # not needed, is main page
    ]

In [14]:
get_guide_entries()

[['Let’s Make a Bar Chart', 'Guide', '/tutorials/bar-chart/index.html'],
 ['A Guide to Guides: Axes & Legends',
  'Guide',
  'https://observablehq.com/@vega/a-guide-to-guides-axes-legends-in-vega'],
 ['Mapping Airport Connections', 'Guide', '/tutorials/airports/index.html'],
 ['How Vega Works', 'Guide', 'https://observablehq.com/@vega/how-vega-works'],
 ['Usage: Main', 'Guide', '/usage/index.html'],
 ['Usage: Internet Explorer', 'Guide', '/usage/internet-explorer/index.html'],
 ['Usage: Interpreter', 'Guide', '/usage/interpreter/index.html'],
 ['Documentation: Main', 'Guide', '/docs/index.html'],
 ['Debugging', 'Guide', '/docs/api/debugging/index.html'],
 ['Extensibility', 'Guide', '/docs/api/extensibility/index.html'],
 ['About', 'Guide', '/about/index.html'],
 ['Video', 'Guide', '/about/video/index.html'],
 ['Projects', 'Guide', '/about/projects/index.html'],
 ['Research', 'Guide', '/about/research/index.html'],
 ['Vega and D3', 'Guide', '/about/vega-and-d3/index.html'],
 ['Code of C

In [15]:
#get_sample_entries()

In [16]:
def get_sample_entries():
    examples_soup = get_soup('vega-docs/examples/index.html')
    links = examples_soup.find('section', {"class": "page-content"}).findAll('a')
    
    entries = [
        [f"{l.getText().strip()}",
        'Sample',
        os.path.join('/examples/', l['href'], 'index.html')]
        for l in links  
    ]
    
    return entries

Notes on docs

- API has 1 level of nesting per page
- Spec has 1 level of nesting for most, except for Mark and Transforms

In [17]:
#get_structures_entries()

In [18]:
# Process the "docs" part of the site directory
def get_structures_entries():

    docs_soup = get_soup('vega-docs/docs/index.html')

    spec_table = docs_soup.find('h2', {"id": 'specification-reference'}).findNext('table')
    cat_links = spec_table.findAll('a') 
    cat_entries = [
        [f"{l.getText().strip()}",
        'Category',
        path.join('/docs', l['href'], 'index.html')
        ]
        for l in cat_links  
    ]

    # next, we process the JS API
    # Strictly speaking Debugging should be a guide.
    api_table = docs_soup.find('h2', {"id": 'vega-api-reference'}).findNext('table')
    api_links = api_table.findAll('a') 
    ignoreModules = ['Debugging', 'Extensibility'] # not vega packages, belong in guide
    module_entries = [
        [f"{l.getText().strip()}",
        'Module',
        path.join('/docs',l['href'], 'index.html')
        ]
        # this code is quite scrappy but OK for now
        for l in api_links if not any(word in l.getText() for word in ignoreModules) 
    ]

    
    #  Lets get the marks
    marks_soup = get_soup("vega-docs/docs/marks/index.html")
    marks_nav = marks_soup.find('aside').find('ul', {"class": 'sub-menu-nav'})
    marks_links = marks_nav.findAll('a')
    marks_entries = [
        [f"mark.{l.getText().strip()}",
        'Interface',
        path.join('/', path.relpath(l['href'], '../../'), 'index.html')
        ]
        for l in marks_links
    ]

    # close with the transforms
    transforms_soup = get_soup("vega-docs/docs/transforms/index.html")
    transforms_nav = transforms_soup.find('aside').find('ul', {"class": 'sub-menu-nav'})
    transforms_links = transforms_nav.findAll('a')
    transforms_entries = [
        [f"transform.{l.getText().strip()}",
        'Interface',
        path.join('/', path.relpath(l['href'], '../../'), 'index.html')
        ]
        for l in transforms_links
    ]

            
    return [
        *cat_entries,
        *module_entries,
        *marks_entries,
        *transforms_entries,
    ]
    

In [19]:
all_entries = [*get_guide_entries(), *get_sample_entries(), *get_structures_entries()]

In [20]:
# Add 1 row
def addRow(cursor, name, rowType, path):
    statement = f"INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES ('{name}', '{rowType}', '{path}');"
    if DEBUG:
        print(statement)
    cursor.execute(statement)

In [21]:
try:
    # TODO: evaluate if this needs to become indempotent
    connection = sqlite3.connect(SQLITE_PATH)
    cursor = connection.cursor()
    
    # Uncomment these on first run
    cursor.execute('CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);')
    cursor.execute('CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);')

    # Debug row
    #     addRow(cursor, 'Getting Started' ,'Guide', '/tutorials/getting_started.html#tutorial-overview')
    for entry in all_entries:
        addRow(cursor, *entry)
    
    connection.commit()
    
except sqlite3.Error as error:
    print("Error while executing sqlite script", error)
    
finally:
    if connection:
        connection.close()
        print('connection closed')

Error while executing sqlite script table searchIndex already exists
connection closed


In [22]:
# logos from https://github.com/vega/logos
#!curl https://github.com/vega/logos/blob/master/assets/VG_Color%40128.png  
!cp img/* $VEGA_DOCSET/

In [23]:
def enrich_table_of_contents(filename):
    with open(filename) as fp:
        text = fp.read()

    # get the soup
    soup = BeautifulSoup(text, "lxml")
    content = soup.find("section", {"class": "page-content"}) or soup.find(
        "div", {"class": "page-content"}
    )

    if content is None:
        return text

    category_anchors = content.findAll("h2")
    for anchor in category_anchors:
        # ok to have double links
        # if anchor.find("a"):
        #     continue
        # safe quote to handle / path characters
        # https://stackoverflow.com/questions/1695183/how-to-percent-encode-url-parameters-in-python
        safeName = quote(anchor.getText().strip(), safe="")
        dashAnchor = soup.new_tag("a")
        dashAnchor["name"] = f"//apple_ref/cpp/Category/{safeName}"
        dashAnchor["class"] = "dashAnchor"
        anchor.append(dashAnchor)

    section_anchors = content.findAll("h3")
    for anchor in section_anchors:

        if anchor.find("a"):
            continue

        safeName = quote(anchor.getText().strip(), safe="")
        if safeName == "Usage":
            continue # notice this can get spammy, like in config

        dashAnchor = soup.new_tag("a")
        dashAnchor["name"] = f"//apple_ref/cpp/Section/{safeName}"
        dashAnchor["class"] = "dashAnchor"
        anchor.append(dashAnchor)

    function_anchors = content.findAll("a", text="#")
    for anchor in function_anchors:
        safeName = quote(anchor["name"].strip(), safe="")
        # bad for dom to have extra tag, but its ok for us in this case so that in-page links
        # still work with the name reference.
        dashAnchor = soup.new_tag("a")
        dashAnchor["name"] = f"//apple_ref/cpp/Section/{safeName}"
        dashAnchor["class"] = "dashAnchor"
        anchor.append(dashAnchor)

    print(
        "Added",
        len(section_anchors),
        "sections",
        len(category_anchors),
        "functions",
        len(function_anchors),
    )

    return str(soup)


In [24]:
def remove_page_elements(filename):
    with open(filename) as fp:
        text = fp.read()

    # remove sidebar nav elements
    soup = BeautifulSoup(text, 'lxml')
    sidebar = soup.find('aside', {"class": "page-sidebar"})
    
    if sidebar:
        print("removing sidebar from ", filename)
        sidebar.decompose()
    
    return str(soup)

In [25]:
# Add "toc" entries for each page.
matches = glob.glob(f"vega-docs/**/*.html", recursive=True)
print(len(matches))

for name in matches:
    if DEBUG:
        print('Enrich table of contents', name)
    processed = enrich_table_of_contents(name)
    
    with open(name, 'w') as fp:
        fp.write(processed)

188
Enrich table of contents vega-docs/index.html
Added 0 sections 0 functions 0
Enrich table of contents vega-docs/usage/index.html
Added 7 sections 4 functions 0
Enrich table of contents vega-docs/usage/interpreter/index.html
Added 1 sections 0 functions 0
Enrich table of contents vega-docs/usage/internet-explorer/index.html
Added 0 sections 2 functions 0
Enrich table of contents vega-docs/about/index.html
Added 0 sections 1 functions 0
Enrich table of contents vega-docs/about/research/index.html
Added 0 sections 3 functions 0
Enrich table of contents vega-docs/about/video/index.html
Added 5 sections 0 functions 0
Enrich table of contents vega-docs/about/projects/index.html
Added 0 sections 0 functions 0
Enrich table of contents vega-docs/about/vega-and-d3/index.html
Added 0 sections 0 functions 0
Enrich table of contents vega-docs/about/code-of-conduct/index.html
Added 0 sections 6 functions 0
Enrich table of contents vega-docs/docs/index.html
Added 0 sections 2 functions 0
Enrich t

In [26]:
matches = glob.glob(f"vega-docs/**/*.html", recursive=True)
if DEBUG:
    print('Files', len(matches))
for name in matches:    
    # https://stackoverflow.com/questions/5598524/can-i-remove-script-tags-with-beautifulsoup
    processed = remove_page_elements(name)
    
    with open(name, 'w') as fp:
        fp.write(processed)

Files 188
removing sidebar from  vega-docs/about/index.html
removing sidebar from  vega-docs/about/research/index.html
removing sidebar from  vega-docs/about/video/index.html
removing sidebar from  vega-docs/about/projects/index.html
removing sidebar from  vega-docs/about/vega-and-d3/index.html
removing sidebar from  vega-docs/about/code-of-conduct/index.html
removing sidebar from  vega-docs/docs/schemes/index.html
removing sidebar from  vega-docs/docs/event-streams/index.html
removing sidebar from  vega-docs/docs/types/index.html
removing sidebar from  vega-docs/docs/axes/index.html
removing sidebar from  vega-docs/docs/marks/index.html
removing sidebar from  vega-docs/docs/marks/shape/index.html
removing sidebar from  vega-docs/docs/marks/rect/index.html
removing sidebar from  vega-docs/docs/marks/trail/index.html
removing sidebar from  vega-docs/docs/marks/area/index.html
removing sidebar from  vega-docs/docs/marks/group/index.html
removing sidebar from  vega-docs/docs/marks/path/in

In [27]:
from urllib.parse import urlparse

In [28]:
#urlparse('../../foobar/index.html#specification')

In [29]:
# tack on HTML paths
def update_html_paths(filename):
    with open(filename) as fp:
        text = fp.read()

    # remove sidebar nav elements
    soup = BeautifulSoup(text, 'lxml')
    links = soup.findAll('a')

    if links is None:
      return text

    numUpdated = 0
  
    for link in links:
      ref = link.get('href')
      if ref and not "https://" in ref and ref.endswith('/'):
        link['href'] = ref + 'index.html'
        numUpdated += 1
      elif ref and ref.startswith('file:/') and not ref.endswith('.json'):
        pass # ignore schema updates
      elif ref and ref.startswith('#'):
        pass # ignore self links
      elif ref and not 'http' in ref and not ref.endswith('.html'):
        # link['href'] = ref + '/index.html'
        parsed = urlparse(ref)
        if parsed.fragment:
          subpath = parsed.path + 'index.html' if parsed.path.endswith('/') else parsed.path + '/index.html'
          link['href'] = subpath + f"#{parsed.fragment}"
        else:
          link['href'] = ref + '/index.html'

        numUpdated += 1

    print(f"updated {numUpdated}")

    
    return str(soup)

In [30]:
# updated 
matches = glob.glob(f"vega-docs/**/*.html", recursive=True)
print(len(matches))

for name in matches:
    if DEBUG:
        print('Update HTML paths', name)
    processed = update_html_paths(name)
    
    with open(name, 'w') as fp:
        fp.write(processed)

188
Update HTML paths vega-docs/index.html
updated 19
Update HTML paths vega-docs/usage/index.html
updated 13
Update HTML paths vega-docs/usage/interpreter/index.html
updated 6
Update HTML paths vega-docs/usage/internet-explorer/index.html
updated 7
Update HTML paths vega-docs/about/index.html
updated 12
Update HTML paths vega-docs/about/research/index.html
updated 6
Update HTML paths vega-docs/about/video/index.html
updated 6
Update HTML paths vega-docs/about/projects/index.html
updated 10
Update HTML paths vega-docs/about/vega-and-d3/index.html
updated 8
Update HTML paths vega-docs/about/code-of-conduct/index.html
updated 8
Update HTML paths vega-docs/docs/index.html
updated 43
Update HTML paths vega-docs/docs/schemes/index.html
updated 73
Update HTML paths vega-docs/docs/event-streams/index.html
updated 22
Update HTML paths vega-docs/docs/types/index.html
updated 51
Update HTML paths vega-docs/docs/axes/index.html
updated 111
Update HTML paths vega-docs/docs/marks/index.html
updated

In [31]:
#!rsync -r ../vega-lite/site/_site/* ./vega-lite-docs

In [32]:
# Move HTML into the right directory
#!rsync -r ../vega-lite/site/_site/* ./vega-lite-docs
# exclude static data files since dash doesn't run a local fileserver
!rsync -r ./vega-docs/* $DOCSET_PATH \
     --exclude 'data/*.csv' --exclude 'data/*.arrow' --exclude 'data/*.json' --exclude 'data/*.tsv' \
     --exclude 'sitemap.xml' --exclude '*min.js.map' --exclude 'rollup.config.js' --exclude 'releases/' --exclude 'robots.txt' --exclude 'vega-schema.json'

In [33]:
# !mkdir ../Dash-User-Contributions/docsets/Vega/

In [34]:
# Prepare to publish in adjacent user contribs repository
#!tar --exclude='.DS_Store' -cvzf vega.tgz vega.docset
#!cp vega.tgz ../Dash-User-Contributions/docsets/Vega/