# build-vega-lite-docset

> Converts a built copy of the vega-lite static Jekyll site to a docset directory


1. Clone `vega-lite` as a sibling folder of this repository using the changes from [this PR](https://github.com/vega/vega-lite/pull/7642)
2. In that directory, run `yarn install && yarn docset`
3. In your python virtual environment, add these dependencies: `pip install bs4 lxml`
4. Using Jupyter is optional, but recommended for ease of debugging

In [1]:
# Make a folder to hold a copy of the built site
!mkdir vega-lite-docs

mkdir: vega-lite-docs: File exists


In [2]:
# initial import of built site assets
!rsync -r ../vega-lite/site/_site/* ./vega-lite-docs

In [3]:
import glob
import os
from urllib.parse import quote

import sqlite3

from bs4 import BeautifulSoup
from bs4.element import NavigableString

In [None]:
DEBUG = True

In [4]:
DOCSET_NAME = 'vega-lite'

In [5]:
DOCSET_PATH = f"{DOCSET_NAME}.docset/Contents/Resources/Documents/"

In [6]:
# Create a folder to hold your docset info
!mkdir -p $DOCSET_PATH

In [7]:
VEGA_LITE_DOCSET = f"{DOCSET_NAME}.docset"
PLIST_PATH = f"{DOCSET_NAME}.docset/Contents/Info.plist"
SQLITE_PATH = f"{DOCSET_NAME}.docset/Contents/Resources/docSet.dsidx"

In [8]:
# Create Info.plist
docsetIndex = '''<?xml version="1.0" encoding="UTF-8"?>
<plist version="1.0">
  <dict>
    <key>CFBundleIdentifier</key>
    <string>vega-lite</string>
    <key>CFBundleName</key>
    <string>Vega-Lite</string>
    <key>DocSetPlatformFamily</key>
    <string>vega-lite</string>
    <key>isDashDocset</key>
    <true />
    <key>isJavaScriptEnabled</key>
    <true/>
    <key>dashIndexFilePath</key>
    <string>./index.html</string>
    <key>DashDocSetFallbackURL</key>
    <string>https://vega.github.io/vega-lite/</string>
    <key>DashDocSetPlayURL</key>
    <string>https://vega.github.io/editor/#/custom/vega-lite</string>
    <key>DashDocSetFamily</key>
<string>dashtoc</string>
  </dict>
</plist>
'''

with open('vega-lite.docset/Contents/Info.plist', 'w+') as fp:
    fp.write(docsetIndex)

In [9]:

# string to handle with replacement
DOCSET_BASE = 'DOCSET_BASE_TO_REPLACE'

def getDepthString(depth):
    return '.' + depth * "/.."


def update_relative_paths(filename):
    with open(filename) as fp:
        text = fp.read()
        
    depth = len(filename.split('/')) - 2
    depthString = getDepthString(depth)

    return text.replace(DOCSET_BASE, depthString)

In [10]:
# Convert absolute to relative URLs
matches = glob.glob(f"vega-lite-docs/**/*.html", recursive=True)
print(len(matches))
# now let's clean up the source files
for name in matches:
    if DEBUG:
        print('Moving relative urls', name)
    processed = update_relative_paths(name)
    with open(name, 'w') as fp:
        fp.write(processed)

258
Moving relative urls vega-lite-docs/index.html
Moving relative urls vega-lite-docs/ecosystem.html
Moving relative urls vega-lite-docs/applications.html
Moving relative urls vega-lite-docs/comparison.html
Moving relative urls vega-lite-docs/demo.html
Moving relative urls vega-lite-docs/usage/compile.html
Moving relative urls vega-lite-docs/usage/typescript.html
Moving relative urls vega-lite-docs/usage/debugging.html
Moving relative urls vega-lite-docs/usage/embed.html
Moving relative urls vega-lite-docs/docs/sort.html
Moving relative urls vega-lite-docs/docs/legend.html
Moving relative urls vega-lite-docs/docs/mark.html
Moving relative urls vega-lite-docs/docs/size.html
Moving relative urls vega-lite-docs/docs/bin.html
Moving relative urls vega-lite-docs/docs/datum.html
Moving relative urls vega-lite-docs/docs/errorbar.html
Moving relative urls vega-lite-docs/docs/format.html
Moving relative urls vega-lite-docs/docs/text.html
Moving relative urls vega-lite-docs/docs/image.html
Movi

Moving relative urls vega-lite-docs/examples/stacked_bar_weather.html
Moving relative urls vega-lite-docs/examples/layer_bar_annotations.html
Moving relative urls vega-lite-docs/examples/trellis_bar_histogram.html
Moving relative urls vega-lite-docs/examples/trail_color.html
Moving relative urls vega-lite-docs/examples/area.html
Moving relative urls vega-lite-docs/examples/circle.html
Moving relative urls vega-lite-docs/examples/stacked_bar_count_corner_radius_mark.html
Moving relative urls vega-lite-docs/examples/bar_aggregate.html
Moving relative urls vega-lite-docs/examples/geo_trellis.html
Moving relative urls vega-lite-docs/examples/interactive_global_development.html
Moving relative urls vega-lite-docs/examples/window_percent_of_total.html
Moving relative urls vega-lite-docs/examples/trail_comet.html
Moving relative urls vega-lite-docs/examples/layer_line_errorband_ci.html
Moving relative urls vega-lite-docs/examples/circle_binned.html
Moving relative urls vega-lite-docs/examples

In [11]:
# Then get entries to add to the database
def get_soup(filename):
    with open(filename) as fp:
        text = fp.read()
    return BeautifulSoup(text, 'lxml')

In [12]:
def get_guide_entries():
    tutorial_soup = get_soup('vega-lite-docs/tutorials/getting_started.html')
    tutorial_links = tutorial_soup.find('section', {"class": "tutorials"}).find('nav').findAll('a')
    tutorial_entries = [
        [l.getText().strip(),
        'Guide',
        os.path.relpath(l['href'], '../')]
        for l in tutorial_links  
    ]
    
    usage_soup = get_soup('vega-lite-docs/usage/embed.html')
    usage_links = usage_soup.find('section', {"class": "usage"}).find('nav').findAll('a')
    usage_entries = [
        [f"Usage: {l.getText().strip()}",
        'Guide',
        os.path.relpath(l['href'], '../')]
        for l in usage_links  
    ]
    
    
    return [
        *tutorial_entries,
        *usage_entries,
        
        # other special entries
        ['Ecosystem', 'Guide', '/ecosystem.html'],
        ['Comparison', 'Guide', '/comparison.html'],
        #  ['Overview', 'Guide', '/index.html'],   # not needed, is main page
    ]

In [13]:
def get_sample_entries():
    examples_soup = get_soup('vega-lite-docs/examples/index.html')
    links = examples_soup.find('section', {"class": "examples"}).findAll('a', { "class": "imagegroup" })
    
    entries = [
        [f"{l.getText().strip()}",
        'Sample',
        os.path.relpath(l['href'], '../')]
        for l in links  
    ]
    
    return entries

In [14]:
# hardcoded remappings based on user testing, these can be revised later
# beautifulsoup tag
def clean_property_name(bsTag):
    maybeName = bsTag.getText().strip()
    
    if maybeName == 'Width / Height':
        return 'Size'
    
    return maybeName
    
def clean_category_name(bsTag):
    maybeName = " ".join(bsTag.getText().split("\w+/")).lower().strip()
    
    if maybeName == 'view specification':
        return 'view'
    
    return maybeName

In [15]:
def get_structures_entries():
    # TODO: see if this should get more granular eventually
    # decide on difference between top level spec and sub level properties
    docs_soup = get_soup('vega-lite-docs/docs/index.html')
    sidebar = docs_soup.find('aside', {"class": "page-sidebar"}).find('ul')
    cat_links = sidebar.findAll('li', { "class": "sidebar-nav-item" }, recursive=False) # don't go into sublists
    
    cat_entries = [
        [f"{l.getText().strip()}",
        'Category',
        os.path.relpath(l.findNext('a')['href'], '../')]
        for l in cat_links  
    ]
    
    # Children of each category
    structure_entries = []
    for category in cat_links:
        category_prefix = clean_category_name(category)
        
        sublist = None
        for sibling in category.next_siblings:
            if type(sibling) == NavigableString:
                continue
            sublist = sibling
            break
            
        # handle the random BR tags hanging out
        if sublist is None or sublist.name == 'br':
            continue
            

        if (sublist['class'] == ['sub-sub-nav']):
            category_properties = sublist.findAll('a')

            structure_entries.extend([
                [f"{category_prefix}.{clean_property_name(l)}",
                'Property',
                os.path.relpath(l['href'], '../')]
                for l in category_properties  
            ])
            
    return [
        *cat_entries,
        *structure_entries
    ]

In [16]:
all_entries = [*get_guide_entries(), *get_sample_entries(), *get_structures_entries()]

In [17]:
# Add 1 row
def addRow(cursor, name, rowType, path):
    statement = f"INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES ('{name}', '{rowType}', '{path}');"
    if DEBUG:
        print(statement)
    cursor.execute(statement)

In [18]:
try:
    # TODO: evaluate if this needs to become indempotent
    connection = sqlite3.connect(SQLITE_PATH)
    cursor = connection.cursor()
    
    # Uncomment these on first run
    #     cursor.execute('CREATE TABLE searchIndex(id INTEGER PRIMARY KEY, name TEXT, type TEXT, path TEXT);')
    #     cursor.execute('CREATE UNIQUE INDEX anchor ON searchIndex (name, type, path);')

    # Debug row
    #     addRow(cursor, 'Getting Started' ,'Guide', '/tutorials/getting_started.html#tutorial-overview')
    for entry in all_entries:
        addRow(cursor, *entry)
    
    connection.commit()
    
except sqlite3.Error as error:
    print("Error while executing sqlite script", error)
    
finally:
    if connection:
        connection.close()
        print('connection closed')

INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES ('Getting Started', 'Guide', 'tutorials/getting_started.html');
INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES ('Exploring Data', 'Guide', 'tutorials/explore.html');
INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES ('Paper Figures', 'Guide', 'tutorials/figures.html');
INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES ('Streaming Data', 'Guide', 'tutorials/streaming.html');
INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES ('Usage: Embed', 'Guide', 'usage/embed.html');
INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES ('Usage: Compile', 'Guide', 'usage/compile.html');
INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES ('Usage: TypeScript', 'Guide', 'usage/typescript.html');
INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES ('Usage: Debugging', 'Guide', 'usage/debugging.html');
INSERT OR IGNORE INTO searchIndex(name, type, path) VALUES ('Ecosystem', 'Guide', '/ecos

In [19]:
# logos from https://github.com/vega/logos
#!curl https://github.com/vega/logos/raw/master/assets/VL_Color%40128.png > 
!cp img/* $VEGA_LITE_DOCSET/

In [20]:
def enrich_table_of_contents(filename):
    with open(filename) as fp:
        text = fp.read()
        
    # get the soup
    soup = BeautifulSoup(text, 'lxml')
    content = soup.find('section', {"class": 'page-content'}) or soup.find('div', {"class": 'page-content'})
    
    if content is None:
        return text
        
    category_anchors = content.findAll('h2')
    for anchor in category_anchors:

        if anchor.find('a'):
            continue
        # safe quote to handle / path characters
        # https://stackoverflow.com/questions/1695183/how-to-percent-encode-url-parameters-in-python
        safeName = quote(anchor.getText().strip(), safe='')
        dashAnchor = soup.new_tag('a')
        dashAnchor['name'] = f'//apple_ref/cpp/Category/{safeName}'
        dashAnchor['class'] = 'dashAnchor'
        anchor.append(dashAnchor)
    
    section_anchors = content.findAll('h3')
    for anchor in section_anchors:

        if anchor.find('a'):
            continue
            
        safeName = quote(anchor.getText().strip(), safe='')
        dashAnchor = soup.new_tag('a')
        dashAnchor['name'] = f'//apple_ref/cpp/Section/{safeName}'
        dashAnchor['class'] = 'dashAnchor'
        anchor.append(dashAnchor)

        
    print("Added", len(section_anchors), 'sections', len(category_anchors), 'categories')
    
    return str(soup)
    

In [21]:
# URL encode strings
#quote("Programming / Data", safe='')

In [22]:
def remove_page_elements(filename):
    with open(filename) as fp:
        text = fp.read()

    # remove sidebar nav elements
    soup = BeautifulSoup(text, 'lxml')
    sidebar = soup.find('aside', {"class": "page-sidebar"})
    
    if sidebar:
        print("removing sidebar from ", filename)
        sidebar.decompose()
    
    return str(soup)

In [23]:
# Add "toc" entries for each page.
matches = glob.glob(f"vega-lite-docs/**/*.html", recursive=True)
print(len(matches))

for name in matches:
    if DEBUG:
        print('Enrich table of contents', name)
    processed = enrich_table_of_contents(name)
    
    with open(name, 'w') as fp:
        fp.write(processed)

258
Enrich table of contents vega-lite-docs/index.html
Added 0 sections 4 categories
Enrich table of contents vega-lite-docs/ecosystem.html
Added 0 sections 6 categories
Enrich table of contents vega-lite-docs/applications.html
Enrich table of contents vega-lite-docs/comparison.html
Added 0 sections 3 categories
Enrich table of contents vega-lite-docs/demo.html
Enrich table of contents vega-lite-docs/usage/compile.html
Added 4 sections 2 categories
Enrich table of contents vega-lite-docs/usage/typescript.html
Added 0 sections 0 categories
Enrich table of contents vega-lite-docs/usage/debugging.html
Added 0 sections 4 categories
Enrich table of contents vega-lite-docs/usage/embed.html
Added 3 sections 2 categories
Enrich table of contents vega-lite-docs/docs/sort.html
Added 5 sections 3 categories
Enrich table of contents vega-lite-docs/docs/legend.html
Added 6 sections 4 categories
Enrich table of contents vega-lite-docs/docs/mark.html
Added 6 sections 5 categories
Enrich table of cont

Enrich table of contents vega-lite-docs/examples/bar_diverging_stack_population_pyramid.html
Added 1 sections 0 categories
Enrich table of contents vega-lite-docs/examples/area_density_stacked.html
Added 1 sections 0 categories
Enrich table of contents vega-lite-docs/examples/layer_line_mean_point_raw.html
Added 1 sections 0 categories
Enrich table of contents vega-lite-docs/examples/index.html
Added 16 sections 7 categories
Enrich table of contents vega-lite-docs/examples/repeat_layer.html
Added 1 sections 0 categories
Enrich table of contents vega-lite-docs/examples/point_quantile_quantile.html
Added 1 sections 0 categories
Enrich table of contents vega-lite-docs/examples/line_conditional_axis.html
Added 1 sections 0 categories
Enrich table of contents vega-lite-docs/examples/interactive_seattle_weather.html
Added 1 sections 0 categories
Enrich table of contents vega-lite-docs/examples/layer_bar_fruit.html
Added 1 sections 0 categories
Enrich table of contents vega-lite-docs/examples

Added 1 sections 0 categories
Enrich table of contents vega-lite-docs/examples/interactive_global_development.html
Added 1 sections 0 categories
Enrich table of contents vega-lite-docs/examples/window_percent_of_total.html
Added 1 sections 0 categories
Enrich table of contents vega-lite-docs/examples/trail_comet.html
Added 1 sections 0 categories
Enrich table of contents vega-lite-docs/examples/layer_line_errorband_ci.html
Added 1 sections 0 categories
Enrich table of contents vega-lite-docs/examples/circle_binned.html
Added 1 sections 0 categories
Enrich table of contents vega-lite-docs/examples/circle_natural_disasters.html
Added 1 sections 0 categories
Enrich table of contents vega-lite-docs/examples/isotype_bar_chart_emoji.html
Added 1 sections 0 categories
Enrich table of contents vega-lite-docs/examples/trellis_anscombe.html
Added 1 sections 0 categories
Enrich table of contents vega-lite-docs/examples/layer_line_rolling_mean_point_raw.html
Added 1 sections 0 categories
Enrich ta

In [24]:
matches = glob.glob(f"vega-lite-docs/**/*.html", recursive=True)
if DEBUG:
    print('Files', len(matches))
for name in matches:    
    # https://stackoverflow.com/questions/5598524/can-i-remove-script-tags-with-beautifulsoup
    processed = remove_page_elements(name)
    
    with open(name, 'w') as fp:
        fp.write(processed)

Files 258
removing sidebar from  vega-lite-docs/docs/sort.html
removing sidebar from  vega-lite-docs/docs/legend.html
removing sidebar from  vega-lite-docs/docs/mark.html
removing sidebar from  vega-lite-docs/docs/size.html
removing sidebar from  vega-lite-docs/docs/bin.html
removing sidebar from  vega-lite-docs/docs/datum.html
removing sidebar from  vega-lite-docs/docs/errorbar.html
removing sidebar from  vega-lite-docs/docs/format.html
removing sidebar from  vega-lite-docs/docs/text.html
removing sidebar from  vega-lite-docs/docs/image.html
removing sidebar from  vega-lite-docs/docs/density.html
removing sidebar from  vega-lite-docs/docs/band.html
removing sidebar from  vega-lite-docs/docs/window.html
removing sidebar from  vega-lite-docs/docs/label.html
removing sidebar from  vega-lite-docs/docs/index.html
removing sidebar from  vega-lite-docs/docs/geoshape.html
removing sidebar from  vega-lite-docs/docs/impute.html
removing sidebar from  vega-lite-docs/docs/joinaggregate.html
remov

In [25]:
#!rsync -r ../vega-lite/site/_site/* ./vega-lite-docs

In [26]:
# Move HTML into the right directory
#!rsync -r ../vega-lite/site/_site/* ./vega-lite-docs
# exclude static data files since dash doesn't run a local fileserver
!rsync -r ./vega-lite-docs/* $DOCSET_PATH \
     --exclude 'data/*.csv' --exclude 'data/*.arrow' --exclude 'data/*.json' --exclude 'data/*.tsv' \
     --exclude 'sitemap.xml' --exclude 'redirects.json' --exclude 'rollup.config.js' --exclude 'applications.html'

In [27]:
# Prepare to publish in adjacent user contribs repository
# tar --exclude='.DS_Store' -cvzf vega-lite.tgz vega-lite.docset
# cp vega-lite.tgz ../Dash-User-Contributions/docsets/Vega-Lite/