speedup index generation, takes 1s instead of 5 now
karlicoss committed Jan 10, 2024
1 parent 6d7cdbc commit 6b17242
Showing 2 changed files with 37 additions and 23 deletions.
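The change drops the subprocess call to makeindex.py and instead fans the per-file HTML parsing out over an executor pool, one contiguous batch of files per worker. A minimal standalone sketch of that divide-and-submit pattern (the toy work function and file names are illustrative, not from the commit):

    # Toy sketch: one contiguous batch per worker, submitted up front,
    # results collected in submission order.
    from concurrent.futures import ThreadPoolExecutor
    from more_itertools import divide

    def process_batch(batch: list[str]) -> list[str]:
        # stand-in for the real per-file work (HTML parsing in the commit)
        return [name.upper() for name in batch]

    items = ['a.html', 'b.html', 'c.html', 'd.html', 'e.html']
    with ThreadPoolExecutor(max_workers=2) as pool:
        groups = [list(g) for g in divide(2, items)]  # [['a.html', 'b.html', 'c.html'], ['d.html', 'e.html']]
        futures = [pool.submit(process_batch, g) for g in groups]
        results = [res for fut in futures for res in fut.result()]
    print(results)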
15 changes: 4 additions & 11 deletions exobrain/src/build.py
@@ -427,17 +427,10 @@ def publish_html(cfg: Config, *, pool: Executor) -> None:
     # fmt: on
 
     # todo eh.. implement this as an external site agnostic script
-    logger.debug('generating index')  # note: takes about 5 secs atm
-    (html_dir / 'documents.js').write_text(
-        check_output(
-            [
-                src / 'search/makeindex.py',
-                '--root',
-                html_dir,
-            ],
-            text=True,
-        )
-    )
+    logger.debug('generating index')
+    from search.makeindex import make_index
+    index_js = make_index(root=html_dir, pool=pool)
+    (html_dir / 'documents.js').write_text(index_js)
     logger.debug('generating index: done')
 
     if (html_dir / 'README.html').exists():  # meh, for incremental mode
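For context, the new in-process entry point can be exercised on its own. A hypothetical sketch (the executor construction and the output directory are assumptions; in the commit the pool is created by the caller and html_dir comes from Config):

    from concurrent.futures import ProcessPoolExecutor
    from pathlib import Path

    from search.makeindex import make_index

    html_dir = Path('exobrain/public')  # hypothetical output directory
    with ProcessPoolExecutor() as pool:
        index_js = make_index(root=html_dir, pool=pool)
        (html_dir / 'documents.js').write_text(index_js)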
45 changes: 33 additions & 12 deletions exobrain/src/search/makeindex.py
@@ -1,15 +1,22 @@
 #!/usr/bin/env python3
 from __future__ import annotations
 
+from concurrent.futures import Executor
+import json
 from pathlib import Path
 import re
 from typing import Tuple, Iterable
 
-import bs4  # type: ignore
-import click
+import bs4  # type: ignore[import]
+from more_itertools import divide
 
 
 File = str
 Id = str
 Text = str
 
 
 def walk(node) -> Iterable[Tuple[Id, Text]]:
     id_ = node.get('id')
     if id_ in {
@@ -35,9 +42,8 @@ def walk(node) -> Iterable[Tuple[Id, Text]]:
     node.decompose()  # prevent from being processed by the parent
 
 
-def walk_all(root: Path) -> Iterable[Tuple[File, Id, Text]]:
-    htmls = list(sorted(root.rglob('*.html')))
-    assert len(htmls) > 0, root
+def walk_batch(htmls: list[Path], root: Path):
+    ress = []
     for html in htmls:
         if html.name == 'sitemap.html':
             continue
@@ -46,25 +52,40 @@ def walk_all(root: Path) -> Iterable[Tuple[File, Id, Text]]:
         soup = bs4.BeautifulSoup(html.read_text(), 'lxml').find('body')
         rpath = html.relative_to(root)
         for key, res in walk(soup):
-            yield (str(rpath), key, res)
+            ress.append((str(rpath), key, res))
+    return ress
 
 
-def run(root: Path) -> None:
-    import json
+def walk_all(*, root: Path, pool: Executor) -> Iterable[Tuple[File, Id, Text]]:
+    htmls = sorted(root.rglob('*.html'))
+    assert len(htmls) > 0, root
+
+    workers = pool._max_workers  # type: ignore[attr-defined]
+    groups = [list(group) for group in divide(workers, htmls)]
+    futures = [pool.submit(walk_batch, group, root) for group in groups]
+    for group, fut in zip(groups, futures):
+        try:
+            yield from fut.result()
+        except Exception as e:
+            raise RuntimeError(f'error while processing {group}') from e
+
+
+def make_index(*, root: Path, pool: Executor) -> str:
     documents = [{
         'file': file,
         'id'  : id,
         'text': text,
-    } for file, id, text in walk_all(root)]
-    print('/* AUTOGENERATED by makeindex.py */')
-    print('let documents = ' + json.dumps(documents, indent=1, ensure_ascii=False))
+    } for file, id, text in walk_all(root=root, pool=pool)]
+    return f'''
+/* AUTOGENERATED by makeindex.py */
+let documents = {json.dumps(documents, indent=1, ensure_ascii=False)}
+'''.lstrip()
 
 
+import click
 @click.command()
 @click.option('--root', type=Path, required=True)
 def main(root: Path) -> None:
-    run(root)
+    print(make_index(root=root, pool=None))  # type: ignore[arg-type]
 
 
 if __name__ == '__main__':
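One caveat visible in the diff: main() now passes pool=None into make_index (hence the type: ignore), while walk_all dereferences pool._max_workers, so running the script from the command line would raise an AttributeError. A sketch of a working standalone entry point, assuming a thread pool is acceptable for CLI use (not part of the commit):

    # Hypothetical replacement for main(); the committed version passes pool=None.
    from concurrent.futures import ThreadPoolExecutor

    @click.command()
    @click.option('--root', type=Path, required=True)
    def main(root: Path) -> None:
        with ThreadPoolExecutor() as pool:
            print(make_index(root=root, pool=pool))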
