speedup index generation, takes 1s instead of 5 now
karlicoss committed Jan 10, 2024
1 parent 6d7cdbc commit 6b17242
Showing 2 changed files with 37 additions and 23 deletions.
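The change drops the subprocess call to makeindex.py and instead fans the per-file HTML parsing out over an executor pool, one contiguous batch of files per worker. A minimal standalone sketch of that divide-and-submit pattern (the toy work function and file names are illustrative, not from the commit):

    # Toy sketch: one contiguous batch per worker, submitted up front,
    # results collected in submission order.
    from concurrent.futures import ThreadPoolExecutor
    from more_itertools import divide

    def process_batch(batch: list[str]) -> list[str]:
        # stand-in for the real per-file work (HTML parsing in the commit)
        return [name.upper() for name in batch]

    items = ['a.html', 'b.html', 'c.html', 'd.html', 'e.html']
    with ThreadPoolExecutor(max_workers=2) as pool:
        groups = [list(g) for g in divide(2, items)]  # [['a.html', 'b.html', 'c.html'], ['d.html', 'e.html']]
        futures = [pool.submit(process_batch, g) for g in groups]
        results = [res for fut in futures for res in fut.result()]
    print(results)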
15 changes: 4 additions & 11 deletions exobrain/src/build.py
@@ -427,17 +427,10 @@ def publish_html(cfg: Config, *, pool: Executor) -> None:
     # fmt: on
 
     # todo eh.. implement this as an external site agnostic script
-    logger.debug('generating index')  # note: takes about 5 secs atm
-    (html_dir / 'documents.js').write_text(
-        check_output(
-            [
-                src / 'search/makeindex.py',
-                '--root',
-                html_dir,
-            ],
-            text=True,
-        )
-    )
+    logger.debug('generating index')
+    from search.makeindex import make_index
+    index_js = make_index(root=html_dir, pool=pool)
+    (html_dir / 'documents.js').write_text(index_js)
     logger.debug('generating index: done')
 
     if (html_dir / 'README.html').exists():  # meh, for incremental mode
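For context, the new in-process entry point can be exercised on its own. A hypothetical sketch (the executor construction and the output directory are assumptions; in the commit the pool is created by the caller and html_dir comes from Config):

    from concurrent.futures import ProcessPoolExecutor
    from pathlib import Path

    from search.makeindex import make_index

    html_dir = Path('exobrain/public')  # hypothetical output directory
    with ProcessPoolExecutor() as pool:
        index_js = make_index(root=html_dir, pool=pool)
        (html_dir / 'documents.js').write_text(index_js)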
45 changes: 33 additions & 12 deletions exobrain/src/search/makeindex.py
@@ -1,15 +1,22 @@
 #!/usr/bin/env python3
 from __future__ import annotations
 
+from concurrent.futures import Executor
+import json
 from pathlib import Path
 import re
 from typing import Tuple, Iterable
 
-import bs4  # type: ignore
-import click
+import bs4  # type: ignore[import]
+from more_itertools import divide
 
 
 File = str
 Id = str
 Text = str
 
 
 def walk(node) -> Iterable[Tuple[Id, Text]]:
     id_ = node.get('id')
     if id_ in {
@@ -35,9 +42,8 @@ def walk(node) -> Iterable[Tuple[Id, Text]]:
     node.decompose()  # prevent from being processed by the parent
 
 
-def walk_all(root: Path) -> Iterable[Tuple[File, Id, Text]]:
-    htmls = list(sorted(root.rglob('*.html')))
-    assert len(htmls) > 0, root
+def walk_batch(htmls: list[Path], root: Path):
+    ress = []
     for html in htmls:
         if html.name == 'sitemap.html':
             continue
@@ -46,25 +52,40 @@ def walk_all(root: Path) -> Iterable[Tuple[File, Id, Text]]:
         soup = bs4.BeautifulSoup(html.read_text(), 'lxml').find('body')
         rpath = html.relative_to(root)
         for key, res in walk(soup):
-            yield (str(rpath), key, res)
+            ress.append((str(rpath), key, res))
+    return ress
 
 
-def run(root: Path) -> None:
-    import json
+def walk_all(*, root: Path, pool: Executor) -> Iterable[Tuple[File, Id, Text]]:
+    htmls = sorted(root.rglob('*.html'))
+    assert len(htmls) > 0, root
+
+    workers = pool._max_workers  # type: ignore[attr-defined]
+    groups = [list(group) for group in divide(workers, htmls)]
+    futures = [pool.submit(walk_batch, group, root) for group in groups]
+    for group, fut in zip(groups, futures):
+        try:
+            yield from fut.result()
+        except Exception as e:
+            raise RuntimeError(f'error while processing {group}') from e
+
+
+def make_index(*, root: Path, pool: Executor) -> str:
     documents = [{
         'file': file,
         'id'  : id,
         'text': text,
-    } for file, id, text in walk_all(root)]
-    print('/* AUTOGENERATED by makeindex.py */')
-    print('let documents = ' + json.dumps(documents, indent=1, ensure_ascii=False))
+    } for file, id, text in walk_all(root=root, pool=pool)]
+    return f'''
+/* AUTOGENERATED by makeindex.py */
+let documents = {json.dumps(documents, indent=1, ensure_ascii=False)}
+'''.lstrip()
 
 
+import click
 @click.command()
 @click.option('--root', type=Path, required=True)
 def main(root: Path) -> None:
-    run(root)
+    print(make_index(root=root, pool=None))  # type: ignore[arg-type]
 
 
 if __name__ == '__main__':
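One caveat visible in the diff: main() now passes pool=None into make_index (hence the type: ignore), while walk_all dereferences pool._max_workers, so running the script from the command line would raise an AttributeError. A sketch of a working standalone entry point, assuming a thread pool is acceptable for CLI use (not part of the commit):

    # Hypothetical replacement for main(); the committed version passes pool=None.
    from concurrent.futures import ThreadPoolExecutor

    @click.command()
    @click.option('--root', type=Path, required=True)
    def main(root: Path) -> None:
        with ThreadPoolExecutor() as pool:
            print(make_index(root=root, pool=pool))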
