In [85]:
import base64
import json
import os
from datetime import datetime
from pathlib import Path

import structlog
from structlog.stdlib import BoundLogger

from scrayping.github_api import ContentData
from scrayping.github_api import FileInfo
from scrayping.github_api import GithubApiManager
from scrayping.github_api import GithubApiUrl
from scrayping.github_api import RepositoryInfo
from scrayping.github_api import build_github_file_api
from scrayping.github_api import build_github_tree_api
from utils.data_io import read_dict
from utils.data_io import save_chunk
from utils.data_io import save_xhtml
from utils.logger_config import configure_logger

# Root folder that the cells below scan for book directories.
# Taken from the BOOK_DIR environment variable, falling back to "/books".
BOOK_DIR = Path(os.getenv("BOOK_DIR", "/books"))


In [86]:
# Show every entry found directly under BOOK_DIR.
[*BOOK_DIR.glob("*")]

[PosixPath('/home/user/dev/kasi-x/akizora/books/aleksandr-kuprin_yama_bernard-guilbert-guerney'),
 PosixPath('/home/user/dev/kasi-x/akizora/books/agatha-christie_the-mystery-of-the-blue-train'),
 PosixPath('/home/user/dev/kasi-x/akizora/books/agatha-christie_the-man-in-the-brown-suit'),
 PosixPath('/home/user/dev/kasi-x/akizora/books/alexandre-dumas_the-three-musketeers_william-robson'),
 PosixPath('/home/user/dev/kasi-x/akizora/books/aldous-huxley_those-barren-leaves'),
 PosixPath('/home/user/dev/kasi-x/akizora/books/agatha-christie_the-murder-on-the-links'),
 PosixPath('/home/user/dev/kasi-x/akizora/books/aleksandr-kuprin_the-duel_george-allen-unwin'),
 PosixPath('/home/user/dev/kasi-x/akizora/books/aldous-huxley_crome-yellow'),
 PosixPath('/home/user/dev/kasi-x/akizora/books/aleksandr-kuprin_short-fiction_s-koteliansky_j-m-murry_stephen-graham_rosa-savory-graham_leo-pasvols'),
 PosixPath('/home/user/dev/kasi-x/akizora/books/a-w-tozer_the-pursuit-of-god'),
 PosixPath('/home/user/dev/

In [87]:
# Show the first table-of-contents entry.
# NOTE(review): in the visible part of this notebook `chapters` is only assigned
# inside the per-book loop in a later cell, so this cell appears to rely on
# leftover kernel state — confirm the intended run order.
chapters[0]

{'title': 'Titlepage',
 'href': 'text/titlepage.xhtml',
 'nest_level': 0,
 'span': '',
 'query': "//html/body/nav[@id='toc'][1]/ol/li[1]",
 'url': '',
 'subchapters': []}

In [88]:
def get_max_nest_level(chapters, verbose=True):
    """Return the deepest ``nest_level`` found in a (possibly nested) chapter list.

    The list is walked depth-first, descending into each entry's
    ``"subchapters"`` list when it is non-empty.

    Args:
        chapters: Iterable of chapter dicts. The keys read are ``"title"``,
            ``"nest_level"`` and ``"subchapters"``; all of them may be missing.
        verbose: When true (the default), print the title and the nest level of
            every leaf chapter (an entry without subchapters), each on its own
            line. Pass ``False`` to only compute the result.

    Returns:
        The largest ``nest_level`` seen on any chapter at any depth, or ``-1``
        when ``chapters`` is empty or no entry carries a ``"nest_level"``.
    """
    deepest = -1
    for chapter in chapters:
        level = chapter.get("nest_level", -1)
        # A stored None cannot be compared with ints; ignore it for the maximum.
        if level is not None:
            deepest = max(deepest, level)

        if sub_chapters := chapter.get("subchapters"):
            # Propagate the result of the recursion instead of discarding it.
            deepest = max(deepest, get_max_nest_level(sub_chapters, verbose))
        elif verbose:
            # -1 is printed as a placeholder when a key is absent.
            print(chapter.get("title", -1))
            print(level)
    return deepest

In [89]:
# For every book folder under BOOK_DIR, load its toc.json and print the
# title / nest level of each leaf chapter, preceded by a separator and the
# folder name. Non-directories and Jupyter's checkpoint folder are skipped.
for repo_dir in BOOK_DIR.glob("*"):
    if repo_dir.is_dir() and repo_dir.name != ".ipynb_checkpoints":
        chapters = read_dict(repo_dir / "toc.json")
        print("-" * 30, repo_dir.name, sep="\n")
        get_max_nest_level(chapters)

------------------------------
aleksandr-kuprin_yama_bernard-guilbert-guerney
Titlepage
0
Imprint
0
Epigraph
0
Author’s Dedication
0
Translator’s Dedication
0
Introduction
0
Translator’s Note
0
I
2
II
2
III
2
IV
2
V
2
VI
2
VII
2
VIII
2
IX
2
X
2
XI
2
XII
2
XIII
2
I
2
II
2
III
2
IV
2
V
2
VI
2
VII
2
VIII
2
IX
2
X
2
XI
2
XII
2
XIII
2
XIV
2
XV
2
XVI
2
XVII
2
I
2
II
2
III
2
IV
2
V
2
VI
2
VII
2
VIII
2
IX
2
Author’s Postscript
0
Endnotes
0
List of Illustrations
0
Colophon
0
Uncopyright
0
------------------------------
agatha-christie_the-mystery-of-the-blue-train
Titlepage
0
Imprint
0
Dedication
0
I: The Man with the White Hair
1
II: 
1
III: Heart of Fire
1
IV: In Curzon Street
1
V: A Useful Gentleman
1
VI: Mirelle
1
VII: Letters
1
VIII: Lady Tamplin Writes a Letter
1
IX: An Offer Refused
1
X: On the Blue Train
1
XI: Murder
1
XII: At the Villa Marguerite
1
XIII: Van Aldin Gets a Telegram
1
XIV: Ada Mason’s Story
1
XV: The Comte de la Roche
1
XVI: Poirot Discusses the Case
1
XVII: An Aristocrat

In [90]:
# Scratch note kept as a bare string: three book directory names.
# NOTE(review): presumably the books singled out for a closer look at their
# TOC structure — the reason is not stated here; TODO confirm.
'''
aleksandr-kuprin_short-fiction_s-koteliansky_j-m-murry_stephen-graham_rosa-savory-graham_leo-pasvols
alexander-pushkin_eugene-onegin_henry-spalding
adam-smith_the-wealth-of-nations
'''

'\naleksandr-kuprin_short-fiction_s-koteliansky_j-m-murry_stephen-graham_rosa-savory-graham_leo-pasvols\nalexander-pushkin_eugene-onegin_henry-spalding\nadam-smith_the-wealth-of-nations\n'

In [94]:
from pprint import pprint

# Load and pretty-print the complete table of contents of a single book.
repo_dir = BOOK_DIR / "adam-smith_the-wealth-of-nations"
toc = read_dict(repo_dir / "toc.json")

print("-" * 30)
# A stray "っ" after this call made the cell a SyntaxError on re-run; removed.
pprint(toc)

------------------------------
[{'href': 'text/titlepage.xhtml',
  'name': 'Titlepage',
  'nest_level': 0,
  'number': '',
  'query': "//html/body/nav[@id='toc'][1]/ol/li[1]",
  'subchapters': [],
  'title': 'Titlepage',
  'url': ''},
 {'href': 'text/imprint.xhtml',
  'name': 'Imprint',
  'nest_level': 0,
  'number': '',
  'query': "//html/body/nav[@id='toc'][1]/ol/li[2]",
  'subchapters': [],
  'title': 'Imprint',
  'url': ''},
 {'href': 'text/editors-introduction.xhtml',
  'name': 'Editor’s Introduction',
  'nest_level': 0,
  'number': '',
  'query': "//html/body/nav[@id='toc'][1]/ol/li[3]",
  'subchapters': [],
  'title': 'Editor’s Introduction',
  'url': ''},
 {'href': 'text/introduction.xhtml',
  'name': 'Introduction and Plan of the Work',
  'nest_level': 0,
  'number': '',
  'query': "//html/body/nav[@id='toc'][1]/ol/li[4]",
  'subchapters': [],
  'title': 'Introduction and Plan of the Work',
  'url': ''},
 {'href': 'text/halftitlepage.xhtml',
  'name': 'The Wealth of Nations',
