In [7]:
import base64
import json
import os
from datetime import datetime
from pathlib import Path

import structlog
from structlog.stdlib import BoundLogger

from scrayping.github_api import ContentData
from scrayping.github_api import FileInfo
from scrayping.github_api import GithubApiManager
from scrayping.github_api import GithubApiUrl
from scrayping.github_api import RepositoryInfo
from utils.logger_config import configure_logger

BOOK_DIR = os.environ.get("BOOK_DIR", "/books")


def save_chunk(
    data: list[FileInfo] | ContentData | list[RepositoryInfo], path: Path, logger: BoundLogger
) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w") as fp:
        json.dump(data, fp)
    if logger:
        logger.info("Saved data.", path=path)


def save_xhtml(data: str, path: Path, logger: BoundLogger) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w") as fp:
        fp.write(data)
    if logger:
        logger.info("Saved data.", path=path)


def read_dict(path, logger=None) -> list[FileInfo | RepositoryInfo]:
    if logger:
        logger.info("Reading data.", path=path)
    with open(path) as fp:
        return json.load(fp)


def build_text_file_tree_url(book_name: str) -> GithubApiUrl:
    TEXT_FILE_DIR = "src/epub/text"
    BASE_URL = f"https://api.github.com/repos/standardebooks/{book_name}/git/trees/master"
    return BASE_URL + ":" + TEXT_FILE_DIR


def build_toc_file_url(book_name: str) -> GithubApiUrl:
    TOC_FILE_DIR = "src/epub/toc.xhtml"
    BASE_URL = f"https://api.github.com/repos/standardebooks/{book_name}/git/trees/master"
    return BASE_URL + ":" + TOC_FILE_DIR


def tree_info_path(book_name: str) -> Path:
    return tarfetch_book_dir(book_name) / "info.json"


def tarfetch_book_dir(book_name: str) -> Path:
    return Path(f"{BOOK_DIR}/{book_name}")


def scrape_files(
    book_name: str, force=False, logger: BoundLogger = structlog.get_logger(__name__)
) -> None:
    tree_info_chunk = read_dict(tree_info_path(book_name), logger)

    for each_file_info in tree_info_chunk:
        github = GithubApiManager()
        try:
            # EXAMPLE: books/john-maynard-keynes_the-economic-consequences-of-the-peace/chapter-1.xhtml
            tarfetch_path = tarfetch_book_dir(book_name) / each_file_info["path"]
            if not force and tarfetch_path.exists():
                logger.info(
                    "File already exists.",
                    path=tarfetch_path,
                    book_name=book_name,
                    fiil_name=tarfetch_path.name,
                )
                continue
            url = github.valivade_url(each_file_info["url"], "standardebooks", book_name)
            content_data = github.fetch_single_file_content_data(url)
            save_xhtml(
                base64.b64decode(content_data["content"]).decode("utf-8"), tarfetch_path, logger
            )
        except Exception as e:
            logger.exception(
                "Failed to process file.",
                at="scrape_files",
                file_path=each_file_info["path"],
                error=str(e),
            )


def save_tree_info(
    book_name: str, force=False, logger: BoundLogger = structlog.get_logger(__name__)
) -> None:
    if not force and tree_info_path(book_name).exists():
        logger.info(
            "Tree_info already exists.", path=tree_info_path(book_name), book_name=book_name
        )
        return
    text_file_tree_url = build_text_file_tree_url(book_name)
    save_chunk(
        GithubApiManager().fetch_file_tree_info(text_file_tree_url),
        tree_info_path(book_name),
        logger,
    )
    return


def fetch_raw_toc_file(
    book_name: str, force: bool = False, logger: BoundLogger = structlog.get_logger(__name__)
) -> None:
    if not force and Path(f"{BOOK_DIR}/{book_name}/toc.xhtml").exists():
        logger.info("toc.xhtml already exists.", book_name=book_name)
        return
    toc_file_url = build_toc_file_url(book_name)
    toc_file_info = GithubApiManager().fetch_single_file_content_data(toc_file_url)
    toc_file_info["content"] = base64.b64decode(toc_file_info["content"]).decode("utf-8")

    save_chunk(toc_file_info, Path(f"{BOOK_DIR}/{book_name}/toc_file_info.json"), logger)
    save_xhtml(toc_file_info["content"], Path(f"{BOOK_DIR}/{book_name}/toc.xhtml"), logger)


def fetch_all_repositories(
    force=False, logger: BoundLogger = structlog.get_logger(__name__)
) -> None:
    today = datetime.now().strftime("%Y-%m-%d")
    if not force and Path(f"{BOOK_DIR}/{today}_standardebooks_repositories.json").exists():
        return
    github = GithubApiManager()
    repositories = github.fetch_all_user_repositories("standardebooks")
    save_chunk(repositories, Path(f"{BOOK_DIR}/{today}_standardebooks_repositories.json"), logger)


def fetch_30_repositories(
    logger: BoundLogger = structlog.get_logger(__name__),
) -> list[RepositoryInfo]:
    file_path = Path(f"{BOOK_DIR}/trial_standardebooks_repositories.json")
    if file_path.exists():
        return read_dict(Path(f"{BOOK_DIR}/trial_standardebooks_repositories.json"), logger)  # type: ignore
    github = GithubApiManager()
    repositories = github.fetch_user_repositories("standardebooks")
    save_chunk(repositories, file_path, logger)

    # import pprint
    # pprint.pprint(repositories)
    return repositories


def fetch_book_data(book_name: str, logger: BoundLogger = structlog.get_logger(__name__)) -> None:
    fetch_raw_toc_file(book_name, False, logger)
    save_tree_info(book_name, False, logger)
    scrape_files(book_name, False, logger)


def main():
    configure_logger()

    logger = structlog.get_logger(__name__)

    repos = fetch_30_repositories(logger)
    for repo in repos:
        book_name = repo["name"]
        print(book_name)
        fetch_book_data(book_name, logger)

    # save_chunk(repositories, Path(f"{BOOK_DIR}/{today}_standardebooks_repositories.json"), logger)

    # fetch_all_repositories(False, logger)


if __name__ == "__main__":
    main()

[2m2024-04-12 10:09.35[0m [[32m[1minfo     [0m] [1mAPI Request                   [0m [[0m[1m[34mscrayping.github_api[0m][0m [36mmethod[0m=[35mGET[0m [36mmodule[0m=[35mgithub_api[0m [36murl[0m=[35mhttps://api.github.com/users/standardebooks/repos[0m
[2m2024-04-12 10:09.35[0m [[32m[1minfo     [0m] [1mAPI Request                   [0m [[0m[1m[34mscrayping.github_api[0m][0m [36mmethod[0m=[35mGET[0m [36mmodule[0m=[35mgithub_api[0m [36murl[0m=[35mhttps://api.github.com/users/standardebooks/repos[0m
[2m2024-04-12 10:09.35[0m [[32m[1minfo     [0m] [1mAPI Request                   [0m [[0m[1m[34mscrayping.github_api[0m][0m [36mmethod[0m=[35mGET[0m [36mmodule[0m=[35mgithub_api[0m [36murl[0m=[35mhttps://api.github.com/users/standardebooks/repos[0m
[1mStarting new HTTPS connection (1): api.github.com:443[0m
[1mStarting new HTTPS connection (1): api.github.com:443[0m
[1mStarting new HTTPS connection (1): api.github.com:443

PermissionError: [Errno 13] Permission denied: '/books'

In [6]:
configure_logger()

logger = structlog.get_logger(__name__)

repos = fetch_30_repositories(logger)
for repo in repos:
    book_name = repo["name"]
    fetch_raw_toc_file(book_name, False, logger)
    save_tree_info(book_name, False, logger)


[2m2024-04-12 09:51.04[0m [[32m[1minfo     [0m] [1mAPI Request                   [0m [[0m[1m[34mscrayping.github_api[0m][0m [36mmethod[0m=[35mGET[0m [36mmodule[0m=[35mgithub_api[0m [36murl[0m=[35mhttps://api.github.com/users/standardebooks/repos[0m
[2m2024-04-12 09:51.04[0m [[32m[1minfo     [0m] [1mAPI Request                   [0m [[0m[1m[34mscrayping.github_api[0m][0m [36mmethod[0m=[35mGET[0m [36mmodule[0m=[35mgithub_api[0m [36murl[0m=[35mhttps://api.github.com/users/standardebooks/repos[0m
[1mStarting new HTTPS connection (1): api.github.com:443[0m
[1mStarting new HTTPS connection (1): api.github.com:443[0m
[1mhttps://api.github.com:443 "GET /users/standardebooks/repos HTTP/1.1" 200 None[0m
[1mhttps://api.github.com:443 "GET /users/standardebooks/repos HTTP/1.1" 200 None[0m
[2m2024-04-12 09:51.05[0m [[32m[1mdebug    [0m] [1mAPI Request Successful        [0m [[0m[1m[34mscrayping.github_api[0m][0m [36mmodule[0m=[35

KeyError: 'content'

In [None]:
https://api.github.com/repos/standardebooks/john-maynard-keynes_the-economic-consequences-of-the-peace/git/trees/master:src/epub/text\"
https://api.github.com/repos/standardebooks/a-a-milne_the-red-house-myster                           y/git/trees/master:src/epub/toc.xhtml