In [None]:
# | default_exp _index_file_generator

In [None]:
# | export

from typing import *

import typer
from llama_index.readers.schema.base import Document
from llama_index import download_loader, GPTSimpleVectorIndex

from fastkafkachat._helper import get_all_links_from_website, get_service_context, write_compressed_json

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import time
from tempfile import TemporaryDirectory
from pathlib import Path

from typer.testing import CliRunner

from fastkafkachat._helper import load_compressed_json

In [None]:
# Typer's in-process test runner; later test cells use it to invoke the CLI app
# without spawning a subprocess.
runner = CliRunner()

In [None]:
# | export


def _index_website_data(
    start_url: str = "https://fastkafka.airt.ai",
    data_dir: str = "./data",
) -> None:
    """Crawl a website, build a vector index of its pages and save it to disk.

    The links reachable from ``start_url`` are collected with
    ``get_all_links_from_website``, loaded through the ``BeautifulSoupWebReader``
    loader and turned into a ``GPTSimpleVectorIndex``. The serialised index is
    handed to ``write_compressed_json`` with the target path
    ``<data_dir>/website_index.json``.

    Args:
        start_url: The starting URL of the website. Defaults to "https://fastkafka.airt.ai".
        data_dir: The data directory path to save the index file generated by parsing
            all the website links. The directory (and any missing parents) is created
            if it does not exist yet.
    """
    # Local import: this exported cell does not import pathlib at module level.
    from pathlib import Path

    # Create the output directory *before* the slow crawl/embedding step so that an
    # unusable path fails immediately instead of after the index has been built.
    output_dir = Path(data_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    urls = list(get_all_links_from_website(start_url))

    typer.echo("\nIndexing the contents from the following URL's: \n")
    typer.echo("\n".join(sorted(urls)))

    # download_loader returns the reader class; it is instantiated before use.
    BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
    loader = BeautifulSoupWebReader()
    documents = loader.load_data(urls=urls)

    service_context = get_service_context()
    index = GPTSimpleVectorIndex.from_documents(
        documents, service_context=service_context
    )

    # NOTE(review): the helper presumably appends a ".gz" suffix to this path (the
    # notebook test loads "website_index.json.gz") — confirm in fastkafkachat._helper.
    write_compressed_json(
        index.save_to_string(), str(output_dir / "website_index.json")
    )
    typer.echo("\nIndexing successfully completed.")

In [None]:
# End-to-end check: index the default website into a throwaway directory, then
# load the written file back and verify it deserialises into a vector index.
with TemporaryDirectory() as d:
    data_path = Path(d) / "data"
    data_path.mkdir(parents=True)

    start_time = time.time()
    data_dir = str(data_path)
    # _index_website_data returns None; its only result is the file written under
    # data_dir, so the return value is deliberately not bound to a name.
    _index_website_data(data_dir=data_dir)

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Elapsed time: {elapsed_time} seconds")

    # The index is read back from "website_index.json.gz" inside the data directory.
    index_json = load_compressed_json(str(data_path / "website_index.json.gz"))
    index = GPTSimpleVectorIndex.load_from_string(index_json)
    print(index)
    assert isinstance(index, GPTSimpleVectorIndex)

Indexing the contents from the following URL's: 

https://fastkafka.airt.ai
https://fastkafka.airt.ai/docs
https://fastkafka.airt.ai/docs/CHANGELOG
https://fastkafka.airt.ai/docs/api/fastkafka
https://fastkafka.airt.ai/docs/api/fastkafka/KafkaEvent
https://fastkafka.airt.ai/docs/api/fastkafka/testing/ApacheKafkaBroker
https://fastkafka.airt.ai/docs/api/fastkafka/testing/LocalRedpandaBroker
https://fastkafka.airt.ai/docs/api/fastkafka/testing/Tester
https://fastkafka.airt.ai/docs/cli/fastkafka
https://fastkafka.airt.ai/docs/cli/run_fastkafka_server_process
https://fastkafka.airt.ai/docs/guides/Guide_04_Github_Actions_Workflow
https://fastkafka.airt.ai/docs/guides/Guide_05_Lifespan_Handler
https://fastkafka.airt.ai/docs/guides/Guide_06_Benchmarking_FastKafka
https://fastkafka.airt.ai/docs/guides/Guide_07_Encoding_and_Decoding_Messages_with_FastKafka
https://fastkafka.airt.ai/docs/guides/Guide_11_Consumes_Basics
https://fastkafka.airt.ai/docs/guides/Guide_21_Produces_Basics
https://fastka

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 94770 tokens




Indexing successfully completed.
Elapsed time: 54.0496711730957 seconds
<llama_index.indices.vector_store.vector_indices.GPTSimpleVectorIndex object>


In [None]:
# | export
# Typer application object that hosts the indexing command defined below.
_app = typer.Typer()


@_app.command(
    help="Recursively parses all anchor links found on the website and index website data from the given start URL",
)
def index_website_data(
    start_url: str = typer.Option(
        "https://fastkafka.airt.ai", help="The starting URL of the website"
    ),
    data_dir: str = typer.Option(
        "./data/",
        help="The data directory path to save the index file generated by parsing all the website links.",
    ),
) -> None:
    """Command-line wrapper around ``_index_website_data``.

    Both options are forwarded unchanged. If indexing raises any exception, it
    is echoed in red to stderr and the command terminates with exit status 1.

    Args:
        start_url: The starting URL of the website.
        data_dir: Directory in which the generated index file is saved.

    Raises:
        typer.Exit: With code 1 when indexing fails.
    """
    try:
        _index_website_data(start_url=start_url, data_dir=data_dir)
    except Exception as err:
        # Report the failure on stderr and signal it through the exit status.
        typer.secho(err, err=True, fg=typer.colors.RED)
        raise typer.Exit(code=1)

In [None]:
# Invoke the Typer app in-process with --help and capture the result.
result = runner.invoke(_app, ["--help"])