In [None]:
# | default_exp _helper

In [None]:
# | export


import gzip
import os
from pathlib import Path
from typing import *
from urllib.parse import urlparse, urljoin
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup
from langchain.chat_models import ChatOpenAI
from llama_index import (
    GPTSimpleVectorIndex,
    SimpleDirectoryReader,
    LLMPredictor,
    ServiceContext,
)

In [None]:
from tempfile import TemporaryDirectory

In [None]:
# | export

def get_all_links_from_website(start_url: str, visited: Optional[set] = None) -> Set[str]:
    """Crawl a website and collect every same-host URL reachable from ``start_url``.

    Every page is downloaded and parsed; each ``<a href>`` found on it is resolved
    against that page's site root (scheme, host and port), stripped of its
    ``#fragment`` and of surrounding slashes, and kept only when its hostname
    equals the hostname of the page it was found on. Newly discovered URLs are
    crawled in turn until no unvisited URL is left.

    The crawl uses an explicit stack instead of recursion, so the number of pages
    is not limited by the interpreter's recursion limit, and every HTTP response
    is closed as soon as it has been parsed.

    Args:
        start_url: The URL at which the crawl starts. It is always fetched and is
            added to the result exactly as given.
        visited: Optional. A set of URLs that must not be crawled again. The set
            is updated in place and is the object that is returned. Defaults to
            a new empty set.

    Returns:
        The ``visited`` set, extended with ``start_url`` and every same-host
        link discovered during the crawl.

    Raises:
        Any exception raised while opening or parsing a page (for example by
        ``urlopen`` for an unreachable URL) propagates and aborts the crawl.
    """
    if visited is None:
        visited = set()

    visited.add(start_url)
    pending = [start_url]  # pages that were discovered but not downloaded yet

    while pending:
        page_url = pending.pop()
        page = urlparse(page_url)
        # netloc (not hostname) keeps an explicit port such as ":8000" in the base.
        base_url = f"{page.scheme}://{page.netloc}"

        # The context manager closes the connection once the page is parsed.
        with urlopen(Request(page_url)) as html_page:
            soup = BeautifulSoup(html_page, "lxml")

        for anchor in soup.find_all("a", href=True):
            url = urljoin(base_url, anchor["href"]).split("#")[0].strip("/")
            # Non-HTTP links (e.g. "mailto:") have no hostname and are dropped here.
            if urlparse(url).hostname == page.hostname and url not in visited:
                # Mark at discovery time so a URL is never queued twice.
                visited.add(url)
                pending.append(url)

    return visited

In [None]:
# Smoke test: crawl https://fastkafka.airt.ai (the crawler downloads pages, so
# this cell needs network access).
all_links = get_all_links_from_website("https://fastkafka.airt.ai")
print(all_links)
print(f"\n\n{len(all_links)=}")

# The crawl must return at least one URL and must contain these two known pages.
assert len(all_links) > 0
assert 'https://fastkafka.airt.ai/docs/CHANGELOG' in all_links
assert 'https://fastkafka.airt.ai/docs' in all_links

{'https://fastkafka.airt.ai/docs', 'https://fastkafka.airt.ai/docs/cli/fastkafka', 'https://fastkafka.airt.ai/docs/guides/Guide_04_Github_Actions_Workflow', 'https://fastkafka.airt.ai/docs/guides/Guide_11_Consumes_Basics', 'https://fastkafka.airt.ai/docs/guides/Guide_30_Using_docker_to_deploy_fastkafka', 'https://fastkafka.airt.ai/docs/api/fastkafka', 'https://fastkafka.airt.ai/docs/CHANGELOG', 'https://fastkafka.airt.ai/docs/cli/run_fastkafka_server_process', 'https://fastkafka.airt.ai/docs/guides/Guide_21_Produces_Basics', 'https://fastkafka.airt.ai/docs/guides/Guide_31_Using_redpanda_to_test_fastkafka', 'https://fastkafka.airt.ai', 'https://fastkafka.airt.ai/docs/api/fastkafka/KafkaEvent', 'https://fastkafka.airt.ai/docs/api/fastkafka/testing/LocalRedpandaBroker', 'https://fastkafka.airt.ai/docs/api/fastkafka/testing/ApacheKafkaBroker', 'https://fastkafka.airt.ai/docs/api/fastkafka/testing/Tester', 'https://fastkafka.airt.ai/docs/guides/Guide_22_Partition_Keys'}


len(all_links)=16


In [None]:
# | export

def get_service_context() -> ServiceContext:
    """Build the service context used with the gpt-3.5-turbo chat model.

    A ``ChatOpenAI`` model (``gpt-3.5-turbo``, temperature 0) is wrapped in an
    ``LLMPredictor`` and handed to ``ServiceContext.from_defaults`` together
    with a chunk size limit of 512.

    Returns:
        A ServiceContext object with an LLMPredictor and a chunk size limit.
    """
    chat_model = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
    return ServiceContext.from_defaults(
        llm_predictor=LLMPredictor(llm=chat_model),
        chunk_size_limit=512,
    )

In [None]:
# Build a context and check that the factory returns exactly a ServiceContext.
service_context = get_service_context()

print(service_context)
assert type(service_context) == ServiceContext

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


ServiceContext(llm_predictor=<llama_index.llm_predictor.base.LLMPredictor object>, prompt_helper=<llama_index.indices.prompt_helper.PromptHelper object>, embed_model=<llama_index.embeddings.openai.OpenAIEmbedding object>, node_parser=<llama_index.node_parser.simple.SimpleNodeParser object>, llama_logger=<llama_index.logger.base.LlamaLogger object>, chunk_size_limit=512)


In [None]:
# | export

def write_compressed_json(json_string: str, file_path: Union[str, Path]) -> None:
    """Gzip-compress a JSON string and write it to ``file_path`` + ``.gz``.

    The string is encoded as UTF-8 and written in binary mode, so its bytes
    (including line endings) are stored unchanged.

    Args:
        json_string: The JSON string to compress and write to disk.
        file_path: The path to write the compressed JSON file to, without the
            .gz extension. May be a ``str`` or a ``pathlib.Path``.
    """
    # os.fspath turns a Path into str so that the suffix can be appended;
    # a plain str passes through untouched.
    gz_path = os.fspath(file_path) + '.gz'
    with gzip.open(gz_path, 'wb') as f_out:
        f_out.write(json_string.encode('utf-8'))
        
        
def load_compressed_json(gziped_json_file_path: str) -> str:
    """Read a gzip-compressed JSON file and return its content as text.

    The file is read in binary mode and the decompressed bytes are decoded as
    UTF-8; the text is returned as is, without being parsed.

    Args:
        gziped_json_file_path: The path to the compressed JSON file to read.

    Returns:
        The decompressed file content as a string.
    """
    with gzip.open(gziped_json_file_path, 'rb') as archive:
        return archive.read().decode('utf-8')

In [None]:
# Round-trip test: write a JSON string compressed, read it back, compare.
with TemporaryDirectory() as d:
    data_path = Path(d) / "data"
    data_path.mkdir(parents=True)
    
    expected =  '{ "name":"John", "age":30, "city":"New York"}'
    # Path is given without ".gz"; write_compressed_json appends the extension.
    file_path=f"{d}/data/website_index.json"
    
    write_compressed_json(expected, file_path)
    assert Path(file_path + '.gz').exists()
    
    actual = load_compressed_json(file_path + '.gz')
    print(actual)
    
    # The decompressed text must equal the original string exactly.
    assert actual == expected

{ "name":"John", "age":30, "city":"New York"}
