In [None]:
import requests
from xml.etree import ElementTree
from preprocess_documents2 import create_vector_db

In [None]:
def get_pydantic_ai_docs_urls(
    sitemap_url="https://muscleandstrength.com/sitemap.xml?page=1",
    timeout=30,
):
    """
    Fetches every URL listed in a sitemap.

    The function name is historical and is kept so existing callers keep
    working: by default it downloads page 1 of the muscleandstrength.com
    sitemap, not the Pydantic AI documentation sitemap.

    Args:
        sitemap_url (str): URL of the sitemap XML document to download.
            Defaults to https://muscleandstrength.com/sitemap.xml?page=1.
        timeout (float | tuple): Timeout in seconds handed to
            ``requests.get`` so a stalled server cannot hang the notebook.

    Returns:
        List[str]: Text of every ``<loc>`` element in the sitemap namespace,
        stripped of surrounding whitespace, with empty entries dropped.
        An empty list is returned if the download or the parsing fails.
    """
    # Browser-like request headers.
    # NOTE(review): presumably needed because the site rejects the default
    # requests User-Agent -- confirm before removing.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    try:
        response = requests.get(sitemap_url, headers=headers, timeout=timeout)
        response.raise_for_status()

        # Parse the XML
        root = ElementTree.fromstring(response.content)

        # Sitemap elements live in the sitemaps.org namespace, so the lookup
        # has to be namespace-qualified.
        namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

        urls = []
        for loc in root.findall('.//ns:loc', namespace):
            # An empty <loc/> has text None, and pretty-printed sitemaps pad
            # the URL with whitespace; neither should reach the caller.
            text = (loc.text or "").strip()
            if text:
                urls.append(text)
        return urls
    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an empty
        # list so the following notebook cells can still run.
        print(f"Error fetching sitemap: {e}")
        return []

In [None]:
# Fetch the sitemap URLs once and keep them in `urls` for the cells below.
# The helper returns an empty list when the request or parsing fails, so a
# count of 0 here may mean an error was printed above rather than an empty sitemap.
urls = get_pydantic_ai_docs_urls()
print(f"Found {len(urls)} URLs")


In [None]:
# Quick sanity check: print the first dozen URLs, one per line.
for url in urls[:12]:
    print(url)

In [None]:
# Show only the last dozen entries: displaying the whole list floods the cell
# output, and the head of the list is already printed in the previous cell.
urls[-12:]

In [None]:
def load_urls_from_file(file_path):
    """
    Loads URLs from a text file that holds one URL per line.

    The file is decoded as UTF-8 (a leading byte-order mark is ignored) so the
    result does not depend on the platform's default encoding. Whitespace
    around each line is stripped and blank lines are skipped, so only real
    entries are returned.

    Args:
        file_path (str): The path to the text file containing URLs.

    Returns:
        List[str]: The non-empty, stripped lines of the file, in file order.
        An empty list is returned if the file cannot be opened or decoded.
    """
    try:
        # 'utf-8-sig' reads plain UTF-8 too, but drops a BOM that would
        # otherwise be glued to the front of the first URL.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            raw_lines = file.read().splitlines()
        stripped_lines = (line.strip() for line in raw_lines)
        return [line for line in stripped_lines if line]
    except Exception as e:
        # Deliberate best-effort: report the problem and return an empty list
        # so the following notebook cells can still run.
        print(f"Error loading URLs from file: {e}")
        return []

# Example usage: see the next cell, which calls load_urls_from_file(file_path).


In [None]:
# Load the additional URLs kept in a local text file. The path is relative,
# so it is resolved against the notebook's current working directory.
# load_urls_from_file returns an empty list on failure, so a count of 0 may
# mean the file could not be read.
file_path = 'RAG_Data_Collection/recipe_urls.txt'  # Replace with the path to your text file
recipe_urls = load_urls_from_file(file_path)
print(f"Loaded {len(recipe_urls)} URLs from file")


In [None]:
# Combine both sources: sitemap URLs first, then the URLs read from the file.
# NOTE(review): plain concatenation keeps a URL twice if it appears in both
# sources -- confirm that create_vector_db tolerates duplicates.
urls_list = urls + recipe_urls
print(f"Total URLs: {len(urls_list)}")

In [None]:
# Hand the combined URL list to create_vector_db (imported from
# preprocess_documents2). Its behaviour is not visible in this notebook;
# presumably it fetches and indexes each URL, so this cell may be slow -- TODO confirm.
create_vector_db(urls_list)