In [None]:
# Imports
import time
import faiss
import json
import requests
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium_stealth import stealth
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin
from tqdm.notebook import tqdm
from collections import deque

# Constants
# `root` is the prefix every crawled URL must start with (checked in
# scrape_recursive); `base_url` is the page the crawl starts from.
root = 'https://www.web.com'
base_url = 'https://www.web.com'

# Ollama HTTP endpoints on the local server, used by the embedding/RAG cell:
# model metadata (show), embeddings, and text generation.
ollama_url_inf = "http://localhost:11434/api/show"
ollama_url_emb = "http://localhost:11434/api/embeddings"
ollama_url_gen = "http://localhost:11434/api/generate"
ollama_model_name = "llama3.2:latest"

VERBOSE = True  # gates error/diagnostic prints
DEPTH = 0  # default crawl depth; 0 means only the start page is scraped

# Build the option set for a headless Chrome session
chrome_options = Options()
for _chrome_flag in (
    '--headless',
    '--disable-gpu',
    '--no-sandbox',
    '--disable-dev-shm-usage',
):
    chrome_options.add_argument(_chrome_flag)

# Start the browser with the default driver service
service = Service()
driver = webdriver.Chrome(service=service, options=chrome_options)

# Apply selenium-stealth settings to the running driver
stealth(
    driver,
    languages=["en-US", "en"],
    vendor="Google Inc.",
    platform="Win32",
    webgl_vendor="Intel Inc.",
    renderer="Intel Iris OpenGL Engine",
    fix_hairline=True,
)

def extract_clean_text_from_html(html_content):
    """Return the text of an HTML document as one whitespace-normalised string.

    ``script``, ``style``, ``footer``, ``header`` and ``nav`` elements are
    removed before the text is collected.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')

    # Drop elements whose content is not wanted in the output
    for unwanted in parsed.find_all(["script", "style", "footer", "header", "nav"]):
        unwanted.decompose()

    # Collapse every run of whitespace into a single space
    raw_text = parsed.get_text(separator=' ', strip=True)
    return ' '.join(raw_text.split())

def scrape_recursive(base_url, depth=DEPTH):
    """Crawl pages breadth-first from ``base_url`` and collect their cleaned text.

    Only URLs that start with the module-level ``root`` and contain no ``#``
    are fetched. Pages are loaded with the module-level Selenium ``driver``.
    An error on a single page is reported (when ``VERBOSE``) and the crawl
    continues with the next queued URL.

    Args:
        base_url: URL the crawl starts from.
        depth: Maximum link depth to follow; 0 scrapes only ``base_url``.

    Returns:
        A list of ``(url, text)`` tuples, one per page fetched, in the order
        the pages were visited.
    """
    visited_urls = set()
    text_data = []
    queue = deque([(base_url, 0)])  # (url, current_depth)

    while queue:
        url, current_depth = queue.popleft()

        # Skip duplicates, too-deep pages, off-site URLs and fragment links
        if url in visited_urls or current_depth > depth:
            continue
        if not url.startswith(root) or '#' in url:
            continue

        # Log only URLs that are actually going to be fetched
        print("Processing: ", url)
        visited_urls.add(url)

        try:
            driver.get(url)
            time.sleep(1)  # fixed pause after navigation (presumably to let the page render)

            # Read the page source once and reuse it for text and link extraction
            page_source = driver.page_source
            text_data.append((url, extract_clean_text_from_html(page_source)))

            # Follow links only while still within the requested depth
            if current_depth < depth:
                link_soup = BeautifulSoup(page_source, 'html.parser')
                for link in link_soup.find_all('a', href=True):
                    # Relative hrefs must be resolved against the page they
                    # were found on, not against the crawl's start URL.
                    new_url = urljoin(url, link['href'])
                    if new_url not in visited_urls and new_url.startswith(root):
                        queue.append((new_url, current_depth + 1))

        except Exception as e:
            # Best-effort crawl: report the failure and move on
            if VERBOSE:
                print(f"Error with URL {url}: {e}")

    return text_data

# Run scraping. NOTE: driver.quit() below is commented out, so the Chrome
# session is NOT closed when this cell finishes.
try:
    scraped_data = scrape_recursive(base_url)
finally:
    # Runs whether or not scraping raised; the quit call is disabled.
    #driver.quit()
    print("Finished!")

# Number of (url, text) pairs collected, i.e. pages scraped (not a character count)
print("Total text size: ", len(scraped_data))

for url, text in scraped_data:
    print(f"URL: {url}\nText: {text[:200]}...\n")  # Print a snippet of the text

In [49]:
import json
import requests
import numpy as np
import faiss
import asyncio
import aiohttp
from tqdm.notebook import tqdm

# Constants. The Ollama URLs and model name used below are NOT redefined here;
# they come from the scraping cell, which must have been run first.
VERBOSE = True

# Function to partition text efficiently
def partition_text(text, max_length):
    """Split ``text`` into partitions of roughly ``max_length`` words.

    The text is split into sentences on ``'. '`` and the sentences are packed
    greedily, in order, into partitions. Length is measured in
    whitespace-separated words. A partition is closed as soon as adding the
    next sentence would exceed ``max_length``; a single sentence that is
    longer than ``max_length`` on its own is kept whole.

    Args:
        text: The text to partition.
        max_length: Target maximum number of words per partition.

    Returns:
        A list of non-empty partition strings (sentences re-joined with
        ``'. '``); an empty list if ``text`` is empty or whitespace only.
    """
    if not text or not text.strip():
        return []

    sentences = text.split('. ')
    partitions = []
    current_part = []
    current_length = 0

    for sentence in tqdm(sentences, desc="Partitioning text"):
        sentence_length = len(sentence.split())

        # Flush only when something has been collected; otherwise an oversized
        # first sentence would emit an empty '' partition.
        if current_part and current_length + sentence_length > max_length:
            partitions.append('. '.join(current_part))
            current_part = []
            current_length = 0

        current_part.append(sentence)
        current_length += sentence_length

    # Emit whatever is left after the last sentence
    if current_part:
        partitions.append('. '.join(current_part))

    return partitions


# Asynchronous function to fetch model embedding shape
async def get_embedding_shape():
    """Ask Ollama for the embedding dimension of ``ollama_model_name``.

    Posts the model name to ``ollama_url_inf`` and reads the embedding length
    from the ``model_info`` section of the reply. The ``llama.embedding_length``
    key is preferred; if it is absent, any key ending in ``.embedding_length``
    is accepted so that models of other architectures also work.

    Returns:
        The embedding length reported by Ollama, or 0 if the request fails or
        no embedding length is present (an error is printed in both cases).
    """
    payload = {"model": ollama_model_name}
    headers = {"Content-Type": "application/json"}

    async with aiohttp.ClientSession() as session:
        async with session.post(ollama_url_inf, headers=headers, data=json.dumps(payload)) as response:
            if response.status != 200:
                print(f"ERROR: Error from Ollama: {response.status}")
                return 0
            result = await response.json()

    model_info = result.get('model_info') or {}
    dimension = model_info.get('llama.embedding_length')
    if dimension is None:
        # Other model families publish the value under their own
        # "<architecture>." prefix instead of "llama.".
        dimension = next(
            (value for key, value in model_info.items() if key.endswith('.embedding_length')),
            None,
        )

    if dimension is None:
        print("ERROR: Embedding length not found in model info.")
        return 0
    return dimension

# Asynchronous function to fetch embeddings for a batch of text partitions
async def get_embeddings(partitions, dimension=None):
    """Fetch one embedding per text partition from Ollama, in order.

    Requests are sent one after another to ``ollama_url_emb`` over a single
    HTTP session.

    Args:
        partitions: The strings to embed.
        dimension: Optional length of the zero-vector placeholder used for
            partitions whose request failed. When omitted it is inferred from
            the first successful embedding, falling back to 768 if every
            request failed.

    Returns:
        A list of 1-D float32 NumPy arrays, one per partition and in the same
        order. A partition whose request failed (an error is printed) or whose
        reply carried no embedding gets an all-zero vector, sized to match the
        successful embeddings so the results can be stacked into one matrix.
    """
    vectors = []  # None marks a failed/missing embedding until the size is known
    headers = {"Content-Type": "application/json"}

    async with aiohttp.ClientSession() as session:
        for partition in tqdm(partitions, desc="Fetching embeddings"):
            payload = {"model": ollama_model_name, "prompt": partition}
            async with session.post(ollama_url_emb, headers=headers, data=json.dumps(payload)) as response:
                if response.status != 200:
                    print(f"ERROR: Error from Ollama: {response.status}")
                    vectors.append(None)
                    continue
                result = await response.json()

            embedding = result.get('embedding')
            # A missing or empty embedding is treated like a failed request
            vectors.append(np.asarray(embedding, dtype=np.float32) if embedding else None)

    if dimension is None:
        # Size placeholders like the real embeddings; a fixed 768 would not
        # match models with a different embedding length.
        dimension = next((len(vec) for vec in vectors if vec is not None), 768)

    return [
        vec if vec is not None else np.zeros(dimension, dtype=np.float32)
        for vec in vectors
    ]

# Store embeddings in FAISS with optimized bulk addition
async def store_in_faiss(partitions):
    """Embed ``partitions`` and load the vectors into a flat L2 FAISS index.

    The index dimension is the one reported by ``get_embedding_shape``; if
    that lookup fails (returns 0) the length of the fetched embeddings is
    used instead. All vectors are added in a single call.

    Args:
        partitions: The text partitions to embed and index.

    Returns:
        ``(index, doc_ids)`` where ``doc_ids`` is
        ``[0, ..., len(partitions) - 1]``; vectors are added in partition
        order, so index row ``i`` belongs to ``partitions[i]``.
    """
    dimension = await get_embedding_shape()
    if VERBOSE:
        print(f"Model's embedding dimension: {dimension}")

    embeddings = await get_embeddings(partitions)

    matrix = None
    if embeddings:
        # FAISS operates on float32 matrices of shape (n_vectors, dimension)
        matrix = np.array(embeddings, dtype=np.float32)
        if not dimension:
            # Metadata lookup failed: fall back to the actual vector length
            dimension = matrix.shape[1]

    index = faiss.IndexFlatL2(dimension)
    if matrix is not None:
        index.add(matrix)  # Bulk add for efficiency
    return index, list(range(len(partitions)))

# Retrieve function with RAG applied to FAISS
async def retrieve_with_rag(query, faiss_index, partitions, k=5):
    """Answer ``query`` with the model, using the nearest partitions as context.

    The query is embedded with ``get_embeddings``, the ``k`` nearest vectors
    are looked up in ``faiss_index``, the matching partitions are joined into
    a context block, and the resulting prompt is posted to ``ollama_url_gen``
    with streaming disabled.

    Args:
        query: The question to answer.
        faiss_index: Index whose row ``i`` corresponds to ``partitions[i]``.
        partitions: The text partitions that were indexed.
        k: Number of nearest neighbours to retrieve.

    Returns:
        The decoded JSON body of the generate response.
    """
    query_embedding = (await get_embeddings([query]))[0]
    # FAISS expects a float32 matrix of shape (n_queries, dimension)
    query_matrix = np.array([query_embedding], dtype=np.float32)
    _distances, indices = faiss_index.search(query_matrix, k=k)

    # FAISS pads the result with -1 when fewer than k neighbours exist; a bare
    # `i < len(partitions)` check would let -1 select the LAST partition.
    retrieved_docs = [
        partitions[i] for i in indices[0] if 0 <= i < len(partitions)
    ]

    combined_docs = "\n".join(retrieved_docs)
    rag_prompt = f"Context:\n{combined_docs}\n\nQuery: {query}\nAnswer:"

    payload = {"model": ollama_model_name, "prompt": rag_prompt, "stream": False}
    headers = {"Content-Type": "application/json"}
    async with aiohttp.ClientSession() as session:
        async with session.post(ollama_url_gen, headers=headers, data=json.dumps(payload)) as response:
            return await response.json()

# Function to initiate retrieval with RAG
async def ask(query):
    """Run a RAG query against the module-level index and return the answer text.

    Uses the global ``faiss_index`` and ``partitions``. Returns the
    ``"response"`` field of the result, or ``"No response available"`` when
    that field is absent.
    """
    result = await retrieve_with_rag(query, faiss_index, partitions)
    answer = result.get("response", "No response available")
    return answer

# Assuming you have `scraped_data` initialized as a list of text data
all_text = scraped_data[0][1]
partitions = partition_text(all_text, max_length=512)

if VERBOSE:
    print(f'Total partition count: {len(partitions)}')

faiss_index, doc_ids = await store_in_faiss(partitions)

Partitioning text:   0%|          | 0/409 [00:00<?, ?it/s]

Total partition count: 93
Model's embedding dimension: 3072


Fetching embeddings:   0%|          | 0/93 [00:00<?, ?it/s]

CancelledError: 