In [None]:
import time
import faiss
import json
import requests
import random
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium_stealth import stealth
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin
from tqdm.notebook import tqdm
from collections import deque

# --- Crawl scope ------------------------------------------------------------
# `root` is the prefix a URL must start with to be crawled; `base_url` is the
# page the crawl starts from.
root = 'https://github.com'
base_url = 'https://github.com/hissain'

# --- Local Ollama endpoints and model ----------------------------------------
ollama_url_inf = "http://localhost:11434/api/show"        # queried for model_info
ollama_url_emb = "http://localhost:11434/api/embeddings"  # queried for embeddings
ollama_url_gen = "http://localhost:11434/api/generate"    # queried for completions
ollama_model_name = "llama3.2:latest"

# --- Notebook-wide switches ---------------------------------------------------
# VERBOSE gates diagnostic prints; DEPTH and MAX_LEN are the crawler's default
# link depth and default cap on the number of visited URLs.
VERBOSE = True
DEPTH, MAX_LEN = 1, 2

# --- Browser identity ---------------------------------------------------------
# Candidate user-agent strings; one is picked at random when Chrome is configured.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
]

# Assemble the Chrome command-line switches in one place. The user agent is
# drawn at random from USER_AGENTS once, at the moment this cell runs.
chrome_options = Options()
for _chrome_flag in (
    '--enable-javascript',
    '--headless',
    '--disable-gpu',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    f"user-agent={random.choice(USER_AGENTS)}",
):
    chrome_options.add_argument(_chrome_flag)
del _chrome_flag  # keep the loop variable out of the notebook namespace

# Start the browser session shared by the rest of the notebook.
service = Service()
driver = webdriver.Chrome(options=chrome_options, service=service)

def apply_stealth(driver):
    """Run selenium-stealth's ``stealth`` on ``driver`` with this notebook's fixed options.

    Args:
        driver: The Selenium WebDriver instance to patch.
    """
    stealth_settings = {
        "languages": ["en-US", "en"],
        "vendor": "Google Inc.",
        "platform": "Win32",
        "webgl_vendor": "Intel Inc.",
        "renderer": "Intel Iris OpenGL Engine",
        "fix_hairline": True,
        "run_on_insecure_content": True,
        "fake_media_devices": True,
    }
    stealth(driver, **stealth_settings)

# Patch the freshly created browser before any page is requested.
apply_stealth(driver)

def extract_clean_text_from_html(html_content):
    """Return the text of an HTML document as a single whitespace-normalised string.

    Args:
        html_content: HTML markup to parse.

    Returns:
        str: The document text with ``script``, ``style``, ``footer``,
        ``header`` and ``nav`` elements removed and every run of whitespace
        collapsed to one space.
    """
    document = BeautifulSoup(html_content, 'html.parser')

    # Remove elements whose content should not end up in the extracted text.
    unwanted_tags = ["script", "style", "footer", "header", "nav"]
    for element in document.find_all(unwanted_tags):
        element.decompose()

    raw_text = document.get_text(separator=' ', strip=True)
    # split() with no argument breaks on any whitespace, so re-joining with a
    # single space flattens newlines, tabs and repeated blanks.
    return ' '.join(raw_text.split())

def scrape_recursive(base_url, depth=DEPTH, maxLen=MAX_LEN):
    """Crawl breadth-first from ``base_url`` and collect the cleaned text of each page.

    Only URLs located under the module-level ``root`` are visited, URLs that
    contain a ``#`` are skipped, and every page is loaded through the
    module-level Selenium ``driver``.

    Args:
        base_url: URL where the crawl starts (depth 0).
        depth: Largest link distance from ``base_url`` that is still visited.
        maxLen: Upper bound on the number of URLs visited; a URL whose load
            fails still counts towards this bound.

    Returns:
        list[tuple[str, str]]: ``(url, page_text)`` pairs in visiting order,
        one for every page that was loaded and parsed without an exception.
    """
    scope_prefix = root.rstrip('/')

    def _in_scope(candidate):
        # A bare prefix test would also accept look-alike hosts such as
        # "<root>munity/..."; require the prefix to end at a URL boundary.
        if not candidate.startswith(scope_prefix):
            return False
        boundary = candidate[len(scope_prefix):len(scope_prefix) + 1]
        return boundary in ('', '/', '?', '#')

    visited_urls = set()
    enqueued_urls = {base_url}  # everything ever queued, so duplicates are never re-queued
    text_data = []
    queue = deque([(base_url, 0)])  # (url, current_depth)

    while queue:
        # Once the page budget is spent nothing further can be visited, so
        # stop instead of popping and discarding the rest of the queue.
        if len(visited_urls) >= maxLen:
            break

        url, current_depth = queue.popleft()
        if url in visited_urls or current_depth > depth:
            continue
        if not _in_scope(url) or '#' in url:
            continue
        print("Processing: ", url)

        visited_urls.add(url)

        try:
            apply_stealth(driver)
            driver.get(url)
            time.sleep(1)  # fixed pause after navigation (presumably lets the page finish rendering)

            # Read the source once so the text and the links come from the
            # same snapshot of the page.
            page_source = driver.page_source
            text_data.append((url, extract_clean_text_from_html(page_source)))

            if current_depth < depth:
                soup = BeautifulSoup(page_source, 'html.parser')
                for link in soup.find_all('a', href=True):
                    # Resolve against the page the link was found on, so
                    # relative hrefs on deeper pages point to the right place.
                    new_url = urljoin(url, link['href'])
                    if new_url in enqueued_urls or '#' in new_url:
                        continue
                    if not _in_scope(new_url):
                        continue
                    enqueued_urls.add(new_url)
                    queue.append((new_url, current_depth + 1))

        except Exception as e:
            # Best effort: one failing page must not abort the whole crawl.
            if VERBOSE:
                print(f"Error with URL {url}: {e}")

    return text_data

# Run the crawl. The browser is created earlier in this same cell, so it is
# shut down here unconditionally; otherwise every run of the cell would leave
# a headless Chrome process behind.
try:
    scraped_data = scrape_recursive(base_url)
finally:
    driver.quit()
    print("Finished!")

print("Total url pased: ", len(scraped_data))

# Show a short preview (first 200 characters) of every scraped page.
for url, text in scraped_data:
    print(f"URL: {url}\ncontent size: {len(text)}\nText: {text[:200]}...\n")

In [None]:
# Concatenate the text of every scraped page, one page per line, and report
# the size of the resulting corpus in characters.
all_text = '\n'.join(page_text for _page_url, page_text in scraped_data)
print(len(all_text))

In [None]:
import time
import json
import requests
import aiohttp
import random
import numpy as np
from tqdm.notebook import tqdm
from collections import deque
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, Filter
from urllib.parse import urljoin

# Collection that stores the embedded text partitions.
collection_name = "rag_collection"
# Client for the Qdrant instance addressed at localhost:6333.
qdrant_client = QdrantClient("http://localhost:6333")

def partition_text(text, max_length):
    """Split ``text`` into chunks of whole sentences of at most ``max_length`` words.

    Sentences are delimited by the literal ``'. '`` and words are
    whitespace-separated tokens. Sentences are packed greedily, in order, into
    the current chunk; a new chunk is started when adding the next sentence
    would exceed ``max_length``. A single sentence that is longer than
    ``max_length`` is kept intact as a chunk of its own.

    Args:
        text: The text to partition.
        max_length: Maximum number of words per chunk.

    Returns:
        list[str]: The chunks in document order; empty when ``text`` is empty
        or contains only whitespace.
    """
    if not text or not text.strip():
        return []

    sentences = text.split('. ')
    partitions = []
    current_part = []
    current_length = 0

    for sentence in tqdm(sentences, desc="Partitioning text"):
        sentence_length = len(sentence.split())

        # Flush only a chunk that already holds words; otherwise an oversized
        # first sentence would emit an empty partition ahead of itself.
        if current_length > 0 and current_length + sentence_length > max_length:
            partitions.append('. '.join(current_part))
            current_part = []
            current_length = 0

        current_part.append(sentence)
        current_length += sentence_length

    if current_part:
        partitions.append('. '.join(current_part))

    return partitions

async def get_embedding_shape():
    """Ask Ollama for the embedding length of ``ollama_model_name``.

    Posts the model name to ``ollama_url_inf`` and reads the ``model_info``
    mapping of the reply. The ``"llama.embedding_length"`` entry is used when
    present; otherwise the first entry whose key ends in
    ``".embedding_length"`` is used, so a model whose metadata carries a
    different prefix does not raise ``KeyError``.

    Returns:
        The embedding length reported by Ollama, or ``0`` when the request
        fails or no embedding-length entry is found.
    """
    async with aiohttp.ClientSession() as session:
        payload = {"model": ollama_model_name}
        headers = {"Content-Type": "application/json"}
        async with session.post(ollama_url_inf, headers=headers, data=json.dumps(payload)) as response:
            if response.status != 200:
                print(f"ERROR: Error from Ollama: {response.status}")
                return 0
            result = await response.json()

    model_info = result.get('model_info') or {}
    if "llama.embedding_length" in model_info:
        return model_info["llama.embedding_length"]

    for key, value in model_info.items():
        if key.endswith(".embedding_length"):
            return value

    # Same sentinel as the HTTP-error path above.
    print("ERROR: Error from Ollama: no embedding_length entry in model_info")
    return 0

async def get_embeddings(partitions):
    """Fetch one embedding per text in ``partitions`` from Ollama, sequentially.

    Each text is posted to ``ollama_url_emb``. When a request fails or the
    reply carries no embedding, a zero vector is returned in that position so
    the result stays aligned with ``partitions``. The zero vector has the same
    length as the first embedding that was fetched successfully, or 768 when
    none was, so placeholders no longer differ in size from the real vectors.

    Args:
        partitions: Iterable of strings to embed.

    Returns:
        list[numpy.ndarray]: One vector per input text, in input order.
    """
    fallback_dimension = 768  # used only when not a single request succeeds

    vectors = []  # None marks a position that needs a placeholder
    async with aiohttp.ClientSession() as session:
        for partition in tqdm(partitions, desc="Fetching embeddings"):
            payload = {"model": ollama_model_name, "prompt": partition}
            headers = {"Content-Type": "application/json"}
            async with session.post(ollama_url_emb, headers=headers, data=json.dumps(payload)) as response:
                if response.status == 200:
                    result = await response.json()
                    vector = result.get('embedding')
                    vectors.append(np.array(vector) if vector else None)
                else:
                    print(f"ERROR: Error from Ollama: {response.status}")
                    vectors.append(None)

    dimension = next((len(v) for v in vectors if v is not None), fallback_dimension)
    return [v if v is not None else np.zeros(dimension) for v in vectors]

async def store_in_qdrant(partitions):
    """Embed ``partitions`` and store them in a freshly recreated Qdrant collection.

    The collection ``collection_name`` is recreated with the embedding
    dimension reported by ``get_embedding_shape`` and cosine distance. Each
    stored point uses the partition's index as its id and keeps the partition
    text under the ``"text"`` payload key.

    Partitions whose embedding is all zeros or has a length other than the
    collection dimension are skipped with a message instead of being
    upserted; these are the placeholders produced for failed requests.

    Args:
        partitions: List of text chunks to index.

    Raises:
        ValueError: If the embedding dimension could not be determined.
    """
    dimension = await get_embedding_shape()
    if VERBOSE:
        print(f"Model's embedding dimension: {dimension}")

    # get_embedding_shape() returns 0 on failure; a collection cannot be
    # created with that size, so fail here with a clear message.
    if not dimension or dimension <= 0:
        raise ValueError("Could not determine the embedding dimension from Ollama")

    qdrant_client.recreate_collection(
        collection_name=collection_name,
        vectors_config={"size": dimension, "distance": "Cosine"},
    )

    embeddings = await get_embeddings(partitions)

    points = []
    for i, (embedding, partition) in enumerate(zip(embeddings, partitions)):
        if len(embedding) != dimension or not np.any(embedding):
            print(f"ERROR: Skipping partition {i}: no usable embedding")
            continue
        points.append(PointStruct(id=i, vector=embedding.tolist(), payload={"text": partition}))

    # Nothing to send when there are no partitions or every embedding failed.
    if points:
        qdrant_client.upsert(collection_name=collection_name, points=points)

async def retrieve_with_rag(query, partitions, k=5):
    """Answer ``query`` with context retrieved from the Qdrant collection.

    The query is embedded, the ``k`` nearest points are fetched from
    ``collection_name``, their ``"text"`` payloads are joined into a context
    block, and the resulting prompt is sent to ``ollama_url_gen`` with
    streaming disabled.

    Args:
        query: The user's question.
        partitions: Not used by this function; kept so existing callers keep
            working.
        k: Number of nearest points to retrieve.

    Returns:
        dict: The decoded JSON reply of the generate endpoint, or an empty
        dict when Ollama answers with a non-200 status.
    """
    query_embedding = (await get_embeddings([query]))[0]
    search_result = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_embedding.tolist(),
        limit=k,
        with_payload=True
    )

    retrieved_docs = [hit.payload["text"] for hit in search_result]
    combined_docs = "\n".join(retrieved_docs)

    inst = "Instruction: If you do not find the answer in the context, just say you don't know."
    rag_prompt = f"{inst}\n\nContext:\n{combined_docs}\n\nQuery: {query}\nAnswer:"

    async with aiohttp.ClientSession() as session:
        payload = {"model": ollama_model_name, "prompt": rag_prompt, "stream": False}
        headers = {"Content-Type": "application/json"}
        async with session.post(ollama_url_gen, headers=headers, data=json.dumps(payload)) as response:
            # Report failures the same way the other Ollama helpers do,
            # instead of trying to decode an error body as a normal reply.
            if response.status != 200:
                print(f"ERROR: Error from Ollama: {response.status}")
                return {}
            return await response.json()

async def ask(query):
    """Run ``query`` through ``retrieve_with_rag`` and return the generated answer text.

    Reads the module-level ``partitions`` and passes it along.

    Args:
        query: The question to answer.

    Returns:
        The ``"response"`` entry of the generation result, or
        ``"No response available"`` when that entry is absent.
    """
    generation = await retrieve_with_rag(query, partitions)
    if "response" in generation:
        return generation["response"]
    return "No response available"

# Split the scraped corpus into chunks of at most 512 words each.
partitions = partition_text(all_text, max_length=512)

if VERBOSE:
    print(f'Total partition count: {len(partitions)}')

# Embed every chunk and (re)build the Qdrant collection from them.
# Top-level await relies on the notebook's running event loop.
await store_in_qdrant(partitions)

In [None]:
# Optional setup: uncomment to install the Qdrant client into this kernel's environment.
# %pip install qdrant_client