In [2]:
import time
import json
import requests
import random
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium_stealth import stealth
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin
from tqdm.notebook import tqdm
from collections import deque

# Crawl scope: only URLs that start with `root` are followed; the crawl starts at `base_url`.
root = 'https://github.com'
base_url = 'https://github.com/hissain'

# Ollama HTTP endpoints on localhost and the model tag intended for generation.
# NOTE(review): only `ollama_url_gen` is referenced in the visible cells; the show/embeddings
# endpoints look unused here — confirm before removing them.
ollama_url_inf = "http://localhost:11434/api/show"
ollama_url_emb = "http://localhost:11434/api/embeddings"
ollama_url_gen = "http://localhost:11434/api/generate"
ollama_model_name = "llama3.2:latest"

# VERBOSE gates the per-URL error messages printed while scraping.
# DEPTH is the default maximum link depth followed from the start URL.
# MAX_LEN is the default cap on how many URLs a single crawl visits.
VERBOSE = True
DEPTH = 1
MAX_LEN = 2

# Build the Chrome session that the scraping helpers drive; flags are applied in this order.
chrome_options = Options()
for chrome_flag in ('--enable-javascript', '--headless', '--no-sandbox'):
    chrome_options.add_argument(chrome_flag)

service = Service()
driver = webdriver.Chrome(options=chrome_options, service=service)

def apply_stealth(driver):
    """Call selenium-stealth's ``stealth`` on ``driver`` with a fixed set of settings.

    Args:
        driver: The WebDriver instance to patch.

    Returns:
        None.
    """
    # Keep every setting in one mapping so the values are easy to scan and tweak.
    stealth_settings = {
        "languages": ["en-US", "en"],
        "vendor": "Google Inc.",
        "platform": "Win32",
        "webgl_vendor": "Intel Inc.",
        "renderer": "Intel Iris OpenGL Engine",
        "fix_hairline": True,
        "run_on_insecure_content": True,
        "fake_media_devices": True,
    }
    stealth(driver, **stealth_settings)

# Patch the shared driver once, right after it is created and before any page is loaded.
apply_stealth(driver)

def extract_clean_text_from_html(html_content):
    """Return the text of an HTML document as a single whitespace-normalised string.

    ``script``, ``style``, ``footer``, ``header`` and ``nav`` elements are removed
    before the text is extracted.

    Args:
        html_content: HTML markup to parse.

    Returns:
        str: The remaining text with every run of whitespace collapsed to one space.
    """
    document = BeautifulSoup(html_content, 'html.parser')

    # Drop elements whose text should not end up in the extracted content.
    unwanted_tags = ["script", "style", "footer", "header", "nav"]
    for element in document.find_all(unwanted_tags):
        element.decompose()

    raw_text = document.get_text(separator=' ', strip=True)
    # Collapse newlines, tabs and repeated spaces into single spaces.
    return ' '.join(raw_text.split())

def scrape_recursive(base_url, depth=DEPTH, maxLen=MAX_LEN):
    """Breadth-first crawl from ``base_url``, collecting the clean text of each page.

    Uses the module-level ``driver`` to load pages and only follows URLs that start
    with the module-level ``root`` and contain no ``#`` fragment.

    Args:
        base_url: URL where the crawl starts (depth 0).
        depth: Maximum link depth to follow; links are only expanded on pages
            shallower than this.
        maxLen: Maximum number of URLs to visit. A URL counts as visited as soon as
            it is attempted, even if loading it fails.

    Returns:
        list[tuple[str, str]]: ``(url, page_text)`` pairs in visiting order, for
        pages that loaded successfully.
    """
    visited_urls = set()
    text_data = []
    queue = deque([(base_url, 0)])  # (url, current_depth)

    while queue and len(visited_urls) < maxLen:
        url, current_depth = queue.popleft()
        if url in visited_urls or current_depth > depth:
            continue
        if not url.startswith(root) or '#' in url:
            continue
        print("Processing: ", url)

        # Mark before loading so a failing URL is never retried.
        visited_urls.add(url)

        try:
            driver.get(url)
            time.sleep(1)  # brief pause after navigation before reading the DOM

            # Read the DOM once so text extraction and link discovery see the same snapshot.
            page_source = driver.page_source
            text_data.append((url, extract_clean_text_from_html(page_source)))

            if current_depth < depth:
                soup = BeautifulSoup(page_source, 'html.parser')
                for link in soup.find_all('a', href=True):
                    # Resolve relative hrefs against the page they were found on,
                    # not against the crawl's start URL.
                    new_url = urljoin(url, link['href'])
                    if '#' in new_url or not new_url.startswith(root):
                        continue  # would be rejected when dequeued; don't bloat the queue
                    if new_url not in visited_urls:
                        queue.append((new_url, current_depth + 1))

        except Exception as e:
            if VERBOSE:
                print(f"Error with URL {url}: {e}")

    return text_data

try:
    scraped_data = scrape_recursive(base_url)
finally:
    # Always release the browser: this cell creates a new driver each time it runs,
    # so leaving the previous one open leaks a Chrome process per execution.
    driver.quit()
    print("Finished!")

#scraped_data = []

# Summarise the crawl and merge every page's text into one string for partitioning.
print("Total url pased: ", len(scraped_data))
all_text = '\n'.join([s for _, s in scraped_data])
print(len(all_text))

#for url, text in scraped_data:
#    print(f"URL: {url}\ncontent size: {len(text)}\nText: {text[:200]}...\n")

Processing:  https://github.com/hissain
Processing:  https://github.com/
Finished!
Total url pased:  2
24649


In [9]:
import numpy as np
from qdrant_client import QdrantClient, models
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer

# Qdrant endpoint and the collection that holds the scraped text partitions.
qdrant_url = "http://localhost:6333"
collection_name = "github_collection"
# Gates optional diagnostic printing; the scraping cell sets the same flag.
VERBOSE = True

# Shared Qdrant client and sentence-embedding model used by the helpers below.
client = QdrantClient(url=qdrant_url)
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def partition_text(text, max_length):
    """Group the sentences of ``text`` into partitions of roughly ``max_length`` words.

    The text is split on ``'. '``; consecutive sentences are accumulated until adding
    the next one would exceed ``max_length`` whitespace-separated words, at which
    point the accumulated sentences are joined back with ``'. '`` and emitted.
    A single sentence longer than ``max_length`` is kept whole as its own partition.

    Args:
        text: The text to partition.
        max_length: Word budget per partition.

    Returns:
        list[str]: Non-empty partitions in original order; ``[]`` for empty or
        whitespace-only input.
    """
    if not text or not text.strip():
        return []

    sentences = text.split('. ')
    partitions, current_part, current_length = [], [], 0

    for sentence in tqdm(sentences, desc="Partitioning text"):
        sentence_length = len(sentence.split())
        # Only flush when something has been collected; otherwise an oversize first
        # sentence would emit an empty partition.
        if current_part and current_length + sentence_length > max_length:
            partitions.append('. '.join(current_part))
            current_part, current_length = [], 0
        current_part.append(sentence)
        current_length += sentence_length

    if current_part:
        partitions.append('. '.join(current_part))
    return partitions

def get_embedding(text):
    """Encode ``text`` with the shared ``embedding_model`` and return the result.

    Args:
        text: Input passed straight to ``embedding_model.encode``.

    Returns:
        Whatever ``embedding_model.encode`` returns for ``text``.
    """
    vector = embedding_model.encode(text)
    return vector

def create_collection(dimension):
    """Recreate the Qdrant collection ``collection_name`` with cosine-distance vectors.

    Any existing collection of that name is deleted first on a best-effort basis:
    a failed delete does not stop the create call, but it is reported when
    ``VERBOSE`` is set instead of being silently discarded.

    Args:
        dimension: Size of the vectors the collection will store.

    Raises:
        Whatever ``client.create_collection`` raises; only delete errors are caught.
    """
    try:
        client.delete_collection(collection_name=collection_name)
    except Exception as e:
        # Best effort: still try to create the collection, but surface the reason.
        if VERBOSE:
            print(f"Could not delete collection {collection_name}: {e}")

    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(size=dimension, distance=models.Distance.COSINE),
    )
    
def upsert_points(embeddings, partitions):
    """Upsert one point per (embedding, partition) pair into ``collection_name``.

    Point ids are the zero-based positions of the pairs; each payload stores the
    partition under the ``"text"`` key.

    Args:
        embeddings: Vectors exposing ``.tolist()``, aligned with ``partitions``.
        partitions: Text chunks matching ``embeddings`` position by position.
    """
    points = []
    for point_id, (vector, chunk) in enumerate(zip(embeddings, partitions)):
        point = models.PointStruct(
            id=point_id,
            vector=vector.tolist(),
            payload={"text": chunk},
        )
        points.append(point)

    client.upsert(collection_name=collection_name, points=points)

def store_in_qdrant(partitions):
    """Embed every partition and store the vectors in a freshly created collection.

    Embeddings are computed before the collection is recreated, so a failure while
    embedding leaves any existing collection untouched. The collection is sized
    from the vectors actually produced rather than from a hard-coded constant.

    Args:
        partitions: Text chunks to embed and index.
    """
    default_dimension = 384  # Dimension for 'all-MiniLM-L6-v2'; used only when nothing was embedded
    embeddings = [get_embedding(partition) for partition in tqdm(partitions, desc="Generating embeddings")]
    print("Embeddings len:", len(embeddings))

    # Size the collection from the real vectors so a different embedding model
    # cannot produce a dimension mismatch at upsert time.
    dimension = len(embeddings[0]) if embeddings else default_dimension
    create_collection(dimension)
    upsert_points(embeddings, partitions)

def search_points(query_embedding, k=5):
    """Return the stored texts of the ``k`` nearest points to ``query_embedding``.

    Args:
        query_embedding: Query vector exposing ``.tolist()``.
        k: Maximum number of hits requested from ``collection_name``.

    Returns:
        list[str]: The ``"text"`` payload of each hit, in the order returned by the client.
    """
    hits = client.search(
        collection_name=collection_name,
        query_vector=query_embedding.tolist(),
        limit=k,
        with_payload=True,
    )

    texts = []
    for hit in hits:
        texts.append(hit.payload["text"])
    return texts

def ask(query, k=2, p=False):
    """Answer ``query`` with retrieved context and print the model's reply.

    The query is embedded, the ``k`` closest stored partitions are fetched, and a
    prompt containing an instruction, that context and the query is posted to
    ``ollama_url_gen`` for the configured ``ollama_model_name``.

    Args:
        query: The question to ask.
        k: Number of partitions to retrieve as context.
        p: When true, print the full prompt before sending it.

    Returns:
        None. The ``"response"`` field of the reply is printed, or
        ``"No response available"`` when that field is absent.
    """
    query_embedding = get_embedding(query)
    retrieved_docs = search_points(query_embedding, k)
    combined_docs = "\n".join(retrieved_docs)

    inst = "Instruction: If you do not find the answer in the context, just say you don't know."
    rag_prompt = f"{inst}\n\nContext:\n{combined_docs}\n\nQuery: {query}\nAnswer:"
    if p:
        print(rag_prompt)
    # Use the configured model tag instead of a second hard-coded copy, so changing
    # `ollama_model_name` actually changes the model that answers.
    payload = {"model": ollama_model_name, "prompt": rag_prompt, "stream": False}
    headers = {"Content-Type": "application/json"}
    response = requests.post(ollama_url_gen, headers=headers, data=json.dumps(payload))
    print(response.json().get("response", "No response available"))

# Split the scraped text into partitions using a 512-word budget, then index them.
# NOTE(review): all-MiniLM-L6-v2 is documented to truncate inputs beyond 256 word pieces —
# confirm 512-word partitions are intended, as text past the limit would not affect the embedding.
partitions = partition_text(all_text, max_length=512)
print(f'Total partition count: {len(partitions)}')
# Report storage failures instead of raising so the rest of the notebook can still run.
try:
    store_in_qdrant(partitions)
except Exception as e:
    print(f"Error storing in Qdrant: {e}")

Partitioning text:   0%|          | 0/466 [00:00<?, ?it/s]

Total partition count: 8


Generating embeddings:   0%|          | 0/8 [00:00<?, ?it/s]

Embeddings len: 8


In [5]:
# Ask about experience, using the 2 best-matching partitions as context (k=2).
# NOTE(review): this cell's execution count (5) precedes the cell that defines `ask` (9), so the
# recorded output likely came from an earlier definition — re-run top to bottom to refresh it.
ask("What is Hissains experience?", 2)

'He has over 13 years of experience in mobile and wearable software development, with expertise in OOP, Android (Java, Kotlin), iOS (Swift, Objective-C), Xcode, Version Control Systems, System Design, Application Architecture, Development Processes, Wearable & Hearable Technology.'

In [10]:
# Ask about special interests, using the 2 best-matching partitions as context (k=2).
ask("Whats Hissains special interest?", 2)

He is interested in several things, including:

1. Technological Innovation
2. Human-Machine Interaction
3. Information Theory
4. Astronomy
5. Probability
6. Theory of Relativity
7. Philosophy of Science

And also he enjoys music and playing piano and guitar as a special interest
