In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
#from transformers import GPT2TokenizerFast
from transformers import AutoTokenizer
import re
from tqdm.notebook import tqdm
import time

#tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

import os

# Tokenizer-related environment switch; presumably disables the Hugging Face
# tokenizers' internal parallelism -- TODO confirm against the library docs.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Local directory holding the model files; the same path is reused later in the
# notebook to build the SentenceTransformer embedding model.
model_path = '/Users/hissain/git/github/models/all-MiniLM-L6-v2'

# Tokenizer used by count_tokens() to measure sentence length.
tokenizer = AutoTokenizer.from_pretrained(model_path)

def init_driver():
    """Create a Chrome WebDriver configured for headless use.

    Returns:
        A `webdriver.Chrome` instance built from a default `Service` and
        options carrying the `--headless`, `--no-sandbox` and
        `--disable-dev-shm-usage` flags.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(service=Service(), options=chrome_options)

def extract_text_from_url(url, driver):
    """Navigate `driver` to `url` and return the page text.

    Args:
        url: Address to load.
        driver: WebDriver exposing `get()` and `page_source`.

    Returns:
        The text of the parsed page with fragments joined by single spaces,
        after removing `div.js-yearly-contributions`, `<script>` and `<style>`
        elements.
    """
    driver.get(url)
    # Fixed one-second pause before reading the page source; presumably gives
    # the page time to finish rendering -- TODO confirm this is sufficient.
    time.sleep(1)

    page = BeautifulSoup(driver.page_source, "html.parser")

    # Remove the yearly-contributions container first, then script/style tags.
    for unwanted in page.find_all("div", class_="js-yearly-contributions"):
        unwanted.decompose()
    for unwanted in page.find_all(["script", "style"]):
        unwanted.decompose()

    return page.get_text(separator=" ")

def clean_text(text):
    """Normalize raw page text for sentence splitting.

    Whitespace runs are collapsed to one space, every character that is not a
    word character, whitespace or one of ``. , ! ? ' " ( ) -`` is removed, and
    the result is lowercased and stripped of leading/trailing whitespace.

    Args:
        text: Raw text.

    Returns:
        The normalized string.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    filtered = re.sub(r'[^\w\s.,!?\'"()-]', '', collapsed)
    return filtered.lower().strip()

def split_sentences(text):
    """Split `text` into sentences.

    A split happens at every run of one or more spaces that directly follows
    `.`, `!` or `?`; the punctuation stays attached to the preceding sentence.

    Args:
        text: Text to split.

    Returns:
        List of sentence strings (a single-element list when no boundary is
        found, including for the empty string).
    """
    boundary = re.compile(r'(?<=[.!?]) +')
    return boundary.split(text)

def count_tokens(text):
    """Return how many ids the module-level `tokenizer` produces for `text`.

    NOTE(review): `encode` is called with its defaults, so any special tokens
    the tokenizer adds are presumably included in the count -- confirm.
    """
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def partition_sentences(sentences, url, max_tokens=512, overlap=1, token_counter=None):
    """Group consecutive sentences into chunks bounded by a token budget.

    Sentences are accumulated until adding the next one would push the running
    token count above `max_tokens`; the accumulated chunk is then emitted and
    the next chunk is seeded with the last `overlap` sentences of the previous
    one.

    Args:
        sentences: Iterable of sentence strings. Blank sentences are skipped.
        url: Source URL attached to every chunk.
        max_tokens: Token budget that triggers a new chunk. A single sentence
            longer than the budget is still emitted (as its own chunk or after
            the overlap), so chunks can exceed this value.
        overlap: Number of trailing sentences carried into the next chunk.
            Values <= 0 disable overlap.
        token_counter: Optional callable mapping a string to its token count.
            Defaults to the module-level `count_tokens`.

    Returns:
        List of ``{"text": str, "url": url}`` dicts, in input order. Never
        contains a chunk with empty text.
    """
    if token_counter is None:
        # Resolved at call time so the default keeps using the notebook tokenizer.
        token_counter = count_tokens

    chunks, current_chunk = [], []
    current_tokens = 0

    for sentence in sentences:
        if not sentence.strip():
            # Nothing to embed; an empty page would otherwise yield an empty chunk.
            continue

        sentence_tokens = token_counter(sentence)

        # Only flush when there is something to flush: an oversized first
        # sentence must not produce an empty chunk.
        if current_chunk and current_tokens + sentence_tokens > max_tokens:
            chunks.append({"text": " ".join(current_chunk), "url": url})
            # `lst[-0:]` is the whole list, so overlap == 0 needs its own branch.
            current_chunk = current_chunk[-overlap:] if overlap > 0 else []
            current_tokens = token_counter(" ".join(current_chunk)) if current_chunk else 0

        current_chunk.append(sentence)
        current_tokens += sentence_tokens

    if current_chunk:
        chunks.append({"text": " ".join(current_chunk), "url": url})

    return chunks

def process_urls(urls, max_tokens=512, overlap=1):
    """Scrape each URL and return the combined list of text chunks.

    For every URL the page text is extracted, cleaned, split into sentences and
    partitioned into chunks. A failure on one URL is reported with `print` and
    does not stop the remaining URLs.

    Args:
        urls: Iterable of URL strings.
        max_tokens: Token budget forwarded to `partition_sentences`.
        overlap: Sentence overlap forwarded to `partition_sentences`.

    Returns:
        List of ``{"text", "url"}`` chunk dicts from all URLs that succeeded.
    """
    driver = init_driver()
    all_chunks = []

    try:
        for url in tqdm(urls, desc="Processing URLs"):
            try:
                raw_text = extract_text_from_url(url, driver)
                clean_text_content = clean_text(raw_text)
                sentences = split_sentences(clean_text_content)
                chunks = partition_sentences(sentences, url, max_tokens=max_tokens, overlap=overlap)
                all_chunks.extend(chunks)
            except Exception as e:
                # Best-effort: keep going so one bad page does not lose the rest.
                print(f"Failed to process {url}: {e}")
    finally:
        # The driver is local to this call, so it must be shut down here;
        # otherwise every run leaves a headless Chrome process behind.
        driver.quit()

    return all_chunks

# Pages to scrape and index.
urls = [
    "https://github.com/hissain",
    "https://github.com/hissain/CoronaTracker",
]

# Scrape every page and collect the resulting chunks.
rag_chunks = process_urls(urls)

# Print the first two chunks as a quick sanity check.
preview_chunks = rag_chunks[:2]
for i, chunk in enumerate(preview_chunks):
    print(f"Chunk {i+1}:\nText: {chunk['text']}\nURL: {chunk['url']}\n")

In [None]:
import numpy as np
from qdrant_client import QdrantClient, models
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
import requests
import aiohttp
import json
import asyncio

# Qdrant connection settings: server URL and the collection name used by the
# collection/upsert/search helpers in this cell.
qdrant_url = "http://localhost:6333"
collection_name = "github_collection"

# Ollama HTTP endpoints on the local server, and the model name sent in the
# generation request payload.
ollama_url_inf = "http://localhost:11434/api/show"
ollama_url_emb = "http://localhost:11434/api/embeddings"
ollama_url_gen = "http://localhost:11434/api/generate"
ollama_model_name = "llama3.2:latest"

# Shared objects used by the helper functions in this cell.
client = QdrantClient(url=qdrant_url)
# model_path is assigned in the first cell (same directory the tokenizer is loaded from).
embedding_model = SentenceTransformer(model_path)

def get_embedding(text):
    """Encode `text` with the module-level `embedding_model`.

    Returns:
        Whatever `embedding_model.encode` returns for `text`; callers in this
        notebook call `.tolist()` on it.
    """
    vector = embedding_model.encode(text)
    return vector

def create_collection(dimension):
    """Recreate the Qdrant collection named by `collection_name`.

    Any existing collection is deleted first (errors from the delete are
    ignored), then a new one is created with cosine distance.

    Args:
        dimension: Size of the vectors the collection will store.
    """
    # Best-effort drop: whatever the delete raises is deliberately swallowed.
    try:
        client.delete_collection(collection_name=collection_name)
    except Exception:
        pass

    vector_settings = models.VectorParams(size=dimension, distance=models.Distance.COSINE)
    client.create_collection(collection_name=collection_name, vectors_config=vector_settings)
    
def upsert_points_with_metadata(embeddings, chunks):
    """Upsert one point per (embedding, chunk) pair into the collection.

    Point ids are the zero-based positions of the pairs; each payload carries
    the chunk's "text" and "url". Pairing uses `zip`, so extra items in the
    longer of the two sequences are ignored.

    Args:
        embeddings: Sequence of vectors exposing `.tolist()`.
        chunks: Sequence of dicts with "text" and "url" keys.
    """
    points = []
    for point_id, (vector, chunk) in enumerate(zip(embeddings, chunks)):
        payload = {"text": chunk["text"], "url": chunk["url"]}
        points.append(models.PointStruct(id=point_id, vector=vector.tolist(), payload=payload))

    client.upsert(collection_name=collection_name, points=points)

def store_in_qdrant_with_metadata(chunks):
    """Embed every chunk and store the vectors with their metadata in Qdrant.

    Embeddings are computed before the collection is recreated, so a failure
    while embedding leaves the existing collection untouched. The vector size
    is taken from the first embedding rather than hard-coded, so it follows
    whatever model `get_embedding` uses.

    Args:
        chunks: Sequence of dicts with "text" and "url" keys.
    """
    embeddings = [get_embedding(chunk["text"]) for chunk in tqdm(chunks, desc="Generating embeddings")]

    # Fall back to 384 (the size previously hard-coded for 'all-MiniLM-L6-v2')
    # only when there is nothing to measure.
    dimension = len(embeddings[0]) if embeddings else 384
    create_collection(dimension)

    # NOTE(review): skip the call when there are no points; an empty upsert is
    # at best a no-op and may be rejected by the server -- confirm.
    if embeddings:
        upsert_points_with_metadata(embeddings, chunks)

def search_points_with_metadata(query_embedding, k=3):
    """Return the text and URL of the `k` best matches for a query vector.

    Args:
        query_embedding: Query vector exposing `.tolist()`.
        k: Maximum number of hits requested from the collection.

    Returns:
        List of ``{"text", "url"}`` dicts built from each hit's payload, in
        the order returned by the search.
    """
    hits = client.search(
        collection_name=collection_name,
        query_vector=query_embedding.tolist(),
        limit=k,
        with_payload=True,
    )

    results = []
    for hit in hits:
        stored = hit.payload
        results.append({"text": stored["text"], "url": stored["url"]})
    return results

async def ask(prompt, stream=True, k=3, p=False):
    """Answer `prompt` with retrieval-augmented generation and render it as Markdown.

    The prompt is embedded, the `k` closest chunks are fetched from Qdrant and
    placed in a context block, and the combined prompt is sent to the Ollama
    generate endpoint (`ollama_url_gen`). This is a coroutine: it must be
    awaited (or scheduled on an event loop) to run.

    Args:
        prompt: The user's question.
        stream: When True, request a streamed response and re-render the cell
            output as text arrives; when False, wait for the full response.
        k: Number of context chunks to retrieve.
        p: When True, print the full prompt sent to the model (debug aid).

    Returns:
        The generated answer text.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP error status.
    """
    # Imported here so the function works on a fresh kernel without relying on
    # these names having been imported by some other cell.
    from IPython.display import Markdown, clear_output, display

    query_embedding = get_embedding(prompt)
    retrieved_docs = search_points_with_metadata(query_embedding, k)

    combined_docs = "\n\n".join(f"Source: {doc['url']}\n{doc['text']}" for doc in retrieved_docs)
    inst = "Instruction: If you do not find the answer in the context, just say you don't know."
    rag_prompt = f"{inst}\n\nContext:\n{combined_docs}\n\nQuery: {prompt}\nAnswer:"
    if p:
        print(rag_prompt)

    # Honour the caller's choice instead of always requesting a stream.
    payload = {"model": ollama_model_name, "prompt": rag_prompt, "stream": stream}
    headers = {"Content-Type": "application/json"}

    response_text = ""
    buffer = ""

    async with aiohttp.ClientSession() as session:
        async with session.post(ollama_url_gen, headers=headers, data=json.dumps(payload)) as response:
            response.raise_for_status()

            if stream:
                # Read line by line: the stream is parsed as one JSON object per
                # line, and arbitrary byte chunks can split an object in two,
                # which would make json.loads fail and drop text.
                async for raw_line in response.content:
                    line = raw_line.strip()
                    if not line:
                        continue
                    try:
                        data = json.loads(line.decode('utf-8'))
                    except json.JSONDecodeError:
                        continue

                    buffer += data.get("response", "")

                    # Re-render only every ~10 characters to limit flicker.
                    if len(buffer) > 10:
                        response_text += buffer
                        clear_output(wait=True)
                        display(Markdown(response_text))
                        buffer = ""

                # Flush whatever is left after the stream ends.
                response_text += buffer
                clear_output(wait=True)
                display(Markdown(response_text))
            else:
                # Show the answer text, not the raw JSON envelope.
                data = json.loads(await response.text())
                response_text = data.get("response", "")
                display(Markdown(response_text))

    return response_text

# Index the chunks produced by the scraping cell. Any exception raised while
# storing (or while printing the success message) is reported with print
# rather than propagated, so this cell itself does not fail.
try:
    store_in_qdrant_with_metadata(rag_chunks)
    print(f'Stored {len(rag_chunks)} chunks')
except Exception as e:
    print(f"Error storing in Qdrant: {e}")


In [None]:
# `ask` is a coroutine function: calling it bare only creates a coroutine
# object and never runs the query. Drive it on an event loop instead.
# (In a notebook cell, `await ask(question)` is the equivalent shorthand.)
question = "What are Hissain's special interests?"
try:
    running_loop = asyncio.get_running_loop()
except RuntimeError:
    # No loop is running (plain Python script): run the coroutine to completion.
    asyncio.run(ask(question))
else:
    # A loop is already running (Jupyter kernel): schedule the coroutine on it.
    # Keep a reference so the task is not garbage-collected before it finishes.
    answer_task = running_loop.create_task(ask(question))