In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
#from transformers import GPT2TokenizerFast
from transformers import AutoTokenizer
import re
from tqdm.notebook import tqdm
import time

import os

# Turn off the tokenizers library's parallelism flag before the tokenizer is
# loaded or used.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Directory of the local all-MiniLM-L6-v2 checkpoint. It can be overridden with
# the MINILM_MODEL_PATH environment variable so the notebook also runs on
# machines where the default path below does not exist.
model_path = os.environ.get(
    "MINILM_MODEL_PATH",
    '/Users/hissain/git/github/models/all-MiniLM-L6-v2',
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

def init_driver():
    """Create a headless Chrome WebDriver.

    Returns:
        A `webdriver.Chrome` instance started with the `--headless`,
        `--no-sandbox` and `--disable-dev-shm-usage` arguments and a default
        `Service()`.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(service=Service(), options=chrome_options)

def extract_text_from_url(url, driver, wait_seconds=1):
    """Load `url` in `driver` and return the text of the page body.

    Args:
        url: Address of the page to load.
        driver: WebDriver used to fetch the page; its `page_source` is parsed.
        wait_seconds: Pause after `driver.get` before the page source is read
            (presumably to let client-side rendering finish). Defaults to 1.

    Returns:
        The body text with boilerplate and hidden elements removed, joined
        with single spaces, or "" when the page has no <body>.
    """
    driver.get(url)
    time.sleep(wait_seconds)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    body = soup.body
    if body is None:
        return ""

    # NOTE(review): dropping <a> removes link text that is part of running
    # sentences (the extracted text contains fragments such as
    # "learn more about ."). Kept as-is to avoid pulling in navigation links.
    for tag in body(["script", "style", "footer", "header", "nav", "a"]):
        tag.decompose()

    for element in body.find_all("div", class_="d-flex"):
        element.decompose()

    def _is_hidden(tag):
        """Return True for tags hidden via the `hidden` attribute or inline CSS."""
        if tag.has_attr('hidden'):
            return True
        if not tag.has_attr('style'):
            return False
        # Drop whitespace and case so "display:none", "display: none" and
        # "DISPLAY : NONE" are all recognised.
        style = re.sub(r'\s+', '', tag['style']).lower()
        return 'display:none' in style or 'visibility:hidden' in style

    for hidden_elem in body.find_all(_is_hidden):
        hidden_elem.decompose()

    return body.get_text(separator=" ")

def clean_text(text):
    """Normalise raw page text for sentence splitting.

    Removes every character that is not a word character, whitespace or one of
    . , ! ? ' " ( ) -, collapses whitespace runs to a single space, lowercases
    the result and strips leading/trailing whitespace.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    # Remove symbols first: deleting a symbol that sits between two spaces
    # leaves a double space, which the whitespace collapse below then fixes.
    text = re.sub(r'[^\w\s.,!?\'"()-]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)               # Remove extra whitespace
    text = text.lower()                            # Normalize to lowercase
    return text.strip()

def split_sentences(text):
    """Split `text` into sentences.

    A boundary is one or more spaces that follow '.', '!' or '?'. The
    punctuation stays attached to the sentence it ends.

    Args:
        text: Text to split.

    Returns:
        A list of non-blank sentences; [] for empty or whitespace-only input.
    """
    parts = re.split(r'(?<=[.!?]) +', text)  # Split on sentence boundaries
    # re.split yields '' for empty input and after a trailing delimiter; blank
    # pieces would otherwise turn into empty chunks downstream.
    return [part for part in parts if part.strip()]

def count_tokens(text):
    """Return the number of token ids `tokenizer.encode` produces for `text`."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def partition_sentences(sentences, url, max_tokens=512, overlap=1, token_counter=None):
    """Group sentences into chunks of roughly `max_tokens` tokens.

    Sentences are appended to the current chunk until the next one would push
    it past `max_tokens`; the chunk is then emitted and the next chunk starts
    with the last `overlap` sentences of the previous one.

    Args:
        sentences: Iterable of sentence strings; blank ones are skipped.
        url: Source URL stored alongside every chunk.
        max_tokens: Token budget per chunk. A single sentence larger than the
            budget is still emitted, as an oversized chunk of its own.
        overlap: Number of trailing sentences carried into the next chunk;
            0 or less disables the overlap.
        token_counter: Optional callable `str -> int` used to measure text;
            defaults to the module-level `count_tokens`.

    Returns:
        A list of {"text": ..., "url": ...} dicts, one per chunk.
    """
    counter = token_counter or count_tokens
    chunks, current_chunk = [], []
    current_tokens = 0

    for sentence in sentences:
        if not sentence.strip():
            continue
        sentence_tokens = counter(sentence)

        # Only flush a chunk that has content; otherwise an oversized first
        # sentence would emit a chunk with empty text.
        if current_chunk and current_tokens + sentence_tokens > max_tokens:
            chunks.append({"text": " ".join(current_chunk), "url": url})
            # current_chunk[-0:] is the whole list, so overlap=0 needs its own
            # branch to actually start the next chunk empty.
            current_chunk = current_chunk[-overlap:] if overlap > 0 else []
            current_tokens = counter(" ".join(current_chunk)) if current_chunk else 0

        current_chunk.append(sentence)
        current_tokens += sentence_tokens

    if current_chunk:
        chunks.append({"text": " ".join(current_chunk), "url": url})

    return chunks

def process_urls(urls, max_tokens=512, overlap=1):
    """Scrape every URL and return the combined list of text chunks.

    Each page is fetched with one shared WebDriver, cleaned, split into
    sentences and partitioned into chunks. A URL that fails is reported with
    `print` and skipped so the remaining URLs are still processed.

    Args:
        urls: Iterable of page URLs.
        max_tokens: Token budget per chunk, passed to `partition_sentences`.
        overlap: Sentence overlap between chunks, passed to
            `partition_sentences`.

    Returns:
        A list of {"text": ..., "url": ...} dicts across all URLs.
    """
    # NOTE(review): chunks near 512 tokens may exceed the embedding model's
    # input limit and be truncated when embedded -- confirm the model's
    # maximum sequence length.
    driver = init_driver()
    all_chunks = []

    try:
        for url in tqdm(urls, desc="Processing URLs"):
            try:
                raw_text = extract_text_from_url(url, driver)
                clean_text_content = clean_text(raw_text)
                sentences = split_sentences(clean_text_content)
                chunks = partition_sentences(sentences, url, max_tokens=max_tokens, overlap=overlap)
                all_chunks.extend(chunks)
            except Exception as e:
                print(f"Failed to process {url}: {e}")
    finally:
        # Always shut the browser down, otherwise every call leaves a
        # headless Chrome process running.
        driver.quit()

    return all_chunks

# Pages to scrape and index.
urls = [
    "https://github.com/hissain",
    "https://github.com/hissain/CoronaTracker",
    "https://github.com/hissain/minimal-cmsis-dsp",
    "https://github.com/hissain/ml"
]

# rag_chunks is a list of {"text", "url"} dicts; later cells embed and store it.
rag_chunks = process_urls(urls)
print(f"Total chunks: {len(rag_chunks)}")

Processing URLs:   0%|          | 0/4 [00:00<?, ?it/s]

Total chunks: 8


In [12]:
# Sanity check: print only the first chunk (the [:1] slice) with its source URL.
for i, chunk in enumerate(rag_chunks[:1]):
    print(f"Chunk {i+1}:\nText: {chunk['text']}\nURL: {chunk['url']}\n")

Chunk 1:
Text: hissain hissain block or report block or report hissain block user prevent this user from interacting with your repositories and sending you notifications. learn more about . you must be logged in to block users. add an optional note please don't include any personal information such as legal names or email addresses. maximum 100 characters, markdown supported. this note will be visible to only you. block user report abuse contact github support about this users behavior. learn more about . about me i am an accomplished associate architect with over 13 years of experience in mobile and wearables software development. with a deep understanding of the challenges inherent in developing performant, robust, testable, and maintainable applications, from requirement analysis to architecture, design, development, and maintenance, i am confident in my ability to lead software development and commercialization for any organization. my experience includes working on samsung health 

In [3]:
import numpy as np
from qdrant_client import QdrantClient, models
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
from IPython.display import display, clear_output, Markdown
import requests
import json
import asyncio

# One shared HTTP session; every request made through it sends these headers.
session = requests.Session()
session.headers.update({"Connection": "keep-alive", "Content-Type": "application/json"})

# Local Qdrant endpoint and the collection that stores the chunk vectors.
qdrant_url = "http://localhost:6333"
collection_name = "github_collection"

# Local Ollama endpoints and model tag. Of these URLs only ollama_url_gen is
# referenced by the functions in this cell; ollama_url_inf and ollama_url_emb
# are not used in the cells shown here.
ollama_url_inf = "http://localhost:11434/api/show"
ollama_url_emb = "http://localhost:11434/api/embeddings"
ollama_url_gen = "http://localhost:11434/api/generate"
ollama_model_name = "llama3.2:latest"

client = QdrantClient(url=qdrant_url)
# model_path is defined in the first cell, so that cell must run before this one.
embedding_model = SentenceTransformer(model_path)

def get_embedding(text):
    """Encode `text` with the module-level `embedding_model` and return the result."""
    vector = embedding_model.encode(text)
    return vector

def create_collection(dimension):
    """Recreate the `collection_name` collection with cosine-distance vectors.

    Args:
        dimension: Size of the vectors the collection will hold.
    """
    # Best-effort removal of a previous collection: any error raised by the
    # delete call is ignored.
    try:
        client.delete_collection(collection_name=collection_name)
    except Exception:
        pass

    vector_params = models.VectorParams(size=dimension, distance=models.Distance.COSINE)
    client.create_collection(collection_name=collection_name, vectors_config=vector_params)
    
def upsert_points_with_metadata(embeddings, chunks):
    """Upsert one point per (embedding, chunk) pair into `collection_name`.

    Point ids are the 0-based position in the paired sequences; the payload
    carries the chunk's "text" and "url". Pairing stops at the shorter of the
    two inputs.

    Args:
        embeddings: Vectors exposing `.tolist()`, aligned with `chunks`.
        chunks: Dicts with "text" and "url" keys.
    """
    points = []
    for point_id, (vector, chunk) in enumerate(zip(embeddings, chunks)):
        vector_values = vector.tolist()
        metadata = {"text": chunk["text"], "url": chunk["url"]}
        points.append(models.PointStruct(id=point_id, vector=vector_values, payload=metadata))
    client.upsert(collection_name=collection_name, points=points)

def store_in_qdrant_with_metadata(chunks):
    """Embed `chunks` and store them in a freshly created Qdrant collection.

    Args:
        chunks: List of dicts with "text" and "url" keys.
    """
    # Embed first: if embedding fails, the existing collection has not been
    # dropped yet.
    embeddings = [get_embedding(chunk["text"]) for chunk in tqdm(chunks, desc="Generating embeddings")]

    # Size the collection from the vectors actually produced so it stays
    # correct if the embedding model changes; 384 (all-MiniLM-L6-v2) is only
    # the fallback when there is nothing to embed.
    dimension = len(embeddings[0]) if embeddings else 384
    create_collection(dimension)

    # Nothing to upsert for an empty chunk list.
    if embeddings:
        upsert_points_with_metadata(embeddings, chunks)

def search_points_with_metadata(query_embedding, k=3):
    """Return the payloads of the `k` best matches for `query_embedding`.

    Args:
        query_embedding: Query vector exposing `.tolist()`.
        k: Maximum number of hits to request.

    Returns:
        A list of {"text": ..., "url": ...} dicts, one per hit, in the order
        returned by the search.
    """
    hits = client.search(
        collection_name=collection_name,
        query_vector=query_embedding.tolist(),
        limit=k,
        with_payload=True,
    )
    results = []
    for hit in hits:
        results.append({"text": hit.payload["text"], "url": hit.payload["url"]})
    return results

def ask(query, k=3, p=False):
    """Answer `query` from the indexed chunks and stream the reply into the cell.

    The `k` most similar chunks are retrieved, placed in a prompt together
    with the query, and sent to the Ollama generate endpoint. The reply is
    rendered as Markdown while it streams.

    Args:
        query: Question to answer.
        k: Number of chunks to retrieve as context.
        p: When True, print the full prompt before sending it.

    Returns:
        The complete response text, or "" when the request fails.
    """
    query_embedding = get_embedding(query)
    retrieved_docs = search_points_with_metadata(query_embedding, k)

    combined_docs = "\n\n".join([f"Source: {doc['url']}\n{doc['text']}" for doc in retrieved_docs])
    inst = "Instruction: If you do not find the answer in the context, just say you don't know."
    rag_prompt = f"{inst}\n\nContext:\n{combined_docs}\n\nQuery: {query}\nAnswer:"
    if p:
        print(rag_prompt)

    payload = {"model": ollama_model_name, "prompt": rag_prompt, "stream": True}
    headers = {"Content-Type": "application/json"}

    response_text = ""
    buffer = ""

    # The context manager closes the streamed connection even if parsing or
    # rendering raises.
    with session.post(ollama_url_gen, headers=headers, data=json.dumps(payload), stream=True) as response:
        if response.status_code != 200:
            print("Request failed:", response.status_code, response.text)
            return response_text

        # Parse line by line: a raw network chunk may hold a partial JSON
        # object or several objects, and parsing it whole silently drops text.
        for line in response.iter_lines(chunk_size=None):
            if not line:
                continue
            try:
                data = json.loads(line.decode('utf-8'))
            except json.JSONDecodeError:
                continue
            buffer += data.get("response", "")

            # Re-render only every few characters to limit display refreshes.
            if len(buffer) > 10:
                response_text += buffer
                clear_output(wait=True)
                display(Markdown(response_text))
                buffer = ""

    # Display any remaining buffered content
    response_text += buffer
    clear_output(wait=True)
    display(Markdown(response_text))

    return response_text

# Embed and index the scraped chunks. A failure is printed rather than raised,
# so check the output for "Stored ..." before running the query cells.
try:
    store_in_qdrant_with_metadata(rag_chunks)
    print(f'Stored {len(rag_chunks)} chunks')
except Exception as e:
    print(f"Error storing in Qdrant: {e}")


Generating embeddings:   0%|          | 0/8 [00:00<?, ?it/s]

Stored 8 chunks


In [4]:
# ask() renders the answer itself; assigning to _ keeps the returned string from being echoed again.
_ = ask("What are Hissain's special interests?")

Technological innovation, human-machine interaction, information theory, astronomy, probability, theory of relativity, philosophy of science, piano, guitar, and poetry.

In [5]:
# Single-turn question about one of the scraped repositories.
_ = ask("Tell me about CoronaTracker")

CoronaTracker appears to be a project aimed at tracking COVID-19 cases, close contacts, and geographic locations of citizens. The app uses GPS, Bluetooth, and NFC data to detect close contacts for individuals who have tested positive for COVID-19, with the goal of identifying potential candidates for quarantine and further testing.

In [6]:
# Single-turn question; ask() keeps no history, so each call is independent.
_ = ask("Any repository for Machine learning?")

Yes, there are several repositories available on GitHub for machine learning. One of them is the "hissain/ml" repository you mentioned earlier (https://github.com/hissain/ml), which contains a collection of basic and frequently used machine learning code snippets in various projects and tasks.

Additionally, you can also explore other repositories such as:

* https://github.com/hissain/minimal-cmsis-dsp (which includes some machine learning related files)
* https://github.com/hissain (Hissain's GitHub profile page which has a list of his projects including one that might be related to machine learning)

Please note that the first repository you mentioned is specifically focused on machine learning code snippets, and it's worth checking it out if you're looking for some inspiration or examples.

In [7]:
# Ollama chat endpoint used by chat().
ollama_url_chat = "http://localhost:11434/api/chat"

# Conversation messages sent with every chat() call; re-running this cell resets the conversation.
chat_history = []

def chat(query, k=2, p=False, stream=True):
    """Answer `query` as the next turn of a multi-turn conversation.

    The `k` most similar chunks are retrieved and placed in a prompt with the
    query. That prompt is sent to the Ollama chat endpoint together with the
    messages already in the module-level `chat_history`, and the reply is
    rendered as Markdown while it arrives.

    `chat_history` is updated only after a successful request: the user prompt
    and the assistant reply are appended as a pair, so later turns see earlier
    answers and a failed request leaves no dangling user message.

    Args:
        query: Question for this turn.
        k: Number of chunks to retrieve as context.
        p: When True, print the full prompt before sending it.
        stream: Whether to request and consume a streamed response.

    Returns:
        The complete response text, or "" when the request fails.
    """
    query_embedding = get_embedding(query)
    retrieved_docs = search_points_with_metadata(query_embedding, k)

    combined_docs = "\n\n".join([f"Source: {doc['url']}\n{doc['text']}" for doc in retrieved_docs])
    inst = "Instruction: If you do not find the answer in the context, just say you don't know."
    rag_prompt = f"{inst}\n\nContext:\n{combined_docs}\n\nQuery: {query}\nAnswer:"
    if p:
        print(rag_prompt)

    user_message = {"role": "user", "content": rag_prompt}
    payload = {"model": ollama_model_name, "messages": chat_history + [user_message], "stream": stream}

    response_text = ""
    buffer = ""

    # The context manager closes the connection even if parsing or rendering
    # raises.
    with session.post(ollama_url_chat, data=json.dumps(payload), stream=stream) as response:
        if response.status_code != 200:
            print("Request failed:", response.status_code, response.text)
            return response_text

        # Parse line by line: a raw network chunk may hold a partial JSON
        # object or several objects, and parsing it whole silently drops text.
        for line in response.iter_lines(chunk_size=None):
            if not line:
                continue
            try:
                data = json.loads(line.decode('utf-8'))
            except json.JSONDecodeError:
                continue
            buffer += data.get("message", {}).get("content", "")

            # Re-render only every few characters to limit display refreshes.
            if len(buffer) > 10:
                response_text += buffer
                clear_output(wait=True)
                display(Markdown(response_text))
                buffer = ""

    # Display any remaining buffered content
    response_text += buffer
    clear_output(wait=True)
    display(Markdown(response_text))

    # Record the completed exchange so the next turn has both sides of it.
    chat_history.append(user_message)
    chat_history.append({"role": "assistant", "content": response_text})

    return response_text

In [8]:
# First chat turn; k defaults to 2 retrieved chunks.
_ = chat("Whats Hissain experience?")

Hissain has over 13 years of experience in mobile and wearables software development. He has worked on Samsung Health for iOS, which boasts over 6 million users and approximately 160k downloads on the App Store market, with a global rating of 4.5. Additionally, he has achieved six patent applications granted by Samsung SIPMS between 2021 and 2024, currently in the process of being published in USPTO among them, one patent has already been published in USPTO, Wipo, and KR.

In [9]:
# Follow-up turn: "his" is only resolvable through the earlier messages kept in chat_history.
_ = chat("Whats his topic of special interest?")

1. Technological innovation
2. Human-machine interaction
3. Information theory
4. Astronomy
5. Probability
6. Theory of relativity
7. Philosophy of science
8. Piano
9. Guitar
10. Poetry 

Additionally, his special interests are not limited to these areas, as he mentions that "I value the passion of problem-solving minds and firmly believe that software engineering is the profession that suits me best."

In [13]:
# NOTE(review): the answer below lacks the license name -- likely because
# extract_text_from_url drops <a> elements, and with them the link text.
_ = chat("Whats the license for https://github.com/hissain/ml repository?")

This repository is licensed under the . feel free to use and modify the code as needed for your own projects.