In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
#from transformers import GPT2TokenizerFast
from transformers import AutoTokenizer
import re
from tqdm.notebook import tqdm
import time

#tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

model_path = '/Users/hissain/git/github/models/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_path)

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

def init_driver():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    service = Service()
    return webdriver.Chrome(service=service, options=options)

def extract_text_from_url(url, driver):
    driver.get(url)
    time.sleep(1)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    for element in soup.find_all("div", class_="js-yearly-contributions"):
        element.decompose()
        
    for element in soup(["script", "style"]):
        element.decompose()
    text = soup.get_text(separator=" ")
    return text

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)           # Remove extra whitespace
    text = re.sub(r'[^\w\s.,!?\'"()-]', '', text)  # Remove special characters
    text = text.lower()                         # Normalize to lowercase
    return text.strip()

def split_sentences(text):
    sentences = re.split(r'(?<=[.!?]) +', text)  # Split on sentence boundaries
    return sentences

def count_tokens(text):
    return len(tokenizer.encode(text))

def partition_sentences(sentences, url, max_tokens=512, overlap=1):
    chunks, current_chunk = [], []
    current_tokens = 0

    for i, sentence in enumerate(sentences):
        sentence_tokens = count_tokens(sentence)
        
        if current_tokens + sentence_tokens > max_tokens:
            chunks.append({"text": " ".join(current_chunk), "url": url})
            current_chunk = current_chunk[-overlap:]
            current_tokens = count_tokens(" ".join(current_chunk))

        current_chunk.append(sentence)
        current_tokens += sentence_tokens

    if current_chunk:
        chunks.append({"text": " ".join(current_chunk), "url": url})

    return chunks

def process_urls(urls):
    driver = init_driver()
    all_chunks = []
    
    for url in tqdm(urls, desc="Processing URLs"):
        try:
            raw_text = extract_text_from_url(url, driver)
            clean_text_content = clean_text(raw_text)
            sentences = split_sentences(clean_text_content)
            chunks = partition_sentences(sentences, url, max_tokens=512, overlap=1)
            all_chunks.extend(chunks)
        except Exception as e:
            print(f"Failed to process {url}: {e}")
    
    #driver.quit()
    return all_chunks

urls = [
    "https://github.com/hissain",
    "https://github.com/hissain/CoronaTracker",
]

rag_chunks = process_urls(urls)


Processing URLs:   0%|          | 0/2 [00:00<?, ?it/s]

Chunk 1:
Text: hissain (md. sazzad hissain khan)  github skip to content navigation menu toggle navigation sign in product github copilot write better code with ai security find and fix vulnerabilities actions automate any workflow codespaces instant dev environments issues plan and track work code review manage code changes discussions collaborate outside of code code search find more, search less explore all features documentation github skills blog solutions by company size enterprises small and medium teams startups by use case devsecops devops cicd view all use cases by industry healthcare financial services manufacturing government view all industries view all solutions resources topics ai devops security software development view all explore learning pathways white papers, ebooks, webinars customer stories partners open source github sponsors fund open source developers the readme project github community articles repositories topics trending collections enterprise enterprise pl

In [43]:
for i, chunk in enumerate(rag_chunks[:3]):
    print(f"Chunk {i+1}:\nText: {chunk['text']}\nURL: {chunk['url']}\n")

Chunk 1:
Text: hissain (md. sazzad hissain khan)  github skip to content navigation menu toggle navigation sign in product github copilot write better code with ai security find and fix vulnerabilities actions automate any workflow codespaces instant dev environments issues plan and track work code review manage code changes discussions collaborate outside of code code search find more, search less explore all features documentation github skills blog solutions by company size enterprises small and medium teams startups by use case devsecops devops cicd view all use cases by industry healthcare financial services manufacturing government view all industries view all solutions resources topics ai devops security software development view all explore learning pathways white papers, ebooks, webinars customer stories partners open source github sponsors fund open source developers the readme project github community articles repositories topics trending collections enterprise enterprise pl

In [2]:
import numpy as np
from qdrant_client import QdrantClient, models
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
from IPython.display import display, clear_output, Markdown
import requests
import json
import asyncio

qdrant_url = "http://localhost:6333"
collection_name = "github_collection"

ollama_url_inf = "http://localhost:11434/api/show"
ollama_url_emb = "http://localhost:11434/api/embeddings"
ollama_url_gen = "http://localhost:11434/api/generate"
ollama_model_name = "llama3.2:latest"

client = QdrantClient(url=qdrant_url)
embedding_model = SentenceTransformer(model_path)

def get_embedding(text):
    return embedding_model.encode(text)

def create_collection(dimension):
    try:
        client.delete_collection(collection_name=collection_name)
    except Exception:
        pass

    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(size=dimension, distance=models.Distance.COSINE),
    )
    
def upsert_points_with_metadata(embeddings, chunks):
    points = [
        models.PointStruct(id=i, vector=embedding.tolist(), payload={"text": chunk["text"], "url": chunk["url"]})
        for i, (embedding, chunk) in enumerate(zip(embeddings, chunks))
    ]
    client.upsert(collection_name=collection_name, points=points)

def store_in_qdrant_with_metadata(chunks):
    dimension = 384  # Dimension for 'all-MiniLM-L6-v2'
    create_collection(dimension)
    embeddings = [get_embedding(chunk["text"]) for chunk in tqdm(chunks, desc="Generating embeddings")]
    upsert_points_with_metadata(embeddings, chunks)

def search_points_with_metadata(query_embedding, k=3):
    search_result = client.search(
        collection_name=collection_name,
        query_vector=query_embedding.tolist(),
        limit=k,
        with_payload=True
    )
    return [{"text": hit.payload["text"], "url": hit.payload["url"]} for hit in search_result]

def ask(query, k=3, p=False):
    
    query_embedding = get_embedding(query)
    retrieved_docs = search_points_with_metadata(query_embedding, k)
    
    combined_docs = "\n\n".join([f"Source: {doc['url']}\n{doc['text']}" for doc in retrieved_docs])
    inst = "Instruction: If you do not find the answer in the context, just say you don't know."
    rag_prompt = f"{inst}\n\nContext:\n{combined_docs}\n\nQuery: {query}\nAnswer:"
    if p:
        print(rag_prompt)
        
    payload = {"model": ollama_model_name, "prompt": rag_prompt, "stream": True}
    headers = {"Content-Type": "application/json"}

    response_text = ""
    buffer = ""

    response = requests.post(ollama_url_gen, headers=headers, data=json.dumps(payload), stream=True)

    # Process the response content as it arrives
    if response.status_code == 200:
        for chunk in response.iter_content(chunk_size=None):
            try:
                data = json.loads(chunk.decode('utf-8'))
                content = data.get("response", "")
                buffer += content

                # Display output every few characters for real-time effect
                if len(buffer) > 10:
                    response_text += buffer
                    clear_output(wait=True)
                    display(Markdown(response_text))
                    buffer = ""
                    
            except json.JSONDecodeError:
                continue

        # Display any remaining buffered content
        response_text += buffer
        clear_output(wait=True)
        display(Markdown(response_text))
    else:
        print("Request failed:", response.status_code, response.text)

    return response_text

try:
    store_in_qdrant_with_metadata(rag_chunks)
    print(f'Stored {len(rag_chunks)} chunks')
except Exception as e:
    print(f"Error storing in Qdrant: {e}")


Generating embeddings:   0%|          | 0/8 [00:00<?, ?it/s]

Stored 8 chunks


In [3]:
_ = ask("What are Hissain's special interests?")

Based on the provided context, it appears that Hissain is an AI enthusiast and has worked as a software architect. Additionally, he has shown interest in developing projects related to CoronaTracker, which suggests that his specialty may be in health-related applications or disease tracking systems.

His LinkedIn profile also mentions that he works at Samsung Electronics in Dhaka, Bangladesh, which could indicate that he is interested in technology and innovation, particularly in the field of telecommunications.

In [6]:
_ = ask("Tell me about CoronaTracker")

CoronaTracker is an open-source project aimed at creating an application to track close contacts within the last 14 days for a COVID-19 positive patient. The goal of this project is to help flatten the curve of infection by early detection and collection of close contact information, thereby aiding in the fight against the COVID-19 pandemic.

In [36]:
_ = ask("Anything about astronomy, philosophy?")

I don't know anything about astronomy or philosophy from the provided context. The texts appear to be related to a CoronaTracker project for COVID-19, machine learning, and sensor data analysis.

In [39]:
ollama_url_chat = "http://localhost:11434/api/chat"

chat_history = []

session = requests.Session()
session.headers.update({"Connection": "keep-alive", "Content-Type": "application/json"})

def chat(query, k=2, p=False, stream=True):
    global chat_history
    
    query_embedding = get_embedding(query)
    retrieved_docs = search_points_with_metadata(query_embedding, k)
    
    combined_docs = "\n\n".join([f"Source: {doc['url']}\n{doc['text']}" for doc in retrieved_docs])
    inst = "Instruction: If you do not find the answer in the context, just say you don't know."
    rag_prompt = f"{inst}\n\nContext:\n{combined_docs}\n\nQuery: {query}\nAnswer:"
    if p:
        print(rag_prompt)
        
    chat_history.append({"role": "user", "content": rag_prompt})
    payload = {"model": ollama_model_name, "messages": chat_history, "stream": stream}
    headers = {"Connection": "keep-alive", "Content-Type": "application/json"}

    response_text = ""
    buffer = ""

    response = session.post(ollama_url_chat, data=json.dumps(payload), stream=stream)

    # Process the response content as it arrives
    if response.status_code == 200:
        for chunk in response.iter_content(chunk_size=None):
            try:
                data = json.loads(chunk.decode('utf-8'))
                content = data.get("message", {}).get("content", "")
                buffer += content

                # Display output every few characters for real-time effect
                if len(buffer) > 10:
                    response_text += buffer
                    clear_output(wait=True)
                    display(Markdown(response_text))
                    buffer = ""
                    
            except json.JSONDecodeError:
                continue

        # Display any remaining buffered content
        response_text += buffer
        clear_output(wait=True)
        display(Markdown(response_text))
    else:
        print("Request failed:", response.status_code, response.text)

    return response_text

In [34]:
_ = chat("Who is Hissain?")

Hissain is an associate software architect, innovator, and AI enthusiast. He works at Samsung Electronics in Dhaka, Bangladesh, and has 7 followers on GitHub.

In [35]:
_ = chat("Whats his experience?")

He has over 13 years of experience in mobile and wearables software development, with a strong background in:

- OOP
- Android (Java, Kotlin)
- iOS (Swift, Objective-C)
- Xcode
- Version control systems
- System design
- Application architecture
- Development processes
- Wearable and hearable technology

Specifically, he has worked on Samsung Health for iOS, which boasts over 6 million users and approximately 160k DAU on the app store market. He has also achieved six patent applications granted by Samsung, with one already published in USPTO, Wipo, and KR.

Additionally, he is currently engaged in Samsung Earbuds device development on RTOS, specifically in music streaming over BT Classic and LE Audio, as well as in ballistocardiogram signal processing for stress score generation.

In [40]:
_ = chat("Whats his topic of special interest?")

His topic of special interest is sensor data and signal processing in Machine Learning.

In [48]:
_ = ask("Do my special interests include technological innovation, human-machine interaction, information theory, astronomy, probability, theory of relativity, philosophy of science, piano, guitar, and poetry?", p=True)

No, I don't know if any of those topics are mentioned in the provided context as part of your special interests. Your special interests include machine learning (specifically sensor data and signal processing).