In [6]:
# Imports
import time
import faiss
import json
import requests
import numpy as np
import json
import asyncio
import aiohttp
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium_stealth import stealth
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin
from tqdm.notebook import tqdm
from collections import deque

# Constants
root = 'https://en.wikipedia.org/wiki/Main_Page'  # prefix every crawled URL must start with
base_url = root  # crawl entry point; same value as the allowed prefix

# Local Ollama REST endpoints and the model requested from them
ollama_url_inf = "http://localhost:11434/api/show"
ollama_url_emb = "http://localhost:11434/api/embeddings"
ollama_url_gen = "http://localhost:11434/api/generate"
ollama_model_name = "llama3.2:latest"

VERBOSE = True  # gates the diagnostic prints used throughout the notebook
DEPTH = 0       # default crawl depth; 0 scrapes only the start page

# Configure Selenium Chrome options
chrome_options = Options()
for _chrome_flag in ('--headless', '--disable-gpu', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(_chrome_flag)

service = Service()
driver = webdriver.Chrome(service=service, options=chrome_options)

# Apply the selenium-stealth settings to the freshly created driver
stealth(
    driver,
    languages=["en-US", "en"],
    vendor="Google Inc.",
    platform="Win32",
    webgl_vendor="Intel Inc.",
    renderer="Intel Iris OpenGL Engine",
    fix_hairline=True,
)

def extract_clean_text_from_html(html_content):
    """Return the text of an HTML document as one whitespace-normalised string.

    ``script``, ``style``, ``footer``, ``header`` and ``nav`` elements are
    removed from the parsed document before its text is extracted.
    """
    document = BeautifulSoup(html_content, 'html.parser')

    # Drop the elements whose content should not appear in the result.
    for element in document.find_all(["script", "style", "footer", "header", "nav"]):
        element.decompose()

    raw_text = document.get_text(separator=' ', strip=True)
    # split() + join collapses every run of whitespace into a single space.
    return ' '.join(raw_text.split())

def scrape_recursive(base_url, depth=DEPTH):
    """Breadth-first scrape of ``base_url`` and the pages reachable from it.

    Args:
        base_url: URL the crawl starts from.
        depth: Maximum number of link hops to follow from ``base_url``;
            0 scrapes only the start page.

    Returns:
        A list of ``(url, page_text)`` tuples, one per page that loaded
        without error, in the order the pages were visited.

    Only URLs that start with the module-level ``root`` and contain no ``#``
    are visited. An exception while loading or parsing a page is reported
    (when ``VERBOSE``) and the crawl continues with the next URL.
    """
    visited_urls = set()
    queued_urls = {base_url}  # everything ever enqueued, so no URL is queued twice
    text_data = []
    queue = deque([(base_url, 0)])  # (url, current_depth)

    while queue:
        url, current_depth = queue.popleft()

        if url in visited_urls or current_depth > depth:
            continue

        if not url.startswith(root) or '#' in url:
            continue

        visited_urls.add(url)
        # Announce only URLs that are actually going to be fetched.
        print("Processing: ", url)

        try:
            driver.get(url)
            time.sleep(1)

            # Read the page source once so text and links come from the same snapshot.
            html = driver.page_source
            text_data.append((url, extract_clean_text_from_html(html)))

            if current_depth < depth:
                # A separate parse is needed: the text extractor strips nav/header/footer
                # from its own copy, and links inside those elements are still wanted here.
                link_soup = BeautifulSoup(html, 'html.parser')

                for link in link_soup.find_all('a', href=True):
                    # Resolve relative hrefs against the page they were found on,
                    # not against the crawl's starting URL.
                    new_url = urljoin(url, link['href'])
                    if new_url in queued_urls:
                        continue
                    if new_url.startswith(root) and '#' not in new_url:
                        queued_urls.add(new_url)
                        queue.append((new_url, current_depth + 1))

        except Exception as e:
            if VERBOSE:
                print(f"Error with URL {url}: {e}")

    return text_data

# Run the crawl; the browser is shut down whether or not scraping succeeds
try:
    scraped_data = scrape_recursive(base_url)
finally:
    driver.quit()
    print("Finished!")

print("Total text size: ", len(scraped_data))

# Preview the first 200 characters of every scraped page
for url, text in scraped_data:
    snippet = text[:200]
    print(f"URL: {url}\nText: {snippet}...\n")

Processing:  https://en.wikipedia.org/wiki/Main_Page
Finished!
Total text size:  1
URL: https://en.wikipedia.org/wiki/Main_Page
Text: Wikipedia, the free encyclopedia Jump to content Wiki Loves Monuments: Photograph a monument, help Wikipedia and win! Learn more From Wikipedia, the free encyclopedia Welcome to Wikipedia , the free e...



In [7]:
import json
import requests
import numpy as np
import faiss
import asyncio
import aiohttp
from tqdm.notebook import tqdm


# Function to partition text efficiently
def partition_text(text, max_length):
    """Group the sentences of ``text`` into partitions of at most ``max_length`` words.

    Sentences are found by splitting on ``'. '`` and are never split further,
    so a single sentence longer than ``max_length`` words becomes a partition
    of its own that exceeds the limit.

    Args:
        text: The text to partition. ``None``, empty or whitespace-only text
            yields no partitions.
        max_length: Maximum number of whitespace-separated words per partition.

    Returns:
        A list of partition strings, each made of consecutive sentences
        re-joined with ``'. '``.
    """
    if not text or not text.strip():
        return []

    sentences = text.split('. ')
    partitions = []
    current_part = []
    current_length = 0

    for sentence in tqdm(sentences, desc="Partitioning text"):
        sentence_length = len(sentence.split())

        # Flush only when something has been collected; otherwise an over-long
        # first sentence would emit an empty partition.
        if current_part and current_length + sentence_length > max_length:
            partitions.append('. '.join(current_part))
            current_part = []
            current_length = 0

        current_part.append(sentence)
        current_length += sentence_length

    if current_part:
        partitions.append('. '.join(current_part))

    return partitions


# Asynchronous function to fetch model embedding shape
async def get_embedding_shape():
    """Ask Ollama for the embedding length of ``ollama_model_name``.

    Posts the model name to ``ollama_url_inf`` and reads the ``model_info``
    mapping of the reply. ``llama.embedding_length`` is used when present;
    otherwise any key ending in ``.embedding_length`` is accepted, so models
    whose metadata uses a different architecture prefix are supported too.

    Returns:
        The embedding length reported by the server, or 0 when the request
        fails or the reply carries no embedding length.
    """
    payload = {"model": ollama_model_name}
    headers = {"Content-Type": "application/json"}

    async with aiohttp.ClientSession() as session:
        async with session.post(ollama_url_inf, headers=headers, data=json.dumps(payload)) as response:
            if response.status != 200:
                print(f"ERROR: Error from Ollama: {response.status}")
                return 0
            result = await response.json()

    model_info = result.get('model_info') or {}

    if 'llama.embedding_length' in model_info:
        return model_info['llama.embedding_length']

    # Fall back to the first "<architecture>.embedding_length" entry.
    for key, value in model_info.items():
        if key.endswith('.embedding_length'):
            return value

    print("ERROR: Embedding length not found in model info.")
    return 0

# Asynchronous function to fetch embeddings for a batch of text partitions
async def get_embeddings(partitions, fallback_dim=768):
    """Fetch one embedding per text partition from the Ollama embeddings endpoint.

    Requests are sent one after another to ``ollama_url_emb``. A partition whose
    request fails (non-200 status) or whose reply has no usable ``embedding``
    is represented by a zero vector, so the result always lines up
    index-for-index with ``partitions``.

    Args:
        partitions: Sequence of strings to embed.
        fallback_dim: Width of the zero vectors used when *no* request
            succeeded. When at least one request succeeds, zero vectors take
            the width of the first successful embedding instead, so all
            returned vectors have the same length.

    Returns:
        A list of 1-D numpy arrays, one per partition.
    """
    if not partitions:
        return []

    fetched = []  # per partition: ndarray on success, None on failure
    async with aiohttp.ClientSession() as session:
        for partition in tqdm(partitions, desc="Fetching embeddings"):
            payload = {"model": ollama_model_name, "prompt": partition}
            headers = {"Content-Type": "application/json"}
            async with session.post(ollama_url_emb, headers=headers, data=json.dumps(payload)) as response:
                if response.status != 200:
                    print(f"ERROR: Error from Ollama: {response.status}")
                    fetched.append(None)
                    continue
                result = await response.json()

            vector = result.get('embedding')
            # A missing or empty embedding is treated like a failed request.
            fetched.append(np.array(vector) if vector else None)

    # Size the placeholders from a real embedding so the batch is never ragged.
    dimension = next((vec.shape[0] for vec in fetched if vec is not None), fallback_dim)
    return [vec if vec is not None else np.zeros(dimension) for vec in fetched]

# Store embeddings in FAISS with optimized bulk addition
async def store_in_faiss(partitions):
    """Embed ``partitions`` and load the vectors into a new FAISS L2 index.

    The index dimension comes from ``get_embedding_shape()``. If that lookup
    fails (returns 0) or disagrees with the width of the vectors actually
    returned by ``get_embeddings``, the width of the returned vectors is used.

    Args:
        partitions: Sequence of strings to embed and index.

    Returns:
        ``(index, doc_ids)`` where ``index`` is a ``faiss.IndexFlatL2`` and
        ``doc_ids`` is ``[0, 1, ..., len(partitions) - 1]``; position ``i`` in
        the index corresponds to ``partitions[i]``.

    Raises:
        ValueError: If no embedding dimension can be determined (the model
            lookup failed and there are no embeddings to infer it from).
    """
    dimension = await get_embedding_shape()
    if VERBOSE:
        print(f"Model's embedding dimension: {dimension}")

    embeddings = await get_embeddings(partitions)

    matrix = None
    if embeddings:
        # FAISS works on contiguous float32 rows; numpy's default here is float64.
        matrix = np.ascontiguousarray(np.array(embeddings), dtype=np.float32)
        if matrix.ndim == 2 and matrix.shape[1] != dimension:
            if VERBOSE:
                print(f"Using embedding dimension {matrix.shape[1]} taken from the returned vectors")
            dimension = matrix.shape[1]

    if not dimension:
        raise ValueError("Could not determine the embedding dimension for the FAISS index")

    index = faiss.IndexFlatL2(dimension)
    if matrix is not None:
        index.add(matrix)  # Bulk add for efficiency
    return index, list(range(len(partitions)))

# Retrieve function with RAG applied to FAISS
async def retrieve_with_rag(query, faiss_index, partitions, k=5):
    """Answer ``query`` with the model, using the nearest partitions as context.

    The query is embedded, the ``k`` nearest vectors are looked up in
    ``faiss_index``, the matching entries of ``partitions`` are joined into a
    context block, and the resulting prompt is posted to ``ollama_url_gen``.

    Args:
        query: The user's question.
        faiss_index: FAISS index whose row ``i`` corresponds to ``partitions[i]``.
        partitions: The text partitions that were indexed.
        k: Number of neighbours to request from the index.

    Returns:
        The decoded JSON body of the generate response.
    """
    query_embedding = (await get_embeddings([query]))[0]
    # FAISS expects a 2-D float32 array with one row per query.
    query_matrix = np.asarray([query_embedding], dtype=np.float32)
    _distances, indices = faiss_index.search(query_matrix, k=k)

    # When the index holds fewer than k vectors FAISS pads the result with -1.
    # A negative index would silently wrap around to the last partition, so
    # only indices inside [0, len(partitions)) are accepted.
    retrieved_docs = [partitions[i] for i in indices[0] if 0 <= i < len(partitions)]

    combined_docs = "\n".join(retrieved_docs)
    rag_prompt = f"Instruction: If you do not find the exact answer in the context, simply say you dont know.\nContext:\n{combined_docs}\n\nQuery: {query}\nAnswer:"

    async with aiohttp.ClientSession() as session:
        payload = {"model": ollama_model_name, "prompt": rag_prompt, "stream": False}
        headers = {"Content-Type": "application/json"}
        async with session.post(ollama_url_gen, headers=headers, data=json.dumps(payload)) as response:
            return await response.json()

# Function to initiate retrieval with RAG
async def ask(query):
    """Run ``query`` through the RAG pipeline and return only the answer text.

    Uses the module-level ``faiss_index`` and ``partitions``. Falls back to
    ``"No response available"`` when the reply has no ``"response"`` field.
    """
    reply = await retrieve_with_rag(query, faiss_index, partitions)
    answer = reply.get("response", "No response available")
    return answer

# Assuming you have `scraped_data` initialized as a list of text data
all_text = scraped_data[0][1]
partitions = partition_text(all_text, max_length=512)

if VERBOSE:
    print(f'Total partition count: {len(partitions)}')

faiss_index, doc_ids = await store_in_faiss(partitions)

Partitioning text:   0%|          | 0/54 [00:00<?, ?it/s]

Total partition count: 3
Model's embedding dimension: 3072


Fetching embeddings:   0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
# Character count (not word count) of the text that was partitioned and indexed.
print(f'all_text len: {len(all_text)}')

all_text len: 7884


In [9]:
ans = await ask("Whats the page about?")
print(ans)

Fetching embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

The page you're referring to is the Wikipedia Main Page.


In [17]:
ollama_url_chat = "http://localhost:11434/api/chat"

async def chat(prompt):
    """Send ``prompt`` as a single user message to the Ollama chat endpoint.

    No conversation history is kept. Returns the decoded JSON body of the
    non-streamed response.
    """
    request_body = json.dumps({
        "model": ollama_model_name,
        "messages": [{'role': 'user', 'content': prompt}],
        "stream": False,
    })
    async with aiohttp.ClientSession() as http_session:
        reply_ctx = http_session.post(
            ollama_url_chat,
            headers={"Content-Type": "application/json"},
            data=request_body,
        )
        async with reply_ctx as reply:
            return await reply.json()

In [18]:
# One-off question; `res` receives whatever chat() returns for this prompt.
res = await chat("What is wikipedia?")

In [19]:
# Bare expression so the notebook displays the value of `res` as the cell output.
res

{'model': 'llama3.2:latest',
 'created_at': '2024-10-26T14:53:19.204098Z',
 'message': {'role': 'assistant',
  'content': "Wikipedia is a free online encyclopedia that allows anyone with an internet connection to access and contribute to its vast collection of articles on various subjects. It was founded in 2001 by Jimmy Wales and Larry Sanger, and it has since become one of the most popular websites in the world.\n\nWikipedia's content is edited and maintained by volunteers from around the globe, who contribute their knowledge and expertise to create and improve articles. The site uses a collaborative model, where anyone can register and start editing articles, but also relies on a system of checks and balances to ensure that information is accurate and reliable.\n\nHere are some key features of Wikipedia:\n\n1. **Free access**: Anyone with an internet connection can access Wikipedia without paying any fees or subscription.\n2. **Collaborative model**: Anyone can contribute to and edi

In [23]:
import aiohttp
import json
import asyncio

ollama_url_chat = "http://localhost:11434/api/chat"
ollama_model_name = "llama3.2:latest"

# Conversation so far, as a list of {"role": ..., "content": ...} messages.
chat_history = []

async def chat(prompt, stream=True):
    """Send ``prompt`` to the Ollama chat endpoint, keeping conversation history.

    The prompt is appended to the module-level ``chat_history`` as a user
    message, the whole history is posted to ``ollama_url_chat``, and the
    assistant's reply text is appended to the history afterwards.

    Args:
        prompt: Text of the user message.
        stream: When true, reply fragments are printed as they arrive;
            otherwise the complete reply is printed once.

    Returns:
        The assistant's reply text (the ``message.content`` of the response),
        in both streaming and non-streaming mode.
    """
    chat_history.append({"role": "user", "content": prompt})
    payload = {"model": ollama_model_name, "messages": chat_history, "stream": stream}
    headers = {"Content-Type": "application/json"}

    response_text = ""

    async with aiohttp.ClientSession() as session:
        async with session.post(ollama_url_chat, headers=headers, data=json.dumps(payload)) as response:
            if stream:
                # The streamed body is one JSON object per line. Reading by line
                # keeps every object (and every multi-byte character) whole even
                # when the network splits or merges chunks.
                async for line in response.content:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        data = json.loads(line)
                        content = data.get("message", {}).get("content", "")
                        response_text += content
                        print(content, end='')
                    except (json.JSONDecodeError, UnicodeDecodeError):
                        # Best effort: skip a line that is not valid JSON.
                        continue
            else:
                # Store the reply text, not the raw JSON envelope, so the
                # history sent on the next turn stays a clean conversation.
                data = await response.json()
                response_text = data.get("message", {}).get("content", "")
                print(response_text)

    chat_history.append({"role": "assistant", "content": response_text})

    return response_text

# First turn of the conversation: the reply is printed while streaming and also returned.
await chat("What is Wikipedia?")


Wikipedia is a free online encyclopedia that allows users to access and contribute to a vast repository of knowledge on various subjects. It was launched in January 2001 by Jimmy Wales and Larry Sanger, and it has since become one of the most popular and widely-used reference sources on the internet.

Wikipedia's core principle is based on the concept of collaborative editing, where anyone with an internet connection can create an account and contribute to articles by adding or modifying content. The platform relies on a community-driven approach, where editors work together to review, verify, and improve the accuracy and quality of the information presented.

Here are some key features that define Wikipedia:

1. **Free and open**: Wikipedia is free to access and use, with no subscription or payment required.
2. **Collaborative editing**: Anyone can create an account and contribute to articles, but this requires adherence to a set of guidelines and rules.
3. **Encyclopedic content**: A

"Wikipedia is a free online encyclopedia that allows users to access and contribute to a vast repository of knowledge on various subjects. It was launched in January 2001 by Jimmy Wales and Larry Sanger, and it has since become one of the most popular and widely-used reference sources on the internet.\n\nWikipedia's core principle is based on the concept of collaborative editing, where anyone with an internet connection can create an account and contribute to articles by adding or modifying content. The platform relies on a community-driven approach, where editors work together to review, verify, and improve the accuracy and quality of the information presented.\n\nHere are some key features that define Wikipedia:\n\n1. **Free and open**: Wikipedia is free to access and use, with no subscription or payment required.\n2. **Collaborative editing**: Anyone can create an account and contribute to articles, but this requires adherence to a set of guidelines and rules.\n3. **Encyclopedic con

In [24]:
# Follow-up turn: the pronoun can be resolved because chat() resends chat_history.
await chat("What is it's vision?")

Wikipedia's vision is to create a free online encyclopedia that provides accurate and reliable information on any topic in the world, accessible to anyone with an internet connection.

Specifically, Wikipedia aims to:

1. **Make knowledge universally accessible**: Provide free access to high-quality educational resources for everyone, regardless of their geographical location or socio-economic background.
2. **Foster a collaborative community**: Encourage and support a global community of editors, administrators, and volunteers who work together to create, review, and improve the content.
3. **Provide accurate and reliable information**: Strive for accuracy, neutrality, and objectivity in all articles, ensuring that users receive trustworthy information on various subjects.

Wikipedia's vision is guided by its core values:

1. **Free access**: Make knowledge freely available to anyone with an internet connection.
2. **Collaborative editing**: Encourage and support a global community of

"Wikipedia's vision is to create a free online encyclopedia that provides accurate and reliable information on any topic in the world, accessible to anyone with an internet connection.\n\nSpecifically, Wikipedia aims to:\n\n1. **Make knowledge universally accessible**: Provide free access to high-quality educational resources for everyone, regardless of their geographical location or socio-economic background.\n2. **Foster a collaborative community**: Encourage and support a global community of editors, administrators, and volunteers who work together to create, review, and improve the content.\n3. **Provide accurate and reliable information**: Strive for accuracy, neutrality, and objectivity in all articles, ensuring that users receive trustworthy information on various subjects.\n\nWikipedia's vision is guided by its core values:\n\n1. **Free access**: Make knowledge freely available to anyone with an internet connection.\n2. **Collaborative editing**: Encourage and support a global 