In [1]:
import time
import faiss
import json
import requests
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from urllib.parse import urljoin
from tqdm.notebook import tqdm

# Ollama HTTP endpoints on the local server:
#   - /api/show       : model metadata; get_embedding_shape() reads model_info from it
#   - /api/embeddings : used by get_embedding() to turn a text into a vector
#   - /api/generate   : used by retrieve_with_rag() to produce the final answer
ollama_url_inf = "http://localhost:11434/api/show"
ollama_url_emb = "http://localhost:11434/api/embeddings"
ollama_url_gen = "http://localhost:11434/api/generate"
# Model name sent in the payload of every Ollama request made by this notebook.
ollama_model_name = "llama3.2:latest"

# Seed page for the crawl; get_visitable_links() only keeps URLs that start with this string.
base_url = 'https://en.wikipedia.org/wiki/Bangladesh'

# Set to True to enable the optional progress print() calls guarded by `if VERBOSE:`.
VERBOSE = False

# Build the Chrome options for Selenium by applying each command-line flag in turn.
chrome_options = Options()
for _chrome_flag in (
    '--headless',
    '--disable-gpu',
    '--no-sandbox',
    '--disable-dev-shm-usage',
):
    chrome_options.add_argument(_chrome_flag)

# Default Service() with no explicit driver path; the browser session is
# shared by the scraping helpers below through the module-level `driver`.
service = Service()
driver = webdriver.Chrome(options=chrome_options, service=service)

def get_visitable_links(base_url):
    """Collect the unique links on ``base_url`` that point below ``base_url``.

    The page is loaded through the module-level Selenium ``driver`` and parsed
    with BeautifulSoup. Every ``<a href=...>`` is resolved against ``base_url``
    and kept only if the absolute URL

    * contains no ``#`` and no ``%`` character, and
    * starts with ``base_url`` (plain string-prefix match).

    Args:
        base_url: Absolute URL of the page to load; also the required prefix.

    Returns:
        list[str]: Matching absolute URLs, de-duplicated, in order of first
        appearance on the page.
    """
    driver.get(base_url)
    time.sleep(1)  # fixed pause after navigation before reading page_source
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    links = []
    seen = set()  # O(1) duplicate check; `links` keeps first-seen order
    for a_tag in soup.find_all('a', href=True):
        full_url = urljoin(base_url, a_tag['href'])
        if full_url in seen:
            continue
        if '#' in full_url or '%' in full_url:
            continue
        if not full_url.startswith(base_url):
            continue
        seen.add(full_url)
        links.append(full_url)
    return links

def scrape_text_from_url(url):
    """Return the paragraph text of ``url`` as one whitespace-normalised string.

    The page is loaded with the module-level Selenium ``driver``; the text of
    every ``<p>`` element is split into words and all words are re-joined with
    single spaces.

    Args:
        url: Address of the page to load.

    Returns:
        str: Words of all ``<p>`` elements separated by single spaces
        (empty string when there are none).
    """
    driver.get(url)
    time.sleep(1)
    page = BeautifulSoup(driver.page_source, 'html.parser')

    words = []
    for paragraph in page.find_all('p'):
        # split() with no argument drops every run of whitespace
        words.extend(paragraph.get_text().split())
    return ' '.join(words)

def partition_text(text, max_length):
    """Split ``text`` into chunks of roughly ``max_length`` words on sentence boundaries.

    Sentences are delimited by ``'. '``. Sentences are accumulated until the
    running word count *exceeds* ``max_length``; the chunk is then closed, so
    ``max_length`` is a soft limit (a chunk includes the sentence that pushed
    it over the threshold).

    The period consumed by the split is restored at the end of every chunk
    that is followed by more text, so ``' '.join(partitions)`` reproduces the
    input.

    Args:
        text: Text to partition.
        max_length: Word-count threshold after which a chunk is closed.

    Returns:
        list[str]: The chunks in original order; ``[]`` when ``text`` is empty
        or contains only whitespace.
    """
    if not text.strip():
        # Nothing worth embedding; avoid returning a single blank partition.
        return []

    sentences = text.split('. ')
    last_index = len(sentences) - 1
    partitions = []
    current_part = []
    current_length = 0

    for position, sentence in enumerate(tqdm(sentences, desc="Partitioning text")):
        current_length += len(sentence.split())
        current_part.append(sentence)

        if current_length > max_length:
            chunk = '. '.join(current_part)
            if position < last_index:
                # This sentence was followed by '. ' in the input; put the
                # period back so the chunk does not end mid-punctuation.
                chunk += '.'
            partitions.append(chunk)
            current_part = []
            current_length = 0

    if current_part:
        partitions.append('. '.join(current_part))

    return partitions


def get_embedding_shape():
    """Look up the embedding length of the configured Ollama model.

    Posts the model name to the ``ollama_url_inf`` endpoint and reads
    ``model_info['llama.embedding_length']`` from the JSON reply.

    Returns:
        The embedding length reported by the server, or ``0`` (after printing
        an error) when the HTTP status is not 200 or the field is missing.
    """
    reply = requests.post(
        ollama_url_inf,
        headers={"Content-Type": "application/json"},
        data=json.dumps({"model": ollama_model_name}),
    )

    # Guard clause: any non-200 status is reported and mapped to 0.
    if reply.status_code != 200:
        print(f"ERROR: Error from Ollama: {reply.status_code}")
        return 0

    info = reply.json()
    if 'model_info' not in info or 'llama.embedding_length' not in info['model_info']:
        print("ERROR: Embedding length not found in the model info.")
        return 0

    return info['model_info']["llama.embedding_length"]

def get_embedding(text):
    """Embed ``text`` with the configured Ollama model.

    Posts the model name and ``text`` to the ``ollama_url_emb`` endpoint and
    returns the ``'embedding'`` field of the JSON reply.

    Args:
        text: The string to embed.

    Returns:
        numpy.ndarray: 1-D ``float32`` vector (the dtype FAISS indexes work
        with). When the server does not answer with HTTP 200 an error is
        printed and an all-zero vector is returned instead; its length is the
        model's embedding length as reported by ``get_embedding_shape()``, so
        that it matches the index dimension, falling back to 768 if that
        lookup fails as well.
    """
    payload = {"model": ollama_model_name, "prompt": text}
    headers = {"Content-Type": "application/json"}
    response = requests.post(ollama_url_emb, headers=headers, data=json.dumps(payload))

    if response.status_code == 200:
        return np.asarray(response.json()['embedding'], dtype=np.float32)

    print(f"ERROR: Error from Ollama: {response.status_code}")
    # Best-effort placeholder: size it like a real embedding of this model
    # instead of a hard-coded length that may not match the index.
    dimension = get_embedding_shape() or 768
    return np.zeros(dimension, dtype=np.float32)

def store_in_faiss(partitions):
    """Embed every partition and add it to a new FAISS L2 index.

    The index dimension is taken from ``get_embedding_shape()``. Each partition
    is embedded with ``get_embedding()`` and added as one ``float32`` row, so
    the row position in the index equals the partition's position in
    ``partitions``.

    Args:
        partitions: Sequence of text chunks to index.

    Returns:
        tuple: ``(index, doc_ids)`` where ``index`` is the ``faiss.IndexFlatL2``
        and ``doc_ids[row]`` is the position in ``partitions`` of the text
        stored at that index row.

    Raises:
        RuntimeError: If the embedding dimension could not be determined
            (``get_embedding_shape()`` returned 0), because an index built
            with that dimension cannot hold any embedding.
    """
    dimension = get_embedding_shape()
    if not dimension:
        raise RuntimeError(
            "Could not determine the embedding dimension of the model; "
            "cannot build the FAISS index."
        )
    if VERBOSE:
        print(f"Models embedding dimension: {dimension}")
    index = faiss.IndexFlatL2(dimension)  # L2 distance

    doc_ids = []

    for i, partition in tqdm(enumerate(partitions), total=len(partitions), desc="Embedding partitions"):
        # FAISS expects a 2-D float32 array: one row per vector.
        embedding = np.asarray(get_embedding(partition), dtype=np.float32)
        index.add(embedding.reshape(1, -1))
        doc_ids.append(i)

    return index, doc_ids

def retrieve_with_rag(query, faiss_index, doc_ids, k=2):
    """Answer ``query`` with the model, using the ``k`` nearest partitions as context.

    The query is embedded with ``get_embedding()``, the ``k`` closest rows are
    looked up in ``faiss_index``, the matching texts are read from the
    module-level ``partitions`` list and placed in a ``Context: ... Query: ...
    Answer:`` prompt that is posted (non-streaming) to ``ollama_url_gen``.

    Args:
        query: The user question.
        faiss_index: Index to search; must provide ``search(vectors, k=...)``.
        doc_ids: Maps an index row to a position in ``partitions``.
        k: Number of neighbours to retrieve.

    Returns:
        The decoded JSON body of the generate request.
    """
    # FAISS expects a 2-D float32 array: one row per query vector.
    query_embedding = np.asarray(get_embedding(query), dtype=np.float32).reshape(1, -1)
    _distances, indices = faiss_index.search(query_embedding, k=k)  # Retrieve top-k closest documents
    retrieved_docs = []
    for i in tqdm(indices[0], desc="Retrieving documents"):
        i = int(i)
        if i < 0:
            # FAISS pads the result with -1 when fewer than k vectors are
            # available; a negative value must not be used as a list index
            # (it would silently select the last document).
            continue
        if i >= len(doc_ids) or doc_ids[i] >= len(partitions):
            print(f"WARN: Index {i} out of bounds for partition list.")
            continue
        doc_id = doc_ids[i]
        retrieved_docs.append(partitions[doc_id])
    combined_docs = "\n".join(retrieved_docs)
    rag_prompt = f"Context:\n{combined_docs}\n\nQuery: {query}\nAnswer:"
    payload = {"model": ollama_model_name, "prompt": rag_prompt, "stream": False}
    response = requests.post(ollama_url_gen, headers={"Content-Type": "application/json"},
                             data=json.dumps(payload))
    return response.json()

def ask(query):
    """Return the generated answer text for ``query``.

    Runs ``retrieve_with_rag`` against the module-level ``faiss_index`` and
    ``doc_ids`` and extracts the ``"response"`` field of its result.
    """
    return retrieve_with_rag(query, faiss_index, doc_ids)["response"]

# --- Crawl -----------------------------------------------------------------
# Take three of the links found on the seed page (positions 1..3; position 0
# is skipped) and concatenate the paragraph text of each page.
try:
    valid_links = get_visitable_links(base_url)[1:4]

    if VERBOSE:
        print(f"Found {len(valid_links)} valid links. Scraping the content...")

    all_text = ""
    for link in tqdm(valid_links, desc="Scraping items"):
        text_content = scrape_text_from_url(link)
        all_text += text_content + "\n\n"
finally:
    # The browser is only needed while scraping. Shut it down here, also when
    # scraping fails, so no headless Chrome process is left running.
    driver.quit()

# --- Chunk -----------------------------------------------------------------
partitions = partition_text(all_text, max_length=512)

if VERBOSE:
    print(f'Total partition count: {len(partitions)}')

# --- Index -----------------------------------------------------------------
# Store the partitions in FAISS
faiss_index, doc_ids = store_in_faiss(partitions)

Scraping items:   0%|          | 0/3 [00:00<?, ?it/s]

Partitioning text:   0%|          | 0/95 [00:00<?, ?it/s]

Embedding partitions:   0%|          | 0/5 [00:00<?, ?it/s]

In [2]:
# Query the indexed pages; ask() returns the model's answer text, which the
# notebook shows as this cell's result because it is the last expression.
ask("Where is Bangladesh located?")

Retrieving documents:   0%|          | 0/2 [00:00<?, ?it/s]

'Bangladesh is located in South Asia, centered on the transnational historical region of Bengal along the eponymous bay.'

In [4]:
# Open-ended query over the same index; the returned answer text is displayed
# as the cell result.
ask("Summarize the culture of Bangladesh")

# Optional cleanup, currently disabled: uncomment to shut down the Selenium
# browser if it is still running. ask() only talks to Ollama and the FAISS
# index, so it does not need the browser.
#driver.quit()

Retrieving documents:   0%|          | 0/2 [00:00<?, ?it/s]

"Bangladesh is known for its cultural pluralism, with a Bengali Muslim majority that coexists with various ethnic groups. The country has a strong tradition of social hierarchy, with patrilineal kin ties and extended family structures playing important roles in village life. However, modernization and urbanization have led to changes in traditional sources of prestige, such as landholding and religious piety, being replaced by education, income, and steady work.\n\nThe country's cultural identity is deeply rooted in the Bengali language and its rich traditions, with a strong emphasis on secularism and community. The traditional village structure consists of extended family units, social associations, and councils that settle disputes and maintain social order.\n\nIn recent years, there has been an increasing shift towards urbanization, with 34% of Bangladeshis living in cities in 2015. This has led to the growth of urban centers and the development of new forms of community and identit